In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler

In [3]:
data_url = 'https://raw.githubusercontent.com/digipodium/Datasets/main/50_Startups.csv'
df = pd.read_csv(data_url)

In [4]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [6]:
x = df.iloc[:,:4]
x.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [7]:
y = df['Profit']

In [8]:
state_enc = OneHotEncoder(drop='first')
dummy_state=state_enc.fit_transform(x[['State']]).toarray()

In [9]:
x=x.drop(columns=['State'])

In [10]:
x= pd.concat([x,pd.DataFrame(dummy_state)],axis=1)

In [11]:
scalar =  StandardScaler()
x= scalar.fit_transform(x)



In [12]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=.2,random_state=1)

In [13]:
model = LinearRegression()

In [14]:
model.fit(xtrain,ytrain)

LinearRegression()

In [15]:
ypred = model.predict(xtest)

In [16]:
mean_squared_error(ytest,ypred)

79495441.50411

In [17]:
mean_absolute_error(ytest,ypred)

7698.119817484754

In [18]:
model.score(xtest,ytest)*100

96.49618042060467

In [19]:
from joblib import dump

In [20]:
dump({
    'state_hot_encoder':state_enc,
    'scalar': scalar,
    'model':model
},'startup_model_v1.jb')

['startup_model_v1.jb']

In [21]:
import numpy as np

In [22]:
admin = 165464
rnd = 12545
mkt = 99854

# make it in 2D array
inp = np.array([[admin,rnd,mkt]])
inp

array([[165464,  12545,  99854]])

In [23]:
state = 'Florida'
inp_d = state_enc.transform([[state]]).toarray()



In [24]:
inp_f = np.hstack([inp,inp_d])

In [25]:
inp_f = scalar.transform(inp_f)

In [26]:
model.predict(inp_f)

array([181027.32214919])