In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

In [2]:
# Load the startups dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='50_Startups.csv')

df.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# One-hot encode the categorical State column, dropping the first level to
# avoid the dummy-variable trap. dtype=int pins the indicators to 0/1 integers:
# newer pandas versions default to bool, which turns the design matrix built
# later with np.append into an object array that statsmodels OLS rejects.
df = pd.get_dummies(data=df, drop_first=True, dtype=int)

df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [4]:
# Target is the Profit column; every remaining column is a predictor.
y = df['Profit']
X = df.drop('Profit', axis=1)

In [5]:
# Preview the predictor matrix (Profit removed).
X.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [6]:
# Preview the target series.
y.head(n=5)

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [8]:
# Fit a linear regression on the training split.
lr = LinearRegression().fit(X=X_train, y=y_train)

In [9]:
# Predict Profit for the held-out test rows.
y_pred = lr.predict(X=X_test)

In [10]:
# Show the test-set predictions as a single-column table.
pd.DataFrame(data=y_pred)

Unnamed: 0,0
0,103015.201598
1,132582.277608
2,132447.738452
3,71976.098513
4,178537.482211
5,116161.242302
6,67851.692097
7,98791.733747
8,113969.43533
9,167921.065696


In [11]:
# Actual Profit values of the held-out test rows, for comparison with y_pred.
y_test

28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
Name: Profit, dtype: float64

In [12]:
# The array-based OLS(endog, exog) class used below is exposed by
# statsmodels.api; statsmodels.formula.api is the formula-string interface
# (lowercase ols) and does not provide OLS in current releases.
import statsmodels.api as sm

In [13]:
# Prepend a column of ones so the OLS fit below includes a constant (intercept)
# term. The row count is taken from X itself instead of a hard-coded 50, so the
# cell keeps working if the dataset size changes. Column labels become 0..n.
X = pd.DataFrame(np.append(arr=np.ones((len(X), 1)), values=X, axis=1))

In [14]:
# Preview the design matrix with the constant column prepended.
X.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,1.0,165349.2,136897.8,471784.1,0.0,1.0
1,1.0,162597.7,151377.59,443898.53,0.0,0.0
2,1.0,153441.51,101145.55,407934.54,1.0,0.0
3,1.0,144372.41,118671.85,383199.62,0.0,1.0
4,1.0,142107.34,91391.77,366168.42,1.0,0.0


In [15]:
# Restore readable labels: positional column i gets the i-th name in this list.
X.rename(
    columns=dict(enumerate([
        'Constant',
        'R&D Spend',
        'Administration',
        'Marketing Spend',
        'State_Florida',
        'State_New York',
    ])),
    inplace=True,
)

In [16]:
# Confirm the design matrix now carries named columns.
X.head(n=5)

Unnamed: 0,Constant,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,1.0,165349.2,136897.8,471784.1,0.0,1.0
1,1.0,162597.7,151377.59,443898.53,0.0,0.0
2,1.0,153441.51,101145.55,407934.54,1.0,0.0
3,1.0,144372.41,118671.85,383199.62,0.0,1.0
4,1.0,142107.34,91391.77,366168.42,1.0,0.0


In [25]:
# Starting backward elimination: keep only predictors with P value < 0.05

In [17]:
# Work on an explicit, independent copy of the full design matrix so that
# eliminating columns from X_opt never touches X and does not trigger pandas'
# SettingWithCopyWarning.
X_opt = X.copy()

In [27]:
# Regress Profit on whatever columns X_opt currently holds and show the fit summary.
lr_OLS = sm.OLS(y, X_opt).fit()
lr_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Fri, 12 Jul 2019",Prob (F-statistic):,3.5000000000000004e-32
Time:,14:44:58,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Constant,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
R&D Spend,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [19]:
# State_New York: P value was 0.990 (above the 0.05 cutoff), so drop it.
# Refit right away so the next elimination step reads p-values from the reduced
# model; this keeps the notebook correct under Restart & Run All instead of
# relying on manually re-running the OLS cell.
X_opt = X_opt.drop(columns=['State_New York'])
lr_OLS = sm.OLS(endog=y, exog=X_opt).fit()
lr_OLS.summary()

In [21]:
# State_Florida: P value was 0.940 (above the 0.05 cutoff), so drop it.
# Refit right away so the next elimination step reads p-values from the reduced
# model; this keeps the notebook correct under Restart & Run All instead of
# relying on manually re-running the OLS cell.
X_opt = X_opt.drop(columns=['State_Florida'])
lr_OLS = sm.OLS(endog=y, exog=X_opt).fit()
lr_OLS.summary()

In [23]:
# Administration: P value was 0.602 (above the 0.05 cutoff), so drop it.
# Refit right away so the next elimination step reads p-values from the reduced
# model; this keeps the notebook correct under Restart & Run All instead of
# relying on manually re-running the OLS cell.
X_opt = X_opt.drop(columns=['Administration'])
lr_OLS = sm.OLS(endog=y, exog=X_opt).fit()
lr_OLS.summary()

In [26]:
# Marketing Spend: P value was 0.060 (just above the 0.05 cutoff), so drop it.
# Refit right away so the displayed summary reflects the final reduced model;
# this keeps the notebook correct under Restart & Run All instead of relying on
# manually re-running the OLS cell.
X_opt = X_opt.drop(columns=['Marketing Spend'])
lr_OLS = sm.OLS(endog=y, exog=X_opt).fit()
lr_OLS.summary()

In [28]:
# Looks like R&D Spend is the strongest predictor of Profit (P value 0.000).
# Marketing Spend was very close to the 0.05 cutoff at only 0.06.
# Might need to look into keeping it after comparing R^2 with and without it.