In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LinearRegression


In [2]:
data=pd.read_csv('data/50_Startups.csv')
df=np.round(pd.read_csv('data/50_Startups.csv')[['R&D Spend','Administration','Marketing Spend','Profit']]/10000)
np.random.seed(9)
df=df.sample(5)
df



Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,8.0,15.0,30.0,11.0
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0


In [3]:
df = df.iloc[:,0:-1]
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [4]:
df0=pd.DataFrame()
df0['R&D Spend']=df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration']=df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend']=df['Marketing Spend'].fillna(df['Marketing Spend'].mean())
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [5]:
# 0th Iteration
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [6]:
df1=df0.copy()

df1.iloc[1,0]=np.nan
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [7]:
# Use first 3 rows to build a model and use the last for prediction

X = df1.iloc[[0,2,3,4],1:3]
X

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,16.0,26.0
44,15.0,3.0


In [8]:
y = df1.iloc[[0,2,3,4],0]
y

21     8.0
2     15.0
14    12.0
44     2.0
Name: R&D Spend, dtype: float64

In [12]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[1,1:].values.reshape(1,2))



array([7.92175573])

In [14]:
df1.iloc[1,0] =7.92

In [15]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,7.92,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [17]:
# Remove the col2 imputed value

df1.iloc[3,1] = np.nan

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,7.92,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,3.0


In [18]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[[0,1,2,4],[0,2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
21,8.0,30.0
37,7.92,20.0
2,15.0,41.0
44,2.0,3.0


In [19]:
y = df1.iloc[[0,1,2,4],1]
y

21    15.0
37     5.0
2     10.0
44    15.0
Name: Administration, dtype: float64

In [20]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[3,[0,2]].values.reshape(1,2))



array([5.38292774])

In [21]:
df1.iloc[3,1] = 5.38

In [22]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,7.92,5.0,20.0
2,15.0,10.0,41.0
14,12.0,5.38,26.0
44,2.0,15.0,3.0


In [23]:
# Remove the col3 imputed value
df1.iloc[4,-1] = np.nan

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,7.92,5.0,20.0
2,15.0,10.0,41.0
14,12.0,5.38,26.0
44,2.0,15.0,


In [24]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[0:4,0:2]
X

Unnamed: 0,R&D Spend,Administration
21,8.0,15.0
37,7.92,5.0
2,15.0,10.0
14,12.0,5.38


In [25]:
y = df1.iloc[0:4,-1]
y

21    30.0
37    20.0
2     41.0
14    26.0
Name: Marketing Spend, dtype: float64

In [26]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[4,0:2].values.reshape(1,2))



array([17.47809377])

In [27]:
df1.iloc[4,-1] = 17.47

In [28]:
# After 1st Iteration
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,7.92,5.0,20.0
2,15.0,10.0,41.0
14,12.0,5.38,26.0
44,2.0,15.0,17.47


In [29]:
# Subtract 0th iteration from 1st iteration

df1 - df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,3.92,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-10.62,0.0
44,0.0,0.0,14.47


In [30]:
df2 = df1.copy()

df2.iloc[1,0] = np.nan

df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,5.38,26.0
44,2.0,15.0,17.47


In [31]:
X = df2.iloc[[0,2,3,4],1:3]
y = df2.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[1,1:].values.reshape(1,2))



array([9.5660856])

In [32]:
df2.iloc[1,0] = 9.56

In [33]:
df2.iloc[3,1] = np.nan
X = df2.iloc[[0,1,2,4],[0,2]]
y = df2.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[3,[0,2]].values.reshape(1,2))



array([5.02613236])

In [34]:
df2.iloc[3,1] = 5.02

In [35]:
df2.iloc[4,-1] = np.nan

X = df2.iloc[0:4,0:2]
y = df2.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[4,0:2].values.reshape(1,2))



array([14.61246564])

In [36]:
df2.iloc[4,-1] = 14.61

In [37]:
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.56,5.0,20.0
2,15.0,10.0,41.0
14,12.0,5.02,26.0
44,2.0,15.0,14.61


In [38]:
df2 - df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,1.64,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-0.36,0.0
44,0.0,0.0,-2.86


In [39]:
df3 = df2.copy()

df3.iloc[1,0] = np.nan

df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,5.02,26.0
44,2.0,15.0,14.61


In [40]:
X = df3.iloc[[0,2,3,4],1:3]
y = df3.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[1,1:].values.reshape(1,2))



array([9.67886621])

In [41]:

df3.iloc[1,0] = 9.67

In [42]:
df3.iloc[3,1] = np.nan
X = df3.iloc[[0,1,2,4],[0,2]]
y = df3.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[3,[0,2]].values.reshape(1,2))



array([4.97490139])

In [43]:
df3.iloc[3,1] = 4.97

In [44]:
df3.iloc[4,-1] = np.nan

X = df3.iloc[0:4,0:2]
y = df3.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[4,0:2].values.reshape(1,2))



array([14.42878377])

In [45]:
df3.iloc[4,-1] = 14.28

In [46]:
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.67,5.0,20.0
2,15.0,10.0,41.0
14,12.0,4.97,26.0
44,2.0,15.0,14.28


In [47]:
df3 - df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.11,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-0.05,0.0
44,0.0,0.0,-0.33
