In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('50_Startups.csv', 
                 usecols=['R&D Spend', 'Administration', 'Marketing Spend','Profit'])

In [3]:
df = np.round(df/10000,2)

In [4]:
df.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
49,0.0,11.7,4.52,1.47
1,16.26,15.14,44.39,19.18
43,1.55,12.74,3.55,6.98
30,6.2,11.56,9.11,9.99
5,13.19,9.98,36.29,15.7


In [5]:
df = df.iloc[:,0:-1]
df.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
24,7.7,9.93,14.06
21,7.84,15.38,29.97
41,2.79,8.47,16.45
12,9.39,12.73,24.98
28,6.61,18.26,11.81


In [6]:
df.iloc[1,0] = np.NaN

In [7]:
df.iloc[3,1] = np.NaN

In [8]:
df.iloc[-1,-1]=np.NaN

In [9]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,,15.14,44.39
2,15.34,10.11,40.79
3,14.44,,38.32
4,14.21,9.14,36.62


In [10]:
# Step 1 - Impute all missing values with mean of respective col

df0 = pd.DataFrame()

df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())

In [11]:
df0.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,7.190408,15.14,44.39
2,15.34,10.11,40.79
3,14.44,12.139796,38.32
4,14.21,9.14,36.62


In [12]:
# Remove the col1 imputed value
df1 = df0.copy()

df1.iloc[1,0] = np.NaN
df1.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,,15.14,44.39
2,15.34,10.11,40.79
3,14.44,12.139796,38.32
4,14.21,9.14,36.62


In [13]:
# Use first 3 rows to build a model and use the last for prediction

X = df1.iloc[[0,2,3,4],1:3]
X

Unnamed: 0,Administration,Marketing Spend
0,13.69,47.18
2,10.11,40.79
3,12.139796,38.32
4,9.14,36.62


In [14]:
y = df1.iloc[[0,2,3,4],0]
y

0    16.53
2    15.34
3    14.44
4    14.21
Name: R&D Spend, dtype: float64

In [15]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[1,1:].values.reshape(1,2))



array([15.68726303])

In [17]:
df1.iloc[1,0] = 15.68

In [18]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,15.68,15.14,44.39
2,15.34,10.11,40.79
3,14.44,12.139796,38.32
4,14.21,9.14,36.62
5,13.19,9.98,36.29
6,13.46,14.72,12.77
7,13.03,14.55,32.39
8,12.05,14.87,31.16
9,12.33,10.87,30.5


In [20]:
# Remove the col2 imputed value

df1.iloc[3,1] = np.NaN



In [21]:
df1.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,15.68,15.14,44.39
2,15.34,10.11,40.79
3,14.44,,38.32
4,14.21,9.14,36.62


In [22]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[[0,1,2,4],[0,2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
0,16.53,47.18
1,15.68,44.39
2,15.34,40.79
4,14.21,36.62


In [23]:
y = df1.iloc[[0,1,2,4],1]
y

0    13.69
1    15.14
2    10.11
4     9.14
Name: Administration, dtype: float64

In [24]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[3,[0,2]].values.reshape(1,2))



array([11.35997682])

In [25]:
df1.iloc[3,1] = 11.36

In [26]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,15.68,15.14,44.39
2,15.34,10.11,40.79
3,14.44,11.36,38.32
4,14.21,9.14,36.62
5,13.19,9.98,36.29
6,13.46,14.72,12.77
7,13.03,14.55,32.39
8,12.05,14.87,31.16
9,12.33,10.87,30.5


In [27]:
# Remove the col3 imputed value
df1.iloc[4,-1] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,15.68,15.14,44.39
2,15.34,10.11,40.79
3,14.44,11.36,38.32
4,14.21,9.14,
5,13.19,9.98,36.29
6,13.46,14.72,12.77
7,13.03,14.55,32.39
8,12.05,14.87,31.16
9,12.33,10.87,30.5


In [28]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[0:4,0:2]
X

Unnamed: 0,R&D Spend,Administration
0,16.53,13.69
1,15.68,15.14
2,15.34,10.11
3,14.44,11.36


In [29]:
y = df1.iloc[0:4,-1]
y

0    47.18
1    44.39
2    40.79
3    38.32
Name: Marketing Spend, dtype: float64

In [30]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[4,0:2].values.reshape(1,2))



array([36.29095748])

In [31]:
df1.iloc[4,-1] = 36.29

In [32]:
# After 1st Iteration
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,15.68,15.14,44.39
2,15.34,10.11,40.79
3,14.44,11.36,38.32
4,14.21,9.14,36.29
5,13.19,9.98,36.29
6,13.46,14.72,12.77
7,13.03,14.55,32.39
8,12.05,14.87,31.16
9,12.33,10.87,30.5


In [33]:
df2 = df1.copy()

df2.iloc[1,0] = np.NaN

df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,,15.14,44.39
2,15.34,10.11,40.79
3,14.44,11.36,38.32
4,14.21,9.14,36.29
5,13.19,9.98,36.29
6,13.46,14.72,12.77
7,13.03,14.55,32.39
8,12.05,14.87,31.16
9,12.33,10.87,30.5


In [34]:
# Subtract 0th iteration from 1st iteration

df1 - df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,0.0,0.0,0.0
1,8.489592,0.0,0.0
2,0.0,0.0,0.0
3,0.0,-0.779796,0.0
4,0.0,0.0,-0.33
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


In [35]:
df2 = df1.copy()

df2.iloc[1,0] = np.NaN

df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,,15.14,44.39
2,15.34,10.11,40.79
3,14.44,11.36,38.32
4,14.21,9.14,36.29
5,13.19,9.98,36.29
6,13.46,14.72,12.77
7,13.03,14.55,32.39
8,12.05,14.87,31.16
9,12.33,10.87,30.5


In [36]:
X = df2.iloc[[0,2,3,4],1:3]
y = df2.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[1,1:].values.reshape(1,2))



array([15.5185251])

In [39]:
df2.iloc[1,0] = 15.51

In [40]:
df2.iloc[3,1] = np.NaN
X = df2.iloc[[0,1,2,4],[0,2]]
y = df2.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[3,[0,2]].values.reshape(1,2))



array([11.24353922])

In [41]:
df2.iloc[3,1] = 11.24

In [42]:
df2.iloc[4,-1] = np.NaN

X = df2.iloc[0:4,0:2]
y = df2.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[4,0:2].values.reshape(1,2))



array([36.23258891])

In [43]:
df2.iloc[4,-1] = 36.23

In [44]:
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,15.51,15.14,44.39
2,15.34,10.11,40.79
3,14.44,11.24,38.32
4,14.21,9.14,36.23
5,13.19,9.98,36.29
6,13.46,14.72,12.77
7,13.03,14.55,32.39
8,12.05,14.87,31.16
9,12.33,10.87,30.5


In [45]:
df2 - df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,0.0,0.0,0.0
1,-0.17,0.0,0.0
2,0.0,0.0,0.0
3,0.0,-0.12,0.0
4,0.0,0.0,-0.06
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


In [46]:
df3 = df2.copy()

df3.iloc[1,0] = np.NaN

df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,,15.14,44.39
2,15.34,10.11,40.79
3,14.44,11.24,38.32
4,14.21,9.14,36.23
5,13.19,9.98,36.29
6,13.46,14.72,12.77
7,13.03,14.55,32.39
8,12.05,14.87,31.16
9,12.33,10.87,30.5


In [47]:
X = df3.iloc[[0,2,3,4],1:3]
y = df3.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[1,1:].values.reshape(1,2))



array([15.48412155])

In [48]:
df3.iloc[1,0] = 15.48

In [49]:
df3.iloc[3,1] = np.NaN
X = df3.iloc[[0,1,2,4],[0,2]]
y = df3.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[3,[0,2]].values.reshape(1,2))



array([11.21923101])

In [50]:
df3.iloc[3,1] = 11.219

In [51]:
df3.iloc[4,-1] = np.NaN

X = df3.iloc[0:4,0:2]
y = df3.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[4,0:2].values.reshape(1,2))



array([36.22698461])

In [52]:
df3.iloc[4,-1] = 36.22

In [53]:
df2.iloc[3,1] = 11.219

In [54]:
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,16.53,13.69,47.18
1,15.48,15.14,44.39
2,15.34,10.11,40.79
3,14.44,11.219,38.32
4,14.21,9.14,36.22
5,13.19,9.98,36.29
6,13.46,14.72,12.77
7,13.03,14.55,32.39
8,12.05,14.87,31.16
9,12.33,10.87,30.5


In [55]:
df3 - df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,0.0,0.0,0.0
1,-0.03,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,-0.01
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0
