# MICE - Multivariate Imputation by Chained Equations

Three types of missing values:
1) MCAR: Missing completely at random - MICE also works here, though simpler methods may suffice
2) MAR: Missing at random (e.g., optional columns that were left unfilled) - can be filled using the data of the other columns
3) MNAR: Missing not at random (e.g., data deliberately removed or withheld, so the missingness depends on the missing value itself)

MICE is used for MAR: The probability of missingness depends on observed data but not on the missing values themselves. For example, missing income might depend on observed age or education but not on the income itself.

The process looks something like this:
1) Replace all the missing values with the mean of their column (this is iteration 0).
2) Set the originally missing entries of col 1 back to NaN, then use a machine learning model trained on the other columns to fill them.
3) Do the same for col 2, 3, ..., n.
4) The final result is called iteration 1. Subtract iteration 0 (the mean-filled dataframe) from it; the result is the difference df.
5) Repeat these passes until the difference dataframe approaches all 0 values.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the 50_Startups CSV into a DataFrame.
df = pd.read_csv('Datasets/50_Startups.csv')

In [3]:
# Drop the State and Profit columns, then draw a reproducible 5-row sample
# (the global NumPy seed fixes which rows are picked).
df = df.drop(columns=['State', 'Profit'])
np.random.seed(9)
df = df.sample(n=5)

In [4]:
# Display the sampled rows.
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,44069.95,51283.14,197029.42
2,153441.51,101145.55,407934.54
14,119943.24,156547.42,256512.92
44,22177.74,154806.14,28334.72


In [5]:
# Percentage of missing entries per column.
df.isna().mean() * 100

R&D Spend          0.0
Administration     0.0
Marketing Spend    0.0
dtype: float64

No missing values currently, so let's add some.

In [6]:
# Check the frame's dimensions before injecting missing values.
df.shape

(5, 3)

In [7]:
# Inject one missing value into each column so there is something to impute.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df.iloc[1, 0] = np.nan    # R&D Spend, second row
df.iloc[3, 1] = np.nan    # Administration, fourth row
df.iloc[-1, -1] = np.nan  # Marketing Spend, last row

In [8]:
# Inspect the frame after injecting the NaNs.
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,,51283.14,197029.42
2,153441.51,101145.55,407934.54
14,119943.24,,256512.92
44,22177.74,154806.14,


In [9]:
# Iteration 0: every NaN is replaced by the mean of its own column.
# df.mean() yields one mean per column, and fillna applies each to the
# matching column; df itself is left untouched.
df0 = df.fillna(df.mean())

In [10]:
# Inspect the mean-filled frame (iteration 0).
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,93487.99,51283.14,197029.42
2,153441.51,101145.55,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,290303.5425


In [11]:
# Start iteration 1 from the mean-filled frame and re-open the R&D Spend cell
# that was originally missing (position 1, 0).
df1 = df0.copy()

df1.iloc[1, 0] = np.nan  # the np.NaN alias was removed in NumPy 2.0

In [12]:
# Inspect df1 with the R&D Spend cell blanked again.
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,,51283.14,197029.42
2,153441.51,101145.55,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,290303.5425


In [13]:
# Use columns 2 and 3 (Administration, Marketing Spend) as inputs to predict
# column 1 (R&D Spend). Training rows are 0, 2, 3, 4; row 1 is the one to fill.
from sklearn.linear_model import LinearRegression

X = df1.iloc[[0, 2, 3, 4], 1:3]
y = df1.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)

# Predict from a one-row DataFrame so the feature names match those seen in
# fit, and take element [0] so a scalar (not a length-1 array) is assigned.
df1.iloc[1, 0] = lr.predict(df1.iloc[[1], 1:3])[0]

df1



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,230217.874261,51283.14,197029.42
2,153441.51,101145.55,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,290303.5425


In [14]:
# Blank the Administration value at position (2, 1) so it can be predicted next.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): In [7] put this column's NaN at iloc[3, 1]; iloc[2, 1] holds an
# observed value (101145.55) that is overwritten here, while the mean-filled
# iloc[3, 1] is never re-imputed. Confirm the intended row before relying on
# the imputed results.
df1.iloc[2, 1] = np.nan

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,230217.874261,51283.14,197029.42
2,153441.51,,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,290303.5425


In [15]:
# Train on every row except row 2: R&D Spend and Marketing Spend -> Administration.
# NOTE(review): In [7] put the Administration NaN at iloc[3, 1], so row 3 (not
# row 2) looks like the row that should be held out and predicted - confirm.
X = df1.iloc[[0, 1, 3, 4], [0, 2]]
y = df1.iloc[[0, 1, 3, 4], 1]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar
# instead of a length-1 array.
df1.iloc[2, 1] = lr.predict(df1.iloc[[2], [0, 2]])[0]

df1



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,230217.874261,51283.14,197029.42
2,153441.51,226864.218995,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,290303.5425


In [16]:
# Re-open the Marketing Spend cell in the last row.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df1.iloc[-1, -1] = np.nan

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,230217.874261,51283.14,197029.42
2,153441.51,226864.218995,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,


In [17]:
# Train on the first four rows: R&D Spend and Administration -> Marketing Spend.
X = df1.iloc[0:4, 0:2]
y = df1.iloc[0:4, -1]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar
# instead of a length-1 array.
df1.iloc[-1, -1] = lr.predict(df1.iloc[[4], 0:2])[0]

df1



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,230217.874261,51283.14,197029.42
2,153441.51,226864.218995,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,289245.052303


In [18]:
# Show iteration 0 again for comparison with df1.
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,93487.99,51283.14,197029.42
2,153441.51,101145.55,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,290303.5425


In [19]:
# df1 now holds the result of iteration 1.
# Element-wise change relative to the mean-filled starting point (iteration 0).
diff = df1.sub(df0)
diff

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,136729.884261,0.0,0.0
2,0.0,125718.668995,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-1058.490197


In [20]:
# We need to drive the non-zero values above towards zero over multiple iterations; once they stop changing, further passes can no longer improve the imputation.

# We will move on to the next iteration

In [21]:
# Start iteration 2 from a copy of iteration 1 and re-open the R&D Spend cell
# at position (1, 0).
df2 = df1.copy()

df2.iloc[1, 0] = np.nan  # the np.NaN alias was removed in NumPy 2.0

df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,,51283.14,197029.42
2,153441.51,226864.218995,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,289245.052303


In [22]:
# Train on every row except row 1: Administration and Marketing Spend -> R&D Spend.
X = df2.iloc[[0, 2, 3, 4], 1:]
y = df2.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar
# instead of a length-1 array.
df2.iloc[1, 0] = lr.predict(df2.iloc[[1], 1:])[0]

df2



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,242386.199832,51283.14,197029.42
2,153441.51,226864.218995,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,289245.052303


In [23]:
# Re-open the Administration cell at position (2, 1) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): In [7] put this column's NaN at iloc[3, 1], not iloc[2, 1] - confirm the row.
df2.iloc[2, 1] = np.nan
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,242386.199832,51283.14,197029.42
2,153441.51,,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,289245.052303


In [24]:
# Train on every row except row 2: R&D Spend and Marketing Spend -> Administration.
X = df2.iloc[[0, 1, 3, 4], [0, 2]]
y = df2.iloc[[0, 1, 3, 4], 1]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df2.iloc[2, 1] = lr.predict(df2.iloc[[2], [0, 2]])[0]
df2



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,242386.199832,51283.14,197029.42
2,153441.51,222607.821049,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,289245.052303


In [25]:
# Re-open the Marketing Spend cell in the last row for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df2.iloc[-1, -1] = np.nan
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,242386.199832,51283.14,197029.42
2,153441.51,222607.821049,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,


In [26]:
# Train on every row except the last: R&D Spend and Administration -> Marketing Spend.
X = df2.iloc[0:-1, 0:2]
y = df2.iloc[0:-1, 2]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df2.iloc[-1, -1] = lr.predict(df2.iloc[[-1], 0:2])[0]
df2



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,242386.199832,51283.14,197029.42
2,153441.51,222607.821049,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288971.126438


In [27]:
# Second iteration complete: element-wise change relative to the first pass.
df2.sub(df1)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,12168.325571,0.0,0.0
2,0.0,-4256.397946,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-273.925865


In [28]:
# Start iteration 3 from a copy so df2 stays intact for the later difference check.
df3 = df2.copy()

df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,242386.199832,51283.14,197029.42
2,153441.51,222607.821049,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288971.126438


In [29]:
# Re-open the R&D Spend cell at position (1, 0) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df3.iloc[1, 0] = np.nan
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,,51283.14,197029.42
2,153441.51,222607.821049,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288971.126438


In [30]:
# Train on every row except row 1: Administration and Marketing Spend -> R&D Spend.
X = df3.iloc[[0, 2, 3, 4], 1:]
y = df3.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df3.iloc[1, 0] = lr.predict(df3.iloc[[1], 1:])[0]

df3



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,244915.721624,51283.14,197029.42
2,153441.51,222607.821049,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288971.126438


In [31]:
# Re-open the Administration cell at position (2, 1) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): In [7] put this column's NaN at iloc[3, 1], not iloc[2, 1] - confirm the row.
df3.iloc[2, 1] = np.nan
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,244915.721624,51283.14,197029.42
2,153441.51,,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288971.126438


In [32]:
# Train on every row except row 2: R&D Spend and Marketing Spend -> Administration.
X = df3.iloc[[0, 1, 3, 4], [0, 2]]
y = df3.iloc[[0, 1, 3, 4], 1]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df3.iloc[2, 1] = lr.predict(df3.iloc[[2], [0, 2]])[0]
df3



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,244915.721624,51283.14,197029.42
2,153441.51,221686.402914,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288971.126438


In [33]:
# Re-open the Marketing Spend cell in the last row for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df3.iloc[-1, -1] = np.nan
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,244915.721624,51283.14,197029.42
2,153441.51,221686.402914,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,


In [34]:
# Train on every row except the last: R&D Spend and Administration -> Marketing Spend.
X = df3.iloc[0:-1, 0:2]
y = df3.iloc[0:-1, 2]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df3.iloc[-1, -1] = lr.predict(df3.iloc[[-1], 0:2])[0]
df3



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,244915.721624,51283.14,197029.42
2,153441.51,221686.402914,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288920.071943


In [35]:
# Third iteration complete: element-wise change relative to the second pass.
df3.sub(df2)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,2529.521792,0.0,0.0
2,0.0,-921.418135,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-51.054495


In [36]:
# Start iteration 4 from a copy so df3 stays intact for the later difference check.
df4 = df3.copy()

df4

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,244915.721624,51283.14,197029.42
2,153441.51,221686.402914,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288920.071943


In [37]:
# Re-open the R&D Spend cell at position (1, 0) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df4.iloc[1, 0] = np.nan
df4

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,,51283.14,197029.42
2,153441.51,221686.402914,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288920.071943


In [38]:
# Train on every row except row 1: Administration and Marketing Spend -> R&D Spend.
X = df4.iloc[[0, 2, 3, 4], 1:]
y = df4.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df4.iloc[1, 0] = lr.predict(df4.iloc[[1], 1:])[0]

df4



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245368.741858,51283.14,197029.42
2,153441.51,221686.402914,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288920.071943


In [39]:
# Re-open the Administration cell at position (2, 1) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): In [7] put this column's NaN at iloc[3, 1], not iloc[2, 1] - confirm the row.
df4.iloc[2, 1] = np.nan
df4

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245368.741858,51283.14,197029.42
2,153441.51,,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288920.071943


In [40]:
# Train on every row except row 2: R&D Spend and Marketing Spend -> Administration.
X = df4.iloc[[0, 1, 3, 4], [0, 2]]
y = df4.iloc[[0, 1, 3, 4], 1]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df4.iloc[2, 1] = lr.predict(df4.iloc[[2], [0, 2]])[0]
df4



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245368.741858,51283.14,197029.42
2,153441.51,221520.123954,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288920.071943


In [41]:
# Re-open the Marketing Spend cell in the last row for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df4.iloc[-1, -1] = np.nan
df4

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245368.741858,51283.14,197029.42
2,153441.51,221520.123954,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,


In [42]:
# Train on every row except the last: R&D Spend and Administration -> Marketing Spend.
X = df4.iloc[0:-1, 0:2]
y = df4.iloc[0:-1, 2]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df4.iloc[-1, -1] = lr.predict(df4.iloc[[-1], 0:2])[0]
df4



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245368.741858,51283.14,197029.42
2,153441.51,221520.123954,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288911.163141


In [43]:
# Fourth iteration complete: element-wise change relative to the third pass.
df4.sub(df3)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,453.020234,0.0,0.0
2,0.0,-166.27896,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-8.908802


In [44]:
# Start iteration 5 from a copy so df4 stays intact for the later difference check.
df5 = df4.copy()

df5

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245368.741858,51283.14,197029.42
2,153441.51,221520.123954,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288911.163141


In [45]:
# Re-open the R&D Spend cell at position (1, 0) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df5.iloc[1, 0] = np.nan
df5

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,,51283.14,197029.42
2,153441.51,221520.123954,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288911.163141


In [46]:
# Train on every row except row 1: Administration and Marketing Spend -> R&D Spend.
X = df5.iloc[[0, 2, 3, 4], 1:]
y = df5.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df5.iloc[1, 0] = lr.predict(df5.iloc[[1], 1:])[0]

df5



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245447.243821,51283.14,197029.42
2,153441.51,221520.123954,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288911.163141


In [47]:
# Re-open the Administration cell at position (2, 1) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): In [7] put this column's NaN at iloc[3, 1], not iloc[2, 1] - confirm the row.
df5.iloc[2, 1] = np.nan
df5

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245447.243821,51283.14,197029.42
2,153441.51,,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288911.163141


In [48]:
# Train on every row except row 2: R&D Spend and Marketing Spend -> Administration.
X = df5.iloc[[0, 1, 3, 4], [0, 2]]
y = df5.iloc[[0, 1, 3, 4], 1]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df5.iloc[2, 1] = lr.predict(df5.iloc[[2], [0, 2]])[0]
df5



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245447.243821,51283.14,197029.42
2,153441.51,221491.271642,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288911.163141


In [49]:
# Re-open the Marketing Spend cell in the last row for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df5.iloc[-1, -1] = np.nan
df5

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245447.243821,51283.14,197029.42
2,153441.51,221491.271642,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,


In [50]:
# Train on every row except the last: R&D Spend and Administration -> Marketing Spend.
X = df5.iloc[0:-1, 0:2]
y = df5.iloc[0:-1, 2]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df5.iloc[-1, -1] = lr.predict(df5.iloc[[-1], 0:2])[0]
df5



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245447.243821,51283.14,197029.42
2,153441.51,221491.271642,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.62673


In [51]:
# Fifth iteration complete: element-wise change relative to the fourth pass.
df5.sub(df4)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,78.501963,0.0,0.0
2,0.0,-28.852311,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-1.53641


In [52]:
# Start iteration 6 from a copy so df5 stays intact for the later difference check.
df6 = df5.copy()

df6

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245447.243821,51283.14,197029.42
2,153441.51,221491.271642,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.62673


In [53]:
# Re-open the R&D Spend cell at position (1, 0) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df6.iloc[1, 0] = np.nan
df6

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,,51283.14,197029.42
2,153441.51,221491.271642,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.62673


In [54]:
# Train on every row except row 1: Administration and Marketing Spend -> R&D Spend.
X = df6.iloc[[0, 2, 3, 4], 1:]
y = df6.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df6.iloc[1, 0] = lr.predict(df6.iloc[[1], 1:])[0]

df6



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245460.766014,51283.14,197029.42
2,153441.51,221491.271642,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.62673


In [55]:
# Re-open the Administration cell at position (2, 1) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): In [7] put this column's NaN at iloc[3, 1], not iloc[2, 1] - confirm the row.
df6.iloc[2, 1] = np.nan
df6

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245460.766014,51283.14,197029.42
2,153441.51,,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.62673


In [56]:
# Train on every row except row 2: R&D Spend and Marketing Spend -> Administration.
X = df6.iloc[[0, 1, 3, 4], [0, 2]]
y = df6.iloc[[0, 1, 3, 4], 1]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df6.iloc[2, 1] = lr.predict(df6.iloc[[2], [0, 2]])[0]
df6



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245460.766014,51283.14,197029.42
2,153441.51,221486.3006,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.62673


In [57]:
# Re-open the Marketing Spend cell in the last row for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df6.iloc[-1, -1] = np.nan
df6

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245460.766014,51283.14,197029.42
2,153441.51,221486.3006,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,


In [58]:
# Train on every row except the last: R&D Spend and Administration -> Marketing Spend.
X = df6.iloc[0:-1, 0:2]
y = df6.iloc[0:-1, 2]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df6.iloc[-1, -1] = lr.predict(df6.iloc[[-1], 0:2])[0]
df6



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245460.766014,51283.14,197029.42
2,153441.51,221486.3006,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.362299


In [59]:
# Sixth iteration complete: element-wise change relative to the fifth pass.
df6.sub(df5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,13.522193,0.0,0.0
2,0.0,-4.971043,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-0.264431


In [60]:
# Start iteration 7 from a copy so df6 stays intact for the later difference check.
df7 = df6.copy()

df7

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245460.766014,51283.14,197029.42
2,153441.51,221486.3006,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.362299


In [61]:
# Re-open the R&D Spend cell at position (1, 0) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df7.iloc[1, 0] = np.nan
df7

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,,51283.14,197029.42
2,153441.51,221486.3006,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.362299


In [62]:
# Train on every row except row 1: Administration and Marketing Spend -> R&D Spend.
X = df7.iloc[[0, 2, 3, 4], 1:]
y = df7.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df7.iloc[1, 0] = lr.predict(df7.iloc[[1], 1:])[0]

df7



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.092835,51283.14,197029.42
2,153441.51,221486.3006,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.362299


In [63]:
# Re-open the Administration cell at position (2, 1) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): In [7] put this column's NaN at iloc[3, 1], not iloc[2, 1] - confirm the row.
df7.iloc[2, 1] = np.nan
df7

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.092835,51283.14,197029.42
2,153441.51,,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.362299


In [64]:
# Train on every row except row 2: R&D Spend and Marketing Spend -> Administration.
X = df7.iloc[[0, 1, 3, 4], [0, 2]]
y = df7.iloc[[0, 1, 3, 4], 1]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df7.iloc[2, 1] = lr.predict(df7.iloc[[2], [0, 2]])[0]
df7



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.092835,51283.14,197029.42
2,153441.51,221485.445178,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.362299


In [65]:
# Re-open the Marketing Spend cell in the last row for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df7.iloc[-1, -1] = np.nan
df7

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.092835,51283.14,197029.42
2,153441.51,221485.445178,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,


In [66]:
# Train on every row except the last: R&D Spend and Administration -> Marketing Spend.
X = df7.iloc[0:-1, 0:2]
y = df7.iloc[0:-1, 2]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df7.iloc[-1, -1] = lr.predict(df7.iloc[[-1], 0:2])[0]
df7



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.092835,51283.14,197029.42
2,153441.51,221485.445178,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.316804


In [67]:
# Seventh iteration complete: element-wise change relative to the sixth pass.
df7.sub(df6)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,2.32682,0.0,0.0
2,0.0,-0.855422,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-0.045495


In [68]:
# Start iteration 8 from a copy so df7 stays intact for the later difference check.
df8 = df7.copy()

df8

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.092835,51283.14,197029.42
2,153441.51,221485.445178,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.316804


In [69]:
# Re-open the R&D Spend cell at position (1, 0) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df8.iloc[1, 0] = np.nan
df8

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,,51283.14,197029.42
2,153441.51,221485.445178,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.316804


In [70]:
# Train on every row except row 1: Administration and Marketing Spend -> R&D Spend.
X = df8.iloc[[0, 2, 3, 4], 1:]
y = df8.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df8.iloc[1, 0] = lr.predict(df8.iloc[[1], 1:])[0]

df8



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.493149,51283.14,197029.42
2,153441.51,221485.445178,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.316804


In [71]:
# Re-open the Administration cell at position (2, 1) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): In [7] put this column's NaN at iloc[3, 1], not iloc[2, 1] - confirm the row.
df8.iloc[2, 1] = np.nan
df8

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.493149,51283.14,197029.42
2,153441.51,,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.316804


In [72]:
# Train on every row except row 2: R&D Spend and Marketing Spend -> Administration.
X = df8.iloc[[0, 1, 3, 4], [0, 2]]
y = df8.iloc[[0, 1, 3, 4], 1]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df8.iloc[2, 1] = lr.predict(df8.iloc[[2], [0, 2]])[0]
df8



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.493149,51283.14,197029.42
2,153441.51,221485.298007,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.316804


In [73]:
# Re-open the Marketing Spend cell in the last row for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df8.iloc[-1, -1] = np.nan
df8

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.493149,51283.14,197029.42
2,153441.51,221485.298007,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,


In [74]:
# Train on every row except the last: R&D Spend and Administration -> Marketing Spend.
X = df8.iloc[0:-1, 0:2]
y = df8.iloc[0:-1, 2]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df8.iloc[-1, -1] = lr.predict(df8.iloc[[-1], 0:2])[0]
df8



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.493149,51283.14,197029.42
2,153441.51,221485.298007,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.308977


In [75]:
# Eighth iteration complete: element-wise change relative to the seventh pass.
df8.sub(df7)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.400314,0.0,0.0
2,0.0,-0.147171,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-0.007827


In [76]:
# Start iteration 9 from a copy so df8 stays intact for the later difference check.
df9 = df8.copy()

df9

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.493149,51283.14,197029.42
2,153441.51,221485.298007,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.308977


In [77]:
# Re-open the R&D Spend cell at position (1, 0) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df9.iloc[1, 0] = np.nan
df9

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,,51283.14,197029.42
2,153441.51,221485.298007,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.308977


In [78]:
# Train on every row except row 1: Administration and Marketing Spend -> R&D Spend.
X = df9.iloc[[0, 2, 3, 4], 1:]
y = df9.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df9.iloc[1, 0] = lr.predict(df9.iloc[[1], 1:])[0]

df9



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.562018,51283.14,197029.42
2,153441.51,221485.298007,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.308977


In [79]:
# Re-open the Administration cell at position (2, 1) for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): In [7] put this column's NaN at iloc[3, 1], not iloc[2, 1] - confirm the row.
df9.iloc[2, 1] = np.nan
df9

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.562018,51283.14,197029.42
2,153441.51,,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.308977


In [80]:
# Train on every row except row 2: R&D Spend and Marketing Spend -> Administration.
X = df9.iloc[[0, 1, 3, 4], [0, 2]]
y = df9.iloc[[0, 1, 3, 4], 1]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df9.iloc[2, 1] = lr.predict(df9.iloc[[2], [0, 2]])[0]
df9



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.562018,51283.14,197029.42
2,153441.51,221485.272688,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.308977


In [81]:
# Re-open the Marketing Spend cell in the last row for this pass.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df9.iloc[-1, -1] = np.nan
df9

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.562018,51283.14,197029.42
2,153441.51,221485.272688,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,


In [82]:
# Train on every row except the last: R&D Spend and Administration -> Marketing Spend.
X = df9.iloc[0:-1, 0:2]
y = df9.iloc[0:-1, 2]

lr = LinearRegression()
lr.fit(X, y)

# One-row DataFrame keeps the fitted feature names; [0] stores a scalar.
df9.iloc[-1, -1] = lr.predict(df9.iloc[[-1], 0:2])[0]
df9



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78389.47,153773.43,299737.29
37,245463.562018,51283.14,197029.42
2,153441.51,221485.272688,407934.54
14,119943.24,115252.065,256512.92
44,22177.74,154806.14,288909.30763


In [83]:
# Ninth iteration complete: element-wise change relative to the eighth pass.
df9.sub(df8)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.068869,0.0,0.0
2,0.0,-0.025319,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-0.001347


In [84]:
# Run a whole iteration in one cell: for each target cell, blank it, fit a
# linear regression on the remaining rows and columns, and write the
# prediction back.

df10 = df9.copy()

# (row, column) positions re-imputed on every pass, in the same order as the
# step-by-step cells; (4, 2) is the last row / last column of the 5x3 frame.
# NOTE(review): In [7] put the Administration NaN at iloc[3, 1], but the passes
# re-impute iloc[2, 1]; kept unchanged so the recorded outputs still match - confirm.
targets = [(1, 0), (2, 1), (4, 2)]

for row, col in targets:
    df10.iloc[row, col] = np.nan  # the np.NaN alias was removed in NumPy 2.0

    train_rows = [r for r in range(df10.shape[0]) if r != row]
    feature_cols = [c for c in range(df10.shape[1]) if c != col]

    X = df10.iloc[train_rows, feature_cols]
    y = df10.iloc[train_rows, col]

    lr = LinearRegression()
    lr.fit(X, y)

    # One-row DataFrame keeps the fitted feature names; [0] stores a scalar
    # instead of a length-1 array.
    df10.iloc[row, col] = lr.predict(df10.iloc[[row], feature_cols])[0]

# Tenth iteration complete: element-wise change relative to the ninth pass.
df10 - df9



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.011848,0.0,0.0
2,0.0,-0.004356,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-0.000232


In [85]:
# Iteration 11: blank each target cell in turn, fit a linear regression on the
# remaining rows and columns, and write the prediction back.
df11 = df10.copy()

# (row, column) positions re-imputed on every pass; (4, 2) is the last row /
# last column of the 5x3 frame.
# NOTE(review): In [7] put the Administration NaN at iloc[3, 1], but the passes
# re-impute iloc[2, 1]; kept unchanged so the recorded outputs still match - confirm.
targets = [(1, 0), (2, 1), (4, 2)]

for row, col in targets:
    df11.iloc[row, col] = np.nan  # the np.NaN alias was removed in NumPy 2.0

    train_rows = [r for r in range(df11.shape[0]) if r != row]
    feature_cols = [c for c in range(df11.shape[1]) if c != col]

    X = df11.iloc[train_rows, feature_cols]
    y = df11.iloc[train_rows, col]

    lr = LinearRegression()
    lr.fit(X, y)

    # One-row DataFrame keeps the fitted feature names; [0] stores a scalar
    # instead of a length-1 array.
    df11.iloc[row, col] = lr.predict(df11.iloc[[row], feature_cols])[0]

# Eleventh iteration complete: element-wise change relative to the tenth pass.
df11 - df10



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.002038,0.0,0.0
2,0.0,-0.000749,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-4e-05


In [86]:
# Iteration 12: blank each target cell in turn, fit a linear regression on the
# remaining rows and columns, and write the prediction back.
df12 = df11.copy()

# (row, column) positions re-imputed on every pass; (4, 2) is the last row /
# last column of the 5x3 frame.
# NOTE(review): In [7] put the Administration NaN at iloc[3, 1], but the passes
# re-impute iloc[2, 1]; kept unchanged so the recorded outputs still match - confirm.
targets = [(1, 0), (2, 1), (4, 2)]

for row, col in targets:
    df12.iloc[row, col] = np.nan  # the np.NaN alias was removed in NumPy 2.0

    train_rows = [r for r in range(df12.shape[0]) if r != row]
    feature_cols = [c for c in range(df12.shape[1]) if c != col]

    X = df12.iloc[train_rows, feature_cols]
    y = df12.iloc[train_rows, col]

    lr = LinearRegression()
    lr.fit(X, y)

    # One-row DataFrame keeps the fitted feature names; [0] stores a scalar
    # instead of a length-1 array.
    df12.iloc[row, col] = lr.predict(df12.iloc[[row], feature_cols])[0]

# Twelfth iteration complete: element-wise change relative to the eleventh pass.
df12 - df11



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.000351,0.0,0.0
2,0.0,-0.000129,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-7e-06


In [87]:
# Iteration 13: blank each target cell in turn, fit a linear regression on the
# remaining rows and columns, and write the prediction back.
df13 = df12.copy()

# (row, column) positions re-imputed on every pass; (4, 2) is the last row /
# last column of the 5x3 frame.
# NOTE(review): In [7] put the Administration NaN at iloc[3, 1], but the passes
# re-impute iloc[2, 1]; kept unchanged so the recorded outputs still match - confirm.
targets = [(1, 0), (2, 1), (4, 2)]

for row, col in targets:
    df13.iloc[row, col] = np.nan  # the np.NaN alias was removed in NumPy 2.0

    train_rows = [r for r in range(df13.shape[0]) if r != row]
    feature_cols = [c for c in range(df13.shape[1]) if c != col]

    X = df13.iloc[train_rows, feature_cols]
    y = df13.iloc[train_rows, col]

    lr = LinearRegression()
    lr.fit(X, y)

    # One-row DataFrame keeps the fitted feature names; [0] stores a scalar
    # instead of a length-1 array.
    df13.iloc[row, col] = lr.predict(df13.iloc[[row], feature_cols])[0]

# Thirteenth iteration complete: element-wise change relative to the twelfth pass.
df13 - df12



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,6e-05,0.0,0.0
2,0.0,-2.2e-05,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-1e-06


In [88]:
# We are very close to zero, so run one last pass: blank each target cell in
# turn, fit a linear regression on the remaining rows and columns, and write
# the prediction back.

df14 = df13.copy()

# (row, column) positions re-imputed on every pass; (4, 2) is the last row /
# last column of the 5x3 frame.
# NOTE(review): In [7] put the Administration NaN at iloc[3, 1], but the passes
# re-impute iloc[2, 1]; kept unchanged so the recorded outputs still match - confirm.
targets = [(1, 0), (2, 1), (4, 2)]

for row, col in targets:
    df14.iloc[row, col] = np.nan  # the np.NaN alias was removed in NumPy 2.0

    train_rows = [r for r in range(df14.shape[0]) if r != row]
    feature_cols = [c for c in range(df14.shape[1]) if c != col]

    X = df14.iloc[train_rows, feature_cols]
    y = df14.iloc[train_rows, col]

    lr = LinearRegression()
    lr.fit(X, y)

    # One-row DataFrame keeps the fitted feature names; [0] stores a scalar
    # instead of a length-1 array.
    df14.iloc[row, col] = lr.predict(df14.iloc[[row], feature_cols])[0]

# Fourteenth iteration complete: element-wise change relative to the thirteenth pass.
df14 - df13



Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,1e-05,0.0,0.0
2,0.0,-4e-06,0.0
14,0.0,0.0,0.0
44,0.0,0.0,-2.029701e-07


In [None]:
# This seems ideal