### This works on the input columns only 

In [1]:
import pandas as pd
import numpy as np

### This model will be get used to impute missing values 

In [2]:
from sklearn.linear_model import LinearRegression

### After //10000 our range of the values for columns got reduced 

In [3]:
df = pd.read_csv('50_Startups.csv')[['R&D Spend','Administration','Marketing Spend','Profit']]//10000

In [4]:
df.shape

(50, 4)

In [5]:
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
Profit             0
dtype: int64

### Here in this data set we don't have 

In [6]:
df2 = df.sample(5)
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
46,0.0,11.0,29.0,4.0
43,1.0,12.0,3.0,6.0
32,6.0,12.0,4.0,9.0
4,14.0,9.0,36.0,16.0
27,7.0,12.0,35.0,10.0


In [7]:
df2.shape

(5, 4)

### Let's introduce some null values in our data set bcz we don't have any null values 
- First seperate the input col from output col

- We are using not null data set bcz we want to check how effective this technique is 

In [8]:
df3 = df2.iloc[:,0:-1]
df3.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
46,0.0,11.0,29.0
43,1.0,12.0,3.0
32,6.0,12.0,4.0
4,14.0,9.0,36.0
27,7.0,12.0,35.0


In [9]:
df3.iloc[1,0] = np.NaN
df3.iloc[3,1] = np.NaN
df3.iloc[-1,-1] = np.NaN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.iloc[1,0] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.iloc[3,1] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.iloc[-1,-1] = np.NaN


In [10]:
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
46,0.0,11.0,29.0
43,,12.0,3.0
32,6.0,12.0,4.0
4,14.0,,36.0
27,7.0,12.0,


## Step 1:
- Fill the null values with the mean of that column

#### First create a simple data frame that we will be using for the prediction of the first col 

In [11]:
df0 = pd.DataFrame()
df0['R&D'] = df3['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Adminstration'] = df3['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df3['Marketing Spend'].fillna(df['Marketing Spend'].mean())

In [12]:
df0

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,11.0,29.0
43,6.96,12.0,3.0
32,6.0,12.0,4.0
4,14.0,11.62,36.0
27,7.0,12.0,20.62


### Now let's remove the the imputed mean value from the first column 
### to predict the value of first column we will train our model on all the rows here value is not Null 

In [13]:
df1 = df0.copy()


# changing mean with nan value
df1.iloc[1,0] = np.NaN
df1

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,11.0,29.0
43,,12.0,3.0
32,6.0,12.0,4.0
4,14.0,11.62,36.0
27,7.0,12.0,20.62


### we are using only those row with non-null values 

In [14]:
x = df1.iloc[[0,2,3,4], 1:3]
x

Unnamed: 0,Adminstration,Marketing Spend
46,11.0,29.0
32,12.0,4.0
4,11.62,36.0
27,12.0,20.62


### Target variable is our Zero col so 

In [15]:
y = df1.iloc[[0,2,3,4], 0]
y

46     0.0
32     6.0
4     14.0
27     7.0
Name: R&D, dtype: float64

### Let's fit a linear Regression on the model 

In [16]:
lr = LinearRegression()
lr.fit(x,y)
lr.predict(df1.iloc[1,1:3].values.reshape(1,2))



array([3.75779595])

In [17]:
df1.iloc[1,0] = 18.05

In [18]:
df1

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,11.0,29.0
43,18.05,12.0,3.0
32,6.0,12.0,4.0
4,14.0,11.62,36.0
27,7.0,12.0,20.62


### Now perform the same steps with the second and third column 

In [19]:
df1.iloc[3,1] = np.NaN
x = df.iloc[[0,1,2,4], [0,2]]
x

Unnamed: 0,R&D Spend,Marketing Spend
0,16.0,47.0
1,16.0,44.0
2,15.0,40.0
4,14.0,36.0


In [20]:
y = df1.iloc[[0,1,2,4],1]
y

46    11.0
43    12.0
32    12.0
27    12.0
Name: Adminstration, dtype: float64

In [21]:
lr = LinearRegression()
lr.fit(x,y)
lr.predict(df1.iloc[3,[0,2]].values.reshape(1,2))



array([12.])

In [22]:
df1.iloc[3,1] = 8
df1

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,11.0,29.0
43,18.05,12.0,3.0
32,6.0,12.0,4.0
4,14.0,8.0,36.0
27,7.0,12.0,20.62


### Performing for the third column now 

In [23]:
df1.iloc[-1,-1] = np.NaN
df1

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,11.0,29.0
43,18.05,12.0,3.0
32,6.0,12.0,4.0
4,14.0,8.0,36.0
27,7.0,12.0,


In [24]:
x = df1.iloc[0:4,:2]
x

Unnamed: 0,R&D,Adminstration
46,0.0,11.0
43,18.05,12.0
32,6.0,12.0
4,14.0,8.0


In [25]:
y = df1.iloc[0:4,2]
y

46    29.0
43     3.0
32     4.0
4     36.0
Name: Marketing Spend, dtype: float64

In [26]:
lr = LinearRegression()
lr.fit(x,y)
lr.predict(df1.iloc[4,0:2].values.reshape(1,2))



array([9.70758025])

In [27]:
df1.iloc[-1,-1] = 23.32
df1

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,11.0,29.0
43,18.05,12.0,3.0
32,6.0,12.0,4.0
4,14.0,8.0,36.0
27,7.0,12.0,23.32


### Check for the loss 
- Subtract the df1 from df0 

In [28]:
df1- df0

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,0.0,0.0
43,11.09,0.0,0.0
32,0.0,0.0,0.0
4,0.0,-3.62,0.0
27,0.0,0.0,2.7


### Now again apply the previous steps and try to reduce the error or loss 

In [29]:
df2 = df1.copy()

In [30]:
# changing the previous predicted value with nan value
df2.iloc[1,0] = np.NaN
df2

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,11.0,29.0
43,,12.0,3.0
32,6.0,12.0,4.0
4,14.0,8.0,36.0
27,7.0,12.0,23.32


In [31]:
x = df2.iloc[[0,2,3,4], 1:3]
y = df2.iloc[[0,2,3,4], 0]
lr = LinearRegression()
lr.fit(x,y)
lr.predict(df2.iloc[1,1:3].values.reshape(1,2))



array([6.82101122])

In [32]:
df2.iloc[1,0] = 5.841
df2

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,11.0,29.0
43,5.841,12.0,3.0
32,6.0,12.0,4.0
4,14.0,8.0,36.0
27,7.0,12.0,23.32


## Second Column again 

In [33]:
df2.iloc[3,1] = np.NaN
x = df2.iloc[[0,1,2,4], [0,2]]
y = df1.iloc[[0,1,2,4],1]
lr = LinearRegression()
lr.fit(x,y)
lr.predict(df2.iloc[3,[0,2]].values.reshape(1,2))



array([12.86158151])

In [34]:
df2.iloc[3,1] = 13.15
df2

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,11.0,29.0
43,5.841,12.0,3.0
32,6.0,12.0,4.0
4,14.0,13.15,36.0
27,7.0,12.0,23.32


## Third Column 

In [35]:
df2.iloc[-1,-1] = np.NaN
x = df1.iloc[0:4,:2]
y = df1.iloc[0:4,2]

lr = LinearRegression()
lr.fit(x,y)
lr.predict(df2.iloc[4,0:2].values.reshape(1,2))



array([9.70758025])

In [36]:
df2.iloc[-1,-1] = 16.45
df2

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,11.0,29.0
43,5.841,12.0,3.0
32,6.0,12.0,4.0
4,14.0,13.15,36.0
27,7.0,12.0,16.45


### Again Check the error 

In [37]:
df2 - df1

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,0.0,0.0
43,-12.209,0.0,0.0
32,0.0,0.0,0.0
4,0.0,5.15,0.0
27,0.0,0.0,-6.87


In [39]:
df1 - df0

Unnamed: 0,R&D,Adminstration,Marketing Spend
46,0.0,0.0,0.0
43,11.09,0.0,0.0
32,0.0,0.0,0.0
4,0.0,-3.62,0.0
27,0.0,0.0,2.7


### We perform these task again and again untill error become zero or near to zero
### or we can run a specific number of iteration 

- Perform this untill desired results are found
- Stop when further iteration didn't effect the performance of the system
- After the prediction model, the new data set is serve as a base data for the next phase 