### Liner Regression

This notebook focuses on fitting the data to a linear regressor and getting the results. We'll apply the model to all the original and processed dataset and compare the results.

In [1]:
import pandas as pd
import numpy as np
from process_data import *

df = pd.read_csv('dataset/train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
#Handling NA for both numerical and categorical data.
cat_cols = (df.select_dtypes(include='object')).columns
df[cat_cols] = df[cat_cols].fillna('NA')

num_cols = (df.select_dtypes(exclude='object')).columns
df[num_cols] = df[num_cols].fillna(0)

df.drop(df.columns[0], axis=1, inplace=True)

###### Applying LR to unprocessed dataset first on the training data itself using test partition

In [3]:
#Convert categorical data using one hot encoding
feat1 = apply_one_hot_encoding(df, df)
df_non_ohe = df.select_dtypes(exclude='object')
feat2 = df_non_ohe.to_numpy()
final_feat = np.concatenate([feat1, feat2], axis=1)
X_train = final_feat[:,0:-1]
Y_train = final_feat[:,-1].reshape(-1,1)

In [4]:
#Scaling
from sklearn.preprocessing import StandardScaler
standardScalerX = StandardScaler()
X_train = standardScalerX.fit_transform(X_train)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.1, random_state=0)

In [6]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
y_pred = regressor.predict(X_test)

In [8]:
df_pred = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
df_pred

Unnamed: 0,Actual,Predicted
0,200624.0,263950.625
1,133000.0,150526.625
2,110000.0,106670.625
3,192000.0,206430.625
4,88000.0,80702.625
...,...,...
141,145000.0,116094.625
142,217000.0,239294.625
143,150500.0,148798.625
144,108959.0,145918.625


In [9]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 591256424683216.9
Mean Squared Error: 5.103928731688196e+31
Root Mean Squared Error: 7144178561380025.0


In [10]:
df['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [11]:
df_pred['Predicted'].describe()

count    1.460000e+02
mean     5.912564e+14
std      7.144179e+15
min      3.129462e+04
25%      1.271026e+05
50%      1.613256e+05
75%      2.079866e+05
max      8.632344e+16
Name: Predicted, dtype: float64

In [12]:
df_tmp = df_pred[(df_pred < 10000000).all(axis=1)]

In [13]:
y_test = (df_tmp['Actual'].to_numpy()).reshape(-1,1)
y_pred = (df_tmp['Predicted'].to_numpy()).reshape(-1,1)

In [14]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 20869.64224137931
Mean Squared Error: 1166158960.4716594
Root Mean Squared Error: 34149.06968676686


In [15]:
df['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [16]:
#Calculating what factor of the average target value is the RMSE
34149/180931

0.18874045907003223

###### Applying LR to the IQR dataset and validating by partitioning the training dataset itself


In [66]:
X_train, y_train = process_df_iqr(df)
standardScaler_iqr = StandardScaler()
X_train = standardScaler_iqr.fit_transform(X_train)

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [68]:
y_pred = regressor.predict(X_test)
df_pred = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
df_pred

Unnamed: 0,Actual,Predicted
0,145900.0,159151.0
1,205000.0,193186.0
2,120000.0,112347.0
3,149900.0,146204.0
4,269790.0,249038.0
...,...,...
223,122000.0,132978.0
224,169500.0,165060.0
225,178000.0,177455.0
226,139000.0,123739.0


In [75]:
df_tmp = df_pred[(df_pred < 10000000).all(axis=1)]
df_tmp = df_tmp[(df_pred > 0).all(axis=1)]
y_test = (df_tmp['Actual'].to_numpy()).reshape(-1,1)
y_pred = (df_tmp['Predicted'].to_numpy()).reshape(-1,1)

  


In [76]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 13257.390134529149
Mean Squared Error: 322984142.50224215
Root Mean Squared Error: 17971.75958280775
