In [21]:
import pandas  as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder,OneHotEncoder
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.decomposition import PCA

In [22]:
hp_df = pd.read_csv('house_price_data.csv') # dataframe(equivalent to table)

y = hp_df['SalePrice']

hp_df.drop(columns = ["SalePrice",'Id'],inplace = True)

In [23]:
cat_cols = [col for col in hp_df.columns if hp_df[col].dtype == 'object']
con_cols = [col for col in hp_df.columns if hp_df[col].dtype != 'object']

In [24]:
X_train,X_test,y_train,y_test = train_test_split(hp_df,y,random_state = 42, test_size = 0.2)

In [25]:
"""
Fill the NAs
Scaling the continuous features
Encoding the categorical features

"""

'\nFill the NAs\nScaling the continuous features\nEncoding the categorical features\n\n'

In [26]:
for col in X_train.columns:
    if hp_df[col].dtype == 'object':
        X_train[col].fillna(X_train[col].mode()[0],inplace = True)
        X_test[col].fillna(X_train[col].mode()[0],inplace = True) #filling with train mode
        
    if hp_df[col].dtype != 'object':
        X_train[col].fillna(X_train[col].mean(),inplace = True)
        X_test[col].fillna(X_train[col].mean(),inplace = True) #filling with train mode
        
        

In [27]:
# Standard Scaler

scaler = StandardScaler()
min_max_scaler = MinMaxScaler()

for col in con_cols:
    X_train[col] = min_max_scaler.fit_transform(np.array(X_train[col]).reshape(-1,1))
    X_test[col] = min_max_scaler.transform(np.array(X_test[col]).reshape(-1,1))

In [28]:
X_train[con_cols].index

Index([ 254, 1066,  638,  799,  380,  303,   86, 1385,  265,  793,
       ...
        330, 1238,  466,  121, 1044, 1095, 1130, 1294,  860, 1126],
      dtype='int64', length=1168)

In [29]:
# One-Hot Encoding

oe = OneHotEncoder(handle_unknown='ignore')

oe_train = pd.DataFrame(oe.fit_transform(X_train[cat_cols]).toarray(),index = X_train[con_cols].index)
oe_test = pd.DataFrame(oe.transform(X_test[cat_cols]).toarray(),index = X_test[con_cols].index)

In [30]:
train_final = pd.concat([X_train[con_cols],oe_train],axis = 1)
test_final = pd.concat([X_test[con_cols],oe_test],axis = 1)

In [46]:
pca = PCA()
train_final.columns = train_final.columns.astype(str)
test_final.columns = test_final.columns.astype(str)

pca_data = pd.DataFrame(pca.fit_transform(train_final),columns = [f'pc_{i}' for i in range(train_final.shape[1])])
pca_data.iloc[:,:150]

Unnamed: 0,pc_0,pc_1,pc_2,pc_3,pc_4,pc_5,pc_6,pc_7,pc_8,pc_9,...,pc_140,pc_141,pc_142,pc_143,pc_144,pc_145,pc_146,pc_147,pc_148,pc_149
0,-1.540895,-0.850223,-0.771624,-0.864752,-0.331433,0.060157,0.296597,0.979207,0.287200,0.273279,...,-0.005323,-0.017780,0.063305,-0.018636,-0.017451,-0.009389,0.004929,-0.007735,-0.017911,0.069703
1,1.362622,-0.076478,1.569882,-0.207343,-0.771728,1.024823,-0.241543,-0.277348,-0.118839,-0.089815,...,0.006230,0.037575,0.006055,0.013007,-0.022016,0.062071,0.029783,-0.057476,0.015935,-0.005418
2,-1.664255,0.292307,-0.525654,-0.169458,-0.697426,0.329934,0.120448,-0.373653,-0.649086,-0.071712,...,-0.125558,0.018355,0.115259,-0.015916,0.011044,0.036257,-0.073762,0.044586,0.002206,-0.104092
3,-0.663011,0.682286,1.223670,0.399661,-0.345690,-0.478269,1.222104,-0.411722,-0.229501,-0.022093,...,-0.092957,0.027532,-0.028735,0.038542,-0.006118,-0.003613,-0.032968,-0.067395,-0.074694,-0.022914
4,-1.457929,1.048493,0.122868,-0.057240,-0.153243,-0.500387,0.526763,0.176391,-0.032248,0.444287,...,-0.022820,0.003057,0.014581,0.014211,-0.022444,-0.099733,-0.083174,0.041483,-0.042400,-0.098172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,1.865473,-0.316055,-0.165147,-0.432805,-1.323172,-0.561069,-0.035993,-0.049796,-0.012977,-0.054718,...,0.017883,-0.040673,0.012388,-0.006212,-0.008566,0.016766,0.045062,0.033901,0.020580,-0.055018
1164,-1.274578,0.360190,0.755492,-0.247613,0.141571,-0.117174,0.756118,-0.115869,-0.000143,0.098393,...,-0.189398,-0.179251,-0.025235,0.191625,0.056871,-0.005953,-0.002508,-0.008373,-0.168328,-0.096099
1165,-1.906507,-0.158417,-0.923412,0.060016,0.259858,0.294243,0.293268,-0.198255,0.249462,-0.508548,...,-0.120508,-0.114747,0.014879,0.123016,-0.008962,0.097732,0.011930,-0.109818,-0.004415,-0.085518
1166,-0.783620,1.353220,0.501140,0.873846,-0.763231,-0.472255,0.943996,0.501530,-0.692745,0.416277,...,0.023423,0.035891,-0.005837,0.035250,0.006579,0.065410,0.004664,0.029306,0.018793,-0.031640


In [51]:
pca_data_test = pd.DataFrame(pca.transform(test_final),columns = [f'pc_{i}' for i in range(test_final.shape[1])])
pca_data_test.iloc[:,:150]
# linreg = LinearRegression()

# linreg.fit(train_final,y_train)

Unnamed: 0,pc_0,pc_1,pc_2,pc_3,pc_4,pc_5,pc_6,pc_7,pc_8,pc_9,...,pc_140,pc_141,pc_142,pc_143,pc_144,pc_145,pc_146,pc_147,pc_148,pc_149
0,-1.036685,-1.353803,-0.810504,-0.696137,0.334058,0.542591,0.095205,0.225679,-0.152809,0.009011,...,0.087537,0.004932,0.100903,-0.041196,-0.027191,-0.015485,0.049546,-0.071948,0.005234,0.039702
1,1.797633,-0.970827,1.453888,0.523640,0.489673,-0.618909,0.212187,0.381354,-0.398135,0.517930,...,0.103287,-0.047261,-0.000392,-0.077061,0.004096,-0.006339,0.037546,0.028396,-0.032457,-0.057091
2,-2.083479,1.072699,-0.407249,-0.080388,-0.319891,0.111721,0.068602,-0.107758,-0.104139,-0.319603,...,-0.059796,-0.093809,-0.037775,0.109271,-0.022218,0.180933,0.030290,-0.158302,0.073145,0.028329
3,-1.742639,0.826317,0.209404,0.437254,0.056960,-0.331316,0.277828,0.251736,0.682063,-0.067149,...,0.034947,-0.028425,-0.056620,0.002886,-0.198191,0.006891,-0.041000,0.019347,0.131627,-0.057137
4,2.168661,-0.879443,-1.139004,1.331555,0.974712,-0.064908,0.533138,-0.896160,0.400376,-0.127720,...,-0.133639,-0.028306,-0.040945,-0.007460,0.050300,0.127736,0.010895,0.059431,0.052789,-0.046248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,-1.437220,0.902377,-0.963150,0.358032,0.838973,-0.021134,-0.052596,-0.383965,-0.180083,-0.573011,...,0.082936,0.146502,0.205038,-0.166692,0.075451,0.222669,0.138579,-0.035817,0.150204,0.044115
288,2.026877,-0.705260,-1.175035,1.200308,0.376408,-0.520162,0.349308,-0.392566,0.376933,0.023224,...,0.052209,-0.069984,0.054680,0.044714,-0.121613,0.098051,0.167124,0.204211,0.019726,-0.012697
289,1.824676,0.264328,0.292331,-0.630030,0.624294,-0.591496,0.272730,-0.244126,0.106332,0.578055,...,-0.034493,-0.080059,-0.064616,0.096798,0.017322,0.032592,-0.052366,-0.108435,-0.042225,0.016879
290,-1.992042,0.882311,0.307942,0.056878,0.489262,-0.062334,-0.353165,0.301994,0.516052,0.296297,...,0.285686,-0.044888,-0.293077,0.029427,-0.165075,0.048479,0.168243,0.218409,-0.340428,-0.086994


In [None]:
coefficients = linreg.coef_

In [None]:
y_pred = linreg.predict(test_final)

In [None]:
r2_score(y_test,y_pred)

In [None]:
dtree = DecisionTreeRegressor(random_state=42)

dtree.fit(train_final,y_train)

dtree_pred = dtree.predict(test_final)

In [None]:
r2_score(y_test,dtree_pred)

In [None]:
rf_regressor = RandomForestRegressor()

In [None]:
# GridSearchCV
# RandomisedSearchCV
rf_param_dict = {
                'n_estimators': [40,60,70,80],
                'max_depth' : [10,12,14],
                'min_samples_split': [5,6,7,8],
                'max_leaf_nodes': [6,7,8,9]
                }

gcv = GridSearchCV(rf_regressor,param_grid=rf_param_dict,cv = 5,verbose =32)

gcv.fit(train_final,y_train)


In [None]:
gcv.best_params_

gcv.best_score_

In [None]:
# GridSearchCV
# RandomisedSearchCV
rf_param_dict = {
                'n_estimators': [40,60,70,80],
                'max_depth' : [10,12,14],
                'min_samples_split': [5,6,7,8],
                'max_leaf_nodes': [6,7,8,9]
                }

rcv = RandomizedSearchCV(rf_regressor,param_grid=rf_param_dict,cv = 5,verbose =32,n_iter = 100)

rcv.fit(train_final,y_train)