In [7]:
import pandas as pd

In [13]:
df_train = pd.read_csv('../../../datasets/regtrain.csv')
df_test = pd.read_csv('../../../datasets/regtest.csv')

In [14]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Missing Values

In [15]:
df_train.isna().sum().to_dict()

{'Id': 0,
 'MSSubClass': 0,
 'MSZoning': 0,
 'LotFrontage': 259,
 'LotArea': 0,
 'Street': 0,
 'Alley': 1369,
 'LotShape': 0,
 'LandContour': 0,
 'Utilities': 0,
 'LotConfig': 0,
 'LandSlope': 0,
 'Neighborhood': 0,
 'Condition1': 0,
 'Condition2': 0,
 'BldgType': 0,
 'HouseStyle': 0,
 'OverallQual': 0,
 'OverallCond': 0,
 'YearBuilt': 0,
 'YearRemodAdd': 0,
 'RoofStyle': 0,
 'RoofMatl': 0,
 'Exterior1st': 0,
 'Exterior2nd': 0,
 'MasVnrType': 8,
 'MasVnrArea': 8,
 'ExterQual': 0,
 'ExterCond': 0,
 'Foundation': 0,
 'BsmtQual': 37,
 'BsmtCond': 37,
 'BsmtExposure': 38,
 'BsmtFinType1': 37,
 'BsmtFinSF1': 0,
 'BsmtFinType2': 38,
 'BsmtFinSF2': 0,
 'BsmtUnfSF': 0,
 'TotalBsmtSF': 0,
 'Heating': 0,
 'HeatingQC': 0,
 'CentralAir': 0,
 'Electrical': 1,
 '1stFlrSF': 0,
 '2ndFlrSF': 0,
 'LowQualFinSF': 0,
 'GrLivArea': 0,
 'BsmtFullBath': 0,
 'BsmtHalfBath': 0,
 'FullBath': 0,
 'HalfBath': 0,
 'BedroomAbvGr': 0,
 'KitchenAbvGr': 0,
 'KitchenQual': 0,
 'TotRmsAbvGrd': 0,
 'Functional': 0,
 'Fir

In [135]:
df_train=df_train[df_train['SalePrice']<487000]

In [181]:
df_train=df_train[df_train['SalePrice']<478000]

In [182]:
columns_numeric = list(df_train.select_dtypes(exclude='object').columns)
columns_numeric.remove('Id')
columns_numeric.remove('SalePrice')
columns_categorical = list(df_train.select_dtypes(include='object').columns)

In [183]:
from sklearn.impute import SimpleImputer
imputer_numeric = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

imputer_numeric.fit(df_train[columns_numeric]) # median of every column
imputer_cat.fit(df_train[columns_categorical]) # mode of every column

df_train[columns_numeric] = imputer_numeric.transform(df_train[columns_numeric]) # fills missing data
df_train[columns_categorical] = imputer_cat.transform(df_train[columns_categorical]) # fill

In [184]:
df_train.isna().sum().sum()

0

In [185]:
df_test[columns_numeric] = imputer_numeric.transform(df_test[columns_numeric]) # fills missing data
df_test[columns_categorical] = imputer_cat.transform(df_test[columns_categorical]) # fill

In [186]:
df_test.isna().sum().sum()

0

# Feature Encoding - Categorical =>OneHotEncoding

In [187]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)
encoder.fit(df_train[columns_categorical]) # finds new column names(all categories)

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [188]:
new_columns = list(encoder.get_feature_names())



In [189]:
len(new_columns)

252

In [190]:
df_train[new_columns] = encoder.transform(df_train[columns_categorical])
df_test[new_columns] = encoder.transform(df_test[columns_categorical])

In [191]:
df_train.shape

(1450, 333)

# Feature Scaling - Numeric

In [192]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df_train[columns_numeric])
df_train[columns_numeric] = scaler.transform(df_train[columns_numeric])
df_test[columns_numeric] = scaler.transform(df_test[columns_numeric])

In [193]:
columns_selected = columns_numeric + new_columns
X_train = df_train[columns_selected]
y_train = df_train['SalePrice']
X_test = df_test[columns_selected]
# y_test = 

In [194]:
len(columns_selected)

540

# Feature Reduction - PCA

In [195]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.94)
pca.fit(X_train)  # Finds out principal components (Eigen Vectors)

PCA(n_components=0.94)

In [196]:
pca.n_components_

93

In [197]:
X_train_red = pca.transform(X_train)
X_test_red = pca.transform(X_test)

In [198]:
X_train_red.shape

(1450, 93)

# Model Building

In [199]:
from sklearn.svm import SVR
model = SVR(kernel='rbf')
model.fit(X_train_red,y_train)
model.score(X_train_red,y_train)

-0.04776874788151875

In [200]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train_red,y_train)
model.score(X_train_red,y_train)

0.9644716006500328

In [201]:
yp = model.predict(X_test_red)

In [202]:
df_submit = pd.DataFrame({'Id':df_test['Id'],'SalePrice':yp})

In [203]:
df_submit.to_csv('sub7.csv',index=False)