## Import libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, PolynomialFeatures,OneHotEncoder,LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet,RidgeCV,LassoCV,ElasticNetCV
from sklearn import metrics


## Import Data

In [6]:
#load train_cleaned.csv
train = pd.read_csv('../datasets/train_cleaned.csv')
train.head()

Unnamed: 0,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,...,garage_cars garage_area,garage_cars exter_qual_TA,garage_cars foundation_PConc,garage_cars kitchen_qual_TA,garage_area exter_qual_TA,garage_area foundation_PConc,garage_area kitchen_qual_TA,exter_qual_TA foundation_PConc,exter_qual_TA kitchen_qual_TA,foundation_PConc kitchen_qual_TA
0,69.0552,13517,6,8,1976,2005,289.0,533.0,0.0,192.0,...,950.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,43.0,11492,7,5,1996,1997,132.0,637.0,0.0,276.0,...,1118.0,0.0,2.0,0.0,0.0,559.0,0.0,0.0,0.0,0.0
2,68.0,7922,5,7,1953,2007,0.0,731.0,0.0,326.0,...,246.0,1.0,0.0,0.0,246.0,0.0,0.0,0.0,0.0,0.0
3,73.0,9802,5,5,2006,2007,0.0,0.0,0.0,384.0,...,800.0,2.0,2.0,2.0,400.0,400.0,400.0,1.0,1.0,1.0
4,82.0,14235,6,8,1900,1993,0.0,0.0,0.0,676.0,...,968.0,2.0,2.0,2.0,484.0,484.0,484.0,1.0,1.0,1.0


In [7]:
test = pd.read_csv('../datasets/test_cleaned.csv')
test.head()

Unnamed: 0,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,...,garage_cars garage_area,garage_cars exter_qual_TA,garage_cars foundation_PConc,garage_cars kitchen_qual_TA,garage_area exter_qual_TA,garage_area foundation_PConc,garage_area kitchen_qual_TA,exter_qual_TA foundation_PConc,exter_qual_TA kitchen_qual_TA,foundation_PConc kitchen_qual_TA
0,69.0,9142,6,8,1910,1950,0.0,0,0,1020,...,440.0,1.0,0.0,0.0,440.0,0.0,0.0,0.0,0.0,0.0
1,69.0552,9662,5,4,1977,1977,0.0,0,0,1967,...,1160.0,2.0,0.0,2.0,580.0,0.0,580.0,0.0,1.0,0.0
2,58.0,17104,7,5,2006,2006,0.0,554,0,100,...,852.0,0.0,2.0,0.0,0.0,426.0,0.0,0.0,0.0,0.0
3,60.0,8520,5,6,1923,2006,0.0,0,0,968,...,960.0,0.0,0.0,2.0,0.0,0.0,480.0,0.0,0.0,0.0
4,69.0552,9500,6,5,1963,1963,247.0,609,0,785,...,1028.0,2.0,0.0,2.0,514.0,0.0,514.0,0.0,1.0,0.0


## Pre-processing

### Get_dummies

In [4]:
# ms_subclas to categorical type
train['ms_subclass'] = train.ms_subclass.astype('str')
test['ms_subclass'] = test.ms_subclass.astype('str')

AttributeError: 'DataFrame' object has no attribute 'ms_subclass'

In [None]:
#create categorical_columns / numerical_columns
categorical_columns = [col for col in train.select_dtypes('object').columns]
numerical_columns = [col for col in train.columns if col not in categorical_columns]

In [None]:
train = pd.get_dummies(train,columns=categorical_columns,drop_first=True)
train.head()

In [None]:
train.shape

In [None]:
test = pd.get_dummies(test,columns=categorical_columns,drop_first=True)
test.head()

In [None]:
col_to_add = [col for col in train.columns if (col not in test.columns)&(col !='saleprice')]
len(col_to_add)

In [None]:
col_to_add  # These are the columns that are in train data
            # if these are selected as features, when apply to test data it will generate error

In [None]:
col_to_drop = [col for col in test.columns if (col not in train.columns)&(col !='id')]
col_to_drop # theses columns cannot be used in training model since train data doesn't has these values

In [None]:
len(col_to_drop)

In [None]:
#create new columns in test data and assign values to be 0
for col in col_to_add:
    test[col] = 0
test[col_to_add].head()

In [None]:
#drop unnecessary columns
test.drop(columns=col_to_drop,inplace=True)

In [None]:
test.shape

## Polynomial Features

In [None]:
#look at high correlation columns with sale price
train.corr()[np.abs(train.corr()['saleprice']) > 0.5]['saleprice']

In [None]:
#create polynomial columns deg =2, interaction only
poly_columns = [col for col in train.corr()[np.abs(train.corr()['saleprice']) > 0.5]['saleprice'].index if col != 'saleprice']
poly_columns

In [None]:
len(poly_columns)

In [None]:
#Instantiate PolynomialFeatures + create train_poly_df
poly = PolynomialFeatures(include_bias=False,degree=2,interaction_only=True)
train_poly = poly.fit_transform(train[poly_columns])
train_poly_df = pd.DataFrame(train_poly,columns=poly.get_feature_names(poly_columns))
train_poly_df.head()

In [None]:
#combine this to original data frame
for col in train_poly_df.columns:
    if col not in poly_columns:
        train[col] = train_poly_df[col]
train.shape #original #columns is 274 + #91 new columns (14/2*13) = 365 columns

In [None]:
#Repeat the same for test data
poly = PolynomialFeatures(include_bias=False,degree=2,interaction_only=True)
test_poly = poly.fit_transform(test[poly_columns])
test_poly_df = pd.DataFrame(test_poly,columns=poly.get_feature_names(poly_columns))
test_poly_df.head()

In [None]:
for col in test_poly_df.columns:
    if col not in poly_columns:
        test[col] = test_poly_df[col]
test.shape

## write to csv

In [None]:
#write to csv
train.to_csv('../datasets/train_cleaned_preprocessed.csv',index=False)
test.to_csv('../datasets/test_cleaned_preprocessed.csv',index=False)

#move to next notebook: Project_02_03

In [None]:
test