In [129]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

X_train = pd.read_csv('x_train_cleaned.csv')
X_test = pd.read_csv('x_test_cleaned.csv')
y_train = pd.read_csv('y_train_cleaned.csv')
y_test = pd.read_csv('y_test_cleaned.csv')

quality_and_conditions=[]

for columns in X_train.columns:
    if '_Qual' in columns or '_Cond' in columns:
        quality_and_conditions.append(columns)

quality_and_conditions.remove('Sale_Condition')
quality_and_conditions.remove('Low_Qual_Fin_SF')
quality_and_conditions.append('Bsmt_Exposure')
quality_and_conditions.append('Heating_QC')
quality_and_conditions.append('Fireplace_Qu')

X_train.loc[:,'All_Quality'] = X_train.loc[:, quality_and_conditions].sum(axis=1)

# perform groupby on `Neighborhood`, then aggregate via averaging
neighborhood_scores = X_train.loc[:, ['Neighborhood','All_Quality']].groupby('Neighborhood').agg(Mean_Agg_Score=('All_Quality', np.mean))

# sort dataframe by descending order of the mean score
neighborhood_scores.sort_values(by='Mean_Agg_Score', ascending=False, inplace=True)

# extract out neighborhood names from the index
neighborhood_scores.loc[:,'Neighborhood'] = neighborhood_scores.index

# reset index to the usual numeric form
neighborhood_scores.reset_index(drop=True, inplace=True)

# re-arrange columns to being Neighborhood to left-most position
neighborhood_scores = neighborhood_scores.loc[:, ['Neighborhood', 'Mean_Agg_Score']]


def get_score(x):
    return neighborhood_scores.loc[neighborhood_scores.loc[:,'Neighborhood']==x,'Mean_Agg_Score'].values[0]

# build the newly engineered attribute, call it Neighborhood Score
X_train.loc[:,'Neighborhood_Score'] = X_train.loc[:,'Neighborhood'].apply(get_score)
display(X_train)


X_test.loc[:,'All_Quality'] = X_test.loc[:, quality_and_conditions].sum(axis=1)

# perform groupby on `Neighborhood`, then aggregate via averaging
neighborhood_scores = X_test.loc[:, ['Neighborhood','All_Quality']].groupby('Neighborhood').agg(Mean_Agg_Score=('All_Quality', np.mean))

# sort dataframe by descending order of the mean score
neighborhood_scores.sort_values(by='Mean_Agg_Score', ascending=False, inplace=True)

# extract out neighborhood names from the index
neighborhood_scores.loc[:,'Neighborhood'] = neighborhood_scores.index

# reset index to the usual numeric form
neighborhood_scores.reset_index(drop=True, inplace=True)

# re-arrange columns to being Neighborhood to left-most position
neighborhood_scores = neighborhood_scores.loc[:, ['Neighborhood', 'Mean_Agg_Score']]

X_test.loc[:,'Neighborhood_Score'] = X_test.loc[:,'Neighborhood'].apply(get_score)
display(X_test)


Unnamed: 0,MS_SubClass,MS_Zoning,Lot_Frontage,Lot_Area,Lot_Shape,Land_Contour,Lot_Config,Neighborhood,Condition_1,Bldg_Type,House_Style,Overall_Qual,Overall_Cond,Year_Built,Year_Remod_Add,Roof_Style,Exterior_1st,Exterior_2nd,Mas_Vnr_Type,Mas_Vnr_Area,Exter_Qual,Exter_Cond,Foundation,Bsmt_Qual,Bsmt_Cond,Bsmt_Exposure,BsmtFin_Type_1,BsmtFin_SF_1,BsmtFin_Type_2,BsmtFin_SF_2,Bsmt_Unf_SF,Total_Bsmt_SF,Heating_QC,First_Flr_SF,Second_Flr_SF,Low_Qual_Fin_SF,Gr_Liv_Area,Bsmt_Full_Bath,Bsmt_Half_Bath,Full_Bath,Half_Bath,Bedroom_AbvGr,Kitchen_AbvGr,Kitchen_Qual,Fireplaces,Fireplace_Qu,Garage_Type,Garage_Finish,Garage_Area,Garage_Qual,Wood_Deck_SF,Open_Porch_SF,Enclosed_Porch,Three_season_porch,Screen_Porch,Pool_Area,Fence,Misc_Val,Year_Sold,Sale_Type,Sale_Condition,All_Quality,Neighborhood_Score
0,b'One_Story_1946_and_Newer_All_Styles',b'Residential_Low_Density',141.0,31770.0,b'Slightly_Irregular',b'Lvl',b'Corner',b'North_Ames',b'Norm',b'OneFam',b'One_Story',5,4,1960.0,1960.0,b'Hip',b'BrkFace',b'Plywood',b'Stone',112.0,1,2,b'CBlock',3,4,4,b'BLQ',2.0,b'Unf',0.0,441.0,1080.0,1,1656.0,0.0,0.0,1656.0,1.0,0.0,1.0,0.0,3.0,1.0,2,2.0,4,b'Attchd',b'Fin',528.0,3,210.0,62.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2010.0,b'WD ',b'Normal',33,28.696864
1,b'One_Story_1946_and_Newer_All_Styles',b'Residential_High_Density',80.0,11622.0,b'Regular',b'Lvl',b'Inside',b'North_Ames',b'Feedr',b'OneFam',b'One_Story',4,5,1961.0,1961.0,b'Gable',b'VinylSd',b'VinylSd',b'None',0.0,1,2,b'CBlock',3,3,1,b'Rec',6.0,b'LwQ',144.0,270.0,882.0,2,896.0,0.0,0.0,896.0,0.0,0.0,1.0,0.0,2.0,1.0,2,0.0,0,b'Attchd',b'Unf',730.0,3,140.0,0.0,0.0,0.0,120.0,0.0,b'Minimum_Privacy',0.0,2010.0,b'WD ',b'Normal',26,28.696864
2,b'One_Story_1946_and_Newer_All_Styles',b'Residential_Low_Density',81.0,14267.0,b'Slightly_Irregular',b'Lvl',b'Corner',b'North_Ames',b'Norm',b'OneFam',b'One_Story',5,5,1958.0,1958.0,b'Hip',b'Wd Sdng',b'Wd Sdng',b'BrkFace',108.0,1,2,b'CBlock',3,3,1,b'ALQ',1.0,b'Unf',0.0,406.0,1329.0,2,1329.0,0.0,0.0,1329.0,0.0,0.0,1.0,1.0,3.0,1.0,3,0.0,0,b'Attchd',b'Unf',312.0,3,393.0,36.0,0.0,0.0,0.0,0.0,b'No_Fence',12500.0,2010.0,b'WD ',b'Normal',28,28.696864
3,b'Two_Story_1946_and_Newer',b'Residential_Low_Density',74.0,13830.0,b'Slightly_Irregular',b'Lvl',b'Inside',b'Gilbert',b'Norm',b'OneFam',b'Two_Story',4,4,1997.0,1998.0,b'Gable',b'VinylSd',b'VinylSd',b'None',0.0,1,2,b'PConc',4,3,1,b'GLQ',3.0,b'Unf',0.0,137.0,928.0,3,928.0,701.0,0.0,1629.0,0.0,0.0,2.0,1.0,3.0,1.0,2,1.0,3,b'Attchd',b'Fin',482.0,3,212.0,34.0,0.0,0.0,0.0,0.0,b'Minimum_Privacy',0.0,2010.0,b'WD ',b'Normal',30,33.517857
4,b'Two_Story_1946_and_Newer',b'Residential_Low_Density',78.0,9978.0,b'Slightly_Irregular',b'Lvl',b'Inside',b'Gilbert',b'Norm',b'OneFam',b'Two_Story',5,5,1998.0,1998.0,b'Gable',b'VinylSd',b'VinylSd',b'BrkFace',20.0,1,2,b'PConc',3,3,1,b'GLQ',3.0,b'Unf',0.0,324.0,926.0,4,926.0,678.0,0.0,1604.0,0.0,0.0,2.0,1.0,3.0,1.0,3,1.0,4,b'Attchd',b'Fin',470.0,3,360.0,36.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2010.0,b'WD ',b'Normal',34,33.517857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1945,b'Duplex_All_Styles_and_Ages',b'Residential_Low_Density',63.0,9297.0,b'Regular',b'Lvl',b'Inside',b'Mitchell',b'Norm',b'Duplex',b'One_Story',4,4,1976.0,1976.0,b'Gable',b'Plywood',b'Plywood',b'None',0.0,1,2,b'CBlock',3,3,1,b'ALQ',1.0,b'Unf',0.0,122.0,1728.0,2,1728.0,0.0,0.0,1728.0,2.0,0.0,2.0,0.0,4.0,2.0,2,0.0,0,b'Detchd',b'Unf',560.0,3,0.0,0.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2006.0,b'WD ',b'Family',25,29.649351
1946,b'One_Story_1946_and_Newer_All_Styles',b'Residential_Low_Density',80.0,17400.0,b'Regular',b'Low',b'Inside',b'Mitchell',b'Norm',b'OneFam',b'One_Story',4,4,1977.0,1977.0,b'Gable',b'BrkFace',b'BrkFace',b'None',0.0,1,2,b'CBlock',3,3,1,b'ALQ',1.0,b'Unf',0.0,190.0,1126.0,1,1126.0,0.0,0.0,1126.0,1.0,0.0,2.0,0.0,3.0,1.0,2,1.0,4,b'Attchd',b'RFn',484.0,3,295.0,41.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2006.0,b'WD ',b'Normal',28,29.649351
1947,b'Split_or_Multilevel',b'Residential_Low_Density',37.0,7937.0,b'Slightly_Irregular',b'Lvl',b'CulDSac',b'Mitchell',b'Norm',b'OneFam',b'SLvl',5,5,1984.0,1984.0,b'Gable',b'HdBoard',b'HdBoard',b'None',0.0,1,2,b'CBlock',3,3,3,b'GLQ',3.0,b'Unf',0.0,184.0,1003.0,2,1003.0,0.0,0.0,1003.0,1.0,0.0,1.0,0.0,3.0,1.0,2,0.0,0,b'Detchd',b'Unf',588.0,3,120.0,0.0,0.0,0.0,0.0,0.0,b'Good_Privacy',0.0,2006.0,b'WD ',b'Normal',29,29.649351
1948,b'One_Story_1946_and_Newer_All_Styles',b'Residential_Low_Density',0.0,8885.0,b'Slightly_Irregular',b'Low',b'Inside',b'Mitchell',b'Norm',b'OneFam',b'One_Story',4,4,1983.0,1983.0,b'Gable',b'HdBoard',b'HdBoard',b'None',0.0,1,2,b'CBlock',4,3,3,b'BLQ',2.0,b'ALQ',324.0,239.0,864.0,2,902.0,0.0,0.0,902.0,1.0,0.0,1.0,0.0,2.0,1.0,2,0.0,0,b'Attchd',b'Unf',484.0,3,164.0,0.0,0.0,0.0,0.0,0.0,b'Minimum_Privacy',0.0,2006.0,b'WD ',b'Normal',28,29.649351


Unnamed: 0,MS_SubClass,MS_Zoning,Lot_Frontage,Lot_Area,Lot_Shape,Land_Contour,Lot_Config,Neighborhood,Condition_1,Bldg_Type,House_Style,Overall_Qual,Overall_Cond,Year_Built,Year_Remod_Add,Roof_Style,Exterior_1st,Exterior_2nd,Mas_Vnr_Type,Mas_Vnr_Area,Exter_Qual,Exter_Cond,Foundation,Bsmt_Qual,Bsmt_Cond,Bsmt_Exposure,BsmtFin_Type_1,BsmtFin_SF_1,BsmtFin_Type_2,BsmtFin_SF_2,Bsmt_Unf_SF,Total_Bsmt_SF,Heating_QC,First_Flr_SF,Second_Flr_SF,Low_Qual_Fin_SF,Gr_Liv_Area,Bsmt_Full_Bath,Bsmt_Half_Bath,Full_Bath,Half_Bath,Bedroom_AbvGr,Kitchen_AbvGr,Kitchen_Qual,Fireplaces,Fireplace_Qu,Garage_Type,Garage_Finish,Garage_Area,Garage_Qual,Wood_Deck_SF,Open_Porch_SF,Enclosed_Porch,Three_season_porch,Screen_Porch,Pool_Area,Fence,Misc_Val,Year_Sold,Sale_Type,Sale_Condition,All_Quality,Neighborhood_Score
0,b'One_Story_1946_and_Newer_All_Styles',b'Residential_Low_Density',93.0,11160.0,b'Regular',b'Lvl',b'Corner',b'North_Ames',b'Norm',b'OneFam',b'One_Story',6,4,1968.0,1968.0,b'Hip',b'BrkFace',b'BrkFace',b'None',0.0,2,2,b'CBlock',3,3,1,b'ALQ',1.0,b'Unf',0.0,1045.0,2110.0,4,2110.0,0.0,0.0,2110.0,1.0,0.0,2.0,1.0,3.0,1.0,4,2.0,3,b'Attchd',b'Fin',522.0,3,0.0,0.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2010.0,b'WD ',b'Normal',35,28.570513
1,b'Two_Story_1946_and_Newer',b'Residential_Low_Density',60.0,7500.0,b'Regular',b'Lvl',b'Inside',b'Gilbert',b'Norm',b'OneFam',b'Two_Story',6,4,1999.0,1999.0,b'Gable',b'VinylSd',b'VinylSd',b'None',0.0,1,2,b'PConc',3,3,1,b'Unf',7.0,b'Unf',0.0,994.0,994.0,3,1028.0,776.0,0.0,1804.0,0.0,0.0,2.0,1.0,3.0,1.0,3,1.0,3,b'Attchd',b'Fin',442.0,3,140.0,60.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2010.0,b'WD ',b'Normal',32,33.754717
2,b'Two_Story_1946_and_Newer',b'Residential_Low_Density',75.0,10000.0,b'Slightly_Irregular',b'Lvl',b'Corner',b'Gilbert',b'Norm',b'OneFam',b'Two_Story',5,4,1993.0,1994.0,b'Gable',b'HdBoard',b'HdBoard',b'None',0.0,1,2,b'PConc',4,3,1,b'Unf',7.0,b'Unf',0.0,763.0,763.0,3,763.0,892.0,0.0,1655.0,0.0,0.0,2.0,1.0,3.0,1.0,2,1.0,3,b'Attchd',b'Fin',440.0,3,157.0,84.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2010.0,b'WD ',b'Normal',31,33.754717
3,b'One_Story_1946_and_Newer_All_Styles',b'Residential_Low_Density',0.0,7980.0,b'Slightly_Irregular',b'Lvl',b'Inside',b'Gilbert',b'Norm',b'OneFam',b'One_Story',5,6,1992.0,2007.0,b'Gable',b'HdBoard',b'HdBoard',b'None',0.0,1,3,b'PConc',4,3,1,b'ALQ',1.0,b'Unf',0.0,233.0,1168.0,4,1187.0,0.0,0.0,1187.0,1.0,0.0,2.0,0.0,3.0,1.0,2,0.0,0,b'Attchd',b'Fin',420.0,3,483.0,21.0,0.0,0.0,0.0,0.0,b'Good_Privacy',500.0,2010.0,b'WD ',b'Normal',32,33.754717
4,b'Two_Story_1946_and_Newer',b'Residential_Low_Density',63.0,8402.0,b'Slightly_Irregular',b'Lvl',b'Inside',b'Gilbert',b'Norm',b'OneFam',b'Two_Story',5,4,1998.0,1998.0,b'Gable',b'VinylSd',b'VinylSd',b'None',0.0,1,2,b'PConc',4,3,1,b'Unf',7.0,b'Unf',0.0,789.0,789.0,3,789.0,676.0,0.0,1465.0,0.0,0.0,2.0,1.0,3.0,1.0,2,1.0,4,b'Attchd',b'Fin',393.0,3,0.0,75.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2010.0,b'WD ',b'Normal',32,33.754717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
969,b'Duplex_All_Styles_and_Ages',b'Residential_Low_Density',0.0,11836.0,b'Slightly_Irregular',b'Lvl',b'Corner',b'Mitchell',b'Norm',b'Duplex',b'One_Story',4,4,1970.0,1970.0,b'Gable',b'Plywood',b'Plywood',b'None',0.0,1,2,b'CBlock',3,3,1,b'BLQ',2.0,b'Unf',0.0,1503.0,1652.0,2,1652.0,0.0,0.0,1652.0,0.0,0.0,2.0,0.0,4.0,2.0,2,0.0,0,b'More_Than_Two_Types',b'Unf',928.0,3,0.0,0.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2006.0,b'WD ',b'Normal',25,29.648649
970,b'Two_Story_PUD_1946_and_Newer',b'Residential_Medium_Density',21.0,1936.0,b'Regular',b'Lvl',b'Inside',b'Meadow_Village',b'Norm',b'Twnhs',b'Two_Story',3,6,1970.0,1970.0,b'Gable',b'CemntBd',b'CmentBd',b'None',0.0,1,2,b'CBlock',3,3,1,b'Unf',7.0,b'Unf',0.0,546.0,546.0,3,546.0,546.0,0.0,1092.0,0.0,0.0,1.0,1.0,3.0,1.0,2,0.0,0,b'No_Garage',b'No_Garage',0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2006.0,b'WD ',b'Normal',24,26.833333
971,b'One_Story_1946_and_Newer_All_Styles',b'Residential_Low_Density',160.0,20000.0,b'Regular',b'Lvl',b'Inside',b'Mitchell',b'Norm',b'OneFam',b'One_Story',4,6,1960.0,1996.0,b'Gable',b'VinylSd',b'VinylSd',b'None',0.0,1,2,b'CBlock',3,3,1,b'ALQ',1.0,b'Unf',0.0,0.0,1224.0,4,1224.0,0.0,0.0,1224.0,1.0,0.0,1.0,0.0,4.0,1.0,2,1.0,3,b'Detchd',b'Unf',576.0,3,474.0,0.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2006.0,b'WD ',b'Abnorml',32,29.648649
972,b'One_Story_1946_and_Newer_All_Styles',b'Residential_Low_Density',77.0,10010.0,b'Regular',b'Lvl',b'Inside',b'Mitchell',b'Norm',b'OneFam',b'One_Story',4,4,1974.0,1975.0,b'Gable',b'HdBoard',b'HdBoard',b'None',0.0,1,2,b'CBlock',4,3,3,b'ALQ',1.0,b'LwQ',123.0,195.0,1389.0,3,1389.0,0.0,0.0,1389.0,1.0,0.0,1.0,0.0,2.0,1.0,2,1.0,3,b'Attchd',b'RFn',418.0,3,240.0,38.0,0.0,0.0,0.0,0.0,b'No_Fence',0.0,2006.0,b'WD ',b'Normal',32,29.648649


Creiamo una nuova feature Total_Qual che è pari alla somma di tutti i campi qualitativi
Creiamo una nuova feature Neighborhood_Score che si basa su Total_Qual per assegnargli un valore qualitativo al posto di nominale

TODO

In [130]:
porch = ['Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch', 'Three_season_porch', 'Screen_Porch']
X_train.loc[:,'Total_External_SF'] = X_train.loc[:, porch].sum(axis=1)
X_test.loc[:,'Total_External_SF'] = X_test.loc[:, porch].sum(axis=1)

display(X_train['Total_External_SF'])

0       272.0
1       260.0
2       429.0
3       246.0
4       396.0
        ...  
1945      0.0
1946    336.0
1947    120.0
1948    164.0
1949    112.0
Name: Total_External_SF, Length: 1950, dtype: float64

# Total_External_SF
Creating a new feature that represents the surface of all porch or decks outside the property, trying to combine similar information into a single feature.

In [131]:
surface = ['Total_Finished_Bsmt_SF', 'First_Flr_SF', 'Second_Flr_SF', 'Garage_Area']

X_train['Total_Finished_Bsmt_SF'] = X_train['Total_Bsmt_SF'] - X_train['Bsmt_Unf_SF']
X_test['Total_Finished_Bsmt_SF'] = X_test['Total_Bsmt_SF'] - X_test['Bsmt_Unf_SF']

X_train.loc[:, 'Total_SF'] = X_train.loc[:, surface].sum(axis=1)
X_test.loc[:, 'Total_SF'] = X_test.loc[:, surface].sum(axis=1)
#vediamo se fare anche Total senza unfinished

display(X_train['Total_SF'])

0       2823.0
1       2238.0
2       2564.0
3       2902.0
4       2676.0
         ...  
1945    3894.0
1946    2546.0
1947    2410.0
1948    2011.0
1949    1307.0
Name: Total_SF, Length: 1950, dtype: float64

# Total_SF
Creating a new feature that represents the surface of the inside of the property, trying to combine similar information into a single feature.

In [132]:
baths = ['Full_Bath', 'Half_Bath', 'Bsmt_Full_Bath', 'Bsmt_Half_Bath']

X_train.loc[:, 'Total_Baths'] = X_train.loc[:, baths].sum(axis=1)
X_test.loc[:, 'Total_Baths'] = X_test.loc[:, baths].sum(axis=1)

display(X_train['Total_Baths'])

0       2.0
1       1.0
2       2.0
3       3.0
4       3.0
       ... 
1945    4.0
1946    3.0
1947    2.0
1948    2.0
1949    2.0
Name: Total_Baths, Length: 1950, dtype: float64

# Total_Baths
Creating a new feature that represents the sum of all the types of bathrooms, trying to add new information to the dataset.

In [133]:
X_train.loc[:,'Year_To_Sell'] = X_train.loc[:,'Year_Sold'] - X_train.loc[:,'Year_Built']
X_test.loc[:,'Year_To_Sell'] = X_test.loc[:,'Year_Sold'] - X_test.loc[:,'Year_Built']

display(X_train['Year_To_Sell'])

0       50.0
1       49.0
2       52.0
3       13.0
4       12.0
        ... 
1945    30.0
1946    29.0
1947    22.0
1948    23.0
1949    14.0
Name: Year_To_Sell, Length: 1950, dtype: float64

# Year_To_Sell
Creating a new feature that represents the span of years required to sell the house, trying to add new information to the dataset.

In [134]:
X_train = pd.get_dummies(X_train.loc[:,[col for col in X_train.columns]])
print('x_train_v2 shape after get_dummies: {}'.format(X_train.shape))
X_test = pd.get_dummies(X_test.loc[:,[col for col in X_test.columns]])
print('x_test_v2 shape after get_dummies: {}'.format(X_test.shape))

for col in X_train.columns:
    if col not in X_test.columns:
        print('\'{}\' found missing in x_test_v2, initialising new column with 0s.'.format(col))
        X_test.loc[:,col] = 0
print('\n')
for col in X_test.columns:
    if col not in X_train.columns:
        print('\'{}\' found missing in x_train_v2, initialising new column with 0s.'.format(col))
        X_train.loc[:,col] = 0
print('\n')
print('Check if both datasets have the same set of columns: {}'.format(set(X_train.columns) == set(X_test.columns)))
print('\n')
print('Current number of columns: {}'.format(X_train.shape[1]))

# ensure that column sequence is similar too
column_list = list(X_train.columns)
X_test = X_test.loc[:,column_list]

x_train_v2 shape after get_dummies: (1950, 227)
x_test_v2 shape after get_dummies: (974, 215)
'MS_SubClass_b'One_Story_with_Finished_Attic_All_Ages'' found missing in x_test_v2, initialising new column with 0s.
'MS_SubClass_b'One_and_Half_Story_PUD_All_Ages'' found missing in x_test_v2, initialising new column with 0s.
'MS_Zoning_b'I_all'' found missing in x_test_v2, initialising new column with 0s.
'Neighborhood_b'Green_Hills'' found missing in x_test_v2, initialising new column with 0s.
'Neighborhood_b'Greens'' found missing in x_test_v2, initialising new column with 0s.
'Neighborhood_b'Landmark'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_1st_b'AsphShn'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_1st_b'ImStucc'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_1st_b'PreCast'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_1st_b'Stone'' found missing in x_test_v2, initialising new co

# One Hot Encoding

Applying the one hot encoding using the get_dummies function of pandas.
Since it's possible that a nominal value is not present in one of the categorical columns, resulting in more or lesser columns generated in train split vs test split.
Assuring to reach the same of features by filling with 0 the ones that are missing

In [135]:
ss = StandardScaler()

cols = X_train.columns

# perform the normalization
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)

X_train = pd.DataFrame(data=X_train, columns=cols)
X_test = pd.DataFrame(data=X_test, columns=cols)

X_train.to_csv('x_train_preprocessed.csv', index=False)
y_train.to_csv('y_train_preprocessed.csv', index=False)
X_test.to_csv('x_test_preprocessed.csv', index=False)
y_test.to_csv('y_test_preprocessed.csv', index=False)

# Standard Scaler
Standardizing features by removing the mean and scaling to unit variance (-1, 1). Work best with normal variance features

In [136]:
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()

cols = X_train.columns

# perform the normalization
X_train = mm.fit_transform(X_train)
X_test = mm.fit_transform(X_test)

X_train = pd.DataFrame(data=X_train, columns=cols)
X_test = pd.DataFrame(data=X_test, columns=cols)

X_train.to_csv('x_train_preprocessed_minmax.csv', index=False)
y_train.to_csv('y_train_preprocessed_minmax.csv', index=False)
X_test.to_csv('x_test_preprocessed_minmax.csv', index=False)
y_test.to_csv('y_test_preprocessed_minmax.csv', index=False)

# Min Max Scaler
Transforming features by scaling each feature to a given range (0, 1).