In [4]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

X_train = pd.read_csv('x_train_cleaned.csv')
X_test = pd.read_csv('x_test_cleaned.csv')
y_train = pd.read_csv('y_train_cleaned.csv')
y_test = pd.read_csv('y_test_cleaned.csv')

quality_and_conditions=[]

for columns in X_train.columns:
    if '_Qual' in columns or '_Cond' in columns:
        quality_and_conditions.append(columns)

quality_and_conditions.remove('Sale_Condition')
quality_and_conditions.remove('Low_Qual_Fin_SF')
quality_and_conditions.append('Bsmt_Exposure')
quality_and_conditions.append('Heating_QC')
quality_and_conditions.append('Fireplace_Qu')

X_train.loc[:,'All_Quality'] = X_train.loc[:, quality_and_conditions].sum(axis=1)

# perform groupby on `Neighborhood`, then aggregate via averaging
neighborhood_scores = X_train.loc[:, ['Neighborhood','All_Quality']].groupby('Neighborhood').agg(Mean_Agg_Score=('All_Quality', np.mean))

# sort dataframe by descending order of the mean score
neighborhood_scores.sort_values(by='Mean_Agg_Score', ascending=False, inplace=True)

# extract out neighborhood names from the index
neighborhood_scores.loc[:,'Neighborhood'] = neighborhood_scores.index

# reset index to the usual numeric form
neighborhood_scores.reset_index(drop=True, inplace=True)

# re-arrange columns to being Neighborhood to left-most position
neighborhood_scores = neighborhood_scores.loc[:, ['Neighborhood', 'Mean_Agg_Score']]

display(neighborhood_scores)
display(X_train['All_Quality'])

def get_score(x):
    return neighborhood_scores.loc[neighborhood_scores.loc[:,'Neighborhood']==x,'Mean_Agg_Score'].values[0]

# build the newly engineered attribute, call it Neighborhood Score
X_train.loc[:,'Neighborhood_Score'] = X_train.loc[:,'Neighborhood']

X_test.loc[:,'All_Quality'] = X_test.loc[:, quality_and_conditions].sum(axis=1)
# perform groupby on `Neighborhood`, then aggregate via averaging
neighborhood_scores = X_test.loc[:, ['Neighborhood','All_Quality']].groupby('Neighborhood').agg(Mean_Agg_Score=('All_Quality', np.mean))

# sort dataframe by descending order of the mean score
neighborhood_scores.sort_values(by='Mean_Agg_Score', ascending=False, inplace=True)

# extract out neighborhood names from the index
neighborhood_scores.loc[:,'Neighborhood'] = neighborhood_scores.index

# reset index to the usual numeric form
neighborhood_scores.reset_index(drop=True, inplace=True)

# re-arrange columns to being Neighborhood to left-most position
neighborhood_scores = neighborhood_scores.loc[:, ['Neighborhood', 'Mean_Agg_Score']]

X_test.loc[:,'Neighborhood_Score'] = X_test.loc[:,'Neighborhood'].apply(get_score)

#display(X_train['Neighborhood_Score'])

#X_train.drop(columns=['Neighborhood'], inplace=True)
#X_test.drop(columns=['Neighborhood'], inplace=True)

#Creiamo una nuova feature Total_Qual che è pari alla somma di tutti i campi qualitativi
#Creiamo una nuova feature Neighborhood_Score che si basa su Total_Qual per assegnargli un valore qualitativo al posto di nominale

Unnamed: 0,Neighborhood,Mean_Agg_Score
0,b'Northridge_Heights',40.008696
1,b'Stone_Brook',38.85
2,b'Northridge',37.62963
3,b'Greens',37.142857
4,b'Bloomington_Heights',37.083333
5,b'Veenker',37.052632
6,b'Blueste',36.333333
7,b'Timberland',36.222222
8,b'Somerset',35.431818
9,b'Crawford',33.828571


0       33
1       26
2       28
3       34
4       34
        ..
2188    25
2189    28
2190    29
2191    28
2192    25
Name: All_Quality, Length: 2193, dtype: int64

Creiamo una nuova feature Total_Qual che è pari alla somma di tutti i campi qualitativi
Creiamo una nuova feature Neighborhood_Score che si basa su Total_Qual per assegnargli un valore qualitativo al posto di nominale

TODO

In [5]:
porch = ['Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch', 'Three_season_porch', 'Screen_Porch']
X_train.loc[:,'Total_External_SF'] = X_train.loc[:, porch].sum(axis=1)
X_test.loc[:,'Total_External_SF'] = X_test.loc[:, porch].sum(axis=1)

display(X_train['Total_External_SF'])

0       272.0
1       260.0
2       429.0
3       396.0
4       170.0
        ...  
2188      0.0
2189    336.0
2190    120.0
2191    164.0
2192    112.0
Name: Total_External_SF, Length: 2193, dtype: float64

# Total_External_SF
Creating a new feature that represents the surface of all porch or decks outside the property, trying to combine similar information into a single feature.

In [6]:
surface = ['Total_Finished_Bsmt_SF', 'First_Flr_SF', 'Second_Flr_SF', 'Garage_Area']

X_train['Total_Finished_Bsmt_SF'] = X_train['Total_Bsmt_SF'] - X_train['Bsmt_Unf_SF']
X_test['Total_Finished_Bsmt_SF'] = X_test['Total_Bsmt_SF'] - X_test['Bsmt_Unf_SF']

X_train.loc[:, 'Total_SF'] = X_train.loc[:, surface].sum(axis=1)
X_test.loc[:, 'Total_SF'] = X_test.loc[:, surface].sum(axis=1)
#vediamo se fare anche Total senza unfinished

display(X_train['Total_SF'])

0       2823.0
1       2238.0
2       2564.0
3       2676.0
4       2536.0
         ...  
2188    3894.0
2189    2546.0
2190    2410.0
2191    2011.0
2192    1307.0
Name: Total_SF, Length: 2193, dtype: float64

# Total_SF
Creating a new feature that represents the surface of the inside of the property, trying to combine similar information into a single feature.

In [7]:
baths = ['Full_Bath', 'Half_Bath', 'Bsmt_Full_Bath', 'Bsmt_Half_Bath']

X_train.loc[:, 'Total_Baths'] = X_train.loc[:, baths].sum(axis=1)
X_test.loc[:, 'Total_Baths'] = X_test.loc[:, baths].sum(axis=1)

display(X_train['Total_Baths'])

0       2.0
1       1.0
2       2.0
3       3.0
4       3.0
       ... 
2188    4.0
2189    3.0
2190    2.0
2191    2.0
2192    2.0
Name: Total_Baths, Length: 2193, dtype: float64

# Total_Baths
Creating a new feature that represents the sum of all the types of bathrooms, trying to add new information to the dataset.

In [8]:
X_train.loc[:,'Year_To_Sell'] = X_train.loc[:,'Year_Sold'] - X_train.loc[:,'Year_Built']
X_test.loc[:,'Year_To_Sell'] = X_test.loc[:,'Year_Sold'] - X_test.loc[:,'Year_Built']

display(X_train['Year_To_Sell'])

0       50.0
1       49.0
2       52.0
3       12.0
4        9.0
        ... 
2188    30.0
2189    29.0
2190    22.0
2191    23.0
2192    14.0
Name: Year_To_Sell, Length: 2193, dtype: float64

# Year_To_Sell
Creating a new feature that represents the span of years required to sell the house, trying to add new information to the dataset.

In [9]:
X_train = pd.get_dummies(X_train.loc[:,[col for col in X_train.columns]])
print('x_train_v2 shape after get_dummies: {}'.format(X_train.shape))
X_test = pd.get_dummies(X_test.loc[:,[col for col in X_test.columns]])
print('x_test_v2 shape after get_dummies: {}'.format(X_test.shape))

for col in X_train.columns:
    if col not in X_test.columns:
        print('\'{}\' found missing in x_test_v2, initialising new column with 0s.'.format(col))
        X_test.loc[:,col] = 0
print('\n')
for col in X_test.columns:
    if col not in X_train.columns:
        print('\'{}\' found missing in x_train_v2, initialising new column with 0s.'.format(col))
        X_train.loc[:,col] = 0
print('\n')
print('Check if both datasets have the same set of columns: {}'.format(set(X_train.columns) == set(X_test.columns)))
print('\n')
print('Current number of columns: {}'.format(X_train.shape[1]))

# ensure that column sequence is similar too
column_list = list(X_train.columns)
X_test = X_test.loc[:,column_list]

x_train_v2 shape after get_dummies: (2193, 254)
x_test_v2 shape after get_dummies: (731, 214)
'MS_SubClass_b'One_Story_with_Finished_Attic_All_Ages'' found missing in x_test_v2, initialising new column with 0s.
'MS_SubClass_b'One_and_Half_Story_PUD_All_Ages'' found missing in x_test_v2, initialising new column with 0s.
'MS_Zoning_b'A_agr'' found missing in x_test_v2, initialising new column with 0s.
'MS_Zoning_b'I_all'' found missing in x_test_v2, initialising new column with 0s.
'Neighborhood_b'Green_Hills'' found missing in x_test_v2, initialising new column with 0s.
'Neighborhood_b'Landmark'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_1st_b'AsphShn'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_1st_b'ImStucc'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_1st_b'PreCast'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_1st_b'Stone'' found missing in x_test_v2, initialising new column

# One Hot Encoding

Applying the one hot encoding using the get_dummies function of pandas.
Since it's possible that a nominal value is not present in one of the categorical columns, resulting in more or lesser columns generated in train split vs test split.
Assuring to reach the same of features by filling with 0 the ones that are missing

In [10]:
ss = StandardScaler()

cols = X_train.columns

# perform the normalization
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)

X_train = pd.DataFrame(data=X_train, columns=cols)
X_test = pd.DataFrame(data=X_test, columns=cols)

X_train.to_csv('x_train_preprocessed.csv', index=False)
y_train.to_csv('y_train_preprocessed.csv', index=False)
X_test.to_csv('x_test_preprocessed.csv', index=False)
y_test.to_csv('y_test_preprocessed.csv', index=False)

# Standard Scaler
Standardizing features by removing the mean and scaling to unit variance (-1, 1). Work best with normal variance features

In [11]:
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()

cols = X_train.columns

# perform the normalization
X_train = mm.fit_transform(X_train)
X_test = mm.fit_transform(X_test)

X_train = pd.DataFrame(data=X_train, columns=cols)
X_test = pd.DataFrame(data=X_test, columns=cols)

X_train.to_csv('x_train_preprocessed_minmax.csv', index=False)
y_train.to_csv('y_train_preprocessed_minmax.csv', index=False)
X_test.to_csv('x_test_preprocessed_minmax.csv', index=False)
y_test.to_csv('y_test_preprocessed_minmax.csv', index=False)

# Min Max Scaler
Transforming features by scaling each feature to a given range (0, 1).