In [92]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

X_train = pd.read_csv('x_train_cleaned.csv')
X_test = pd.read_csv('x_test_cleaned.csv')
y_train = pd.read_csv('y_train_cleaned.csv')
y_test = pd.read_csv('y_test_cleaned.csv')

quality_and_conditions=[]

for columns in X_train.columns:
    if '_Qual' in columns or '_Cond' in columns:
        quality_and_conditions.append(columns)

quality_and_conditions.append('Bsmt_Exposure')
quality_and_conditions.append('Heating_QC')
quality_and_conditions.append('Fireplace_Qu')

X_train.loc[:,'Total_Qual'] = X_train.loc[:, quality_and_conditions].sum(axis=1)

# perform groupby on `Neighborhood`, then aggregate via averaging
neighborhood_scores = X_train.loc[:, ['Neighborhood','Total_Qual']].groupby('Neighborhood').agg(Mean_Agg_Score=('Total_Qual', np.mean))

# sort dataframe by descending order of the mean score
neighborhood_scores.sort_values(by='Mean_Agg_Score', ascending=False, inplace=True)

# extract out neighborhood names from the index
neighborhood_scores.loc[:,'Neighborhood'] = neighborhood_scores.index

# reset index to the usual numeric form
neighborhood_scores.reset_index(drop=True, inplace=True)

# re-arrange columns to being Neighborhood to left-most position
neighborhood_scores = neighborhood_scores.loc[:, ['Neighborhood', 'Mean_Agg_Score']]

display(neighborhood_scores)
#display(X_train['Total_Qual'])

def get_score(x):
    return neighborhood_scores.loc[neighborhood_scores.loc[:,'Neighborhood']==x,'Mean_Agg_Score'].values[0]

# build the newly engineered attribute, call it Neighborhood Score
X_train.loc[:,'Neighborhood_Score'] = X_train.loc[:,'Neighborhood'].apply(get_score)

X_test.loc[:,'Total_Qual'] = X_test.loc[:, quality_and_conditions].sum(axis=1)
# perform groupby on `Neighborhood`, then aggregate via averaging
neighborhood_scores = X_test.loc[:, ['Neighborhood','Total_Qual']].groupby('Neighborhood').agg(Mean_Agg_Score=('Total_Qual', np.mean))

# sort dataframe by descending order of the mean score
neighborhood_scores.sort_values(by='Mean_Agg_Score', ascending=False, inplace=True)

# extract out neighborhood names from the index
neighborhood_scores.loc[:,'Neighborhood'] = neighborhood_scores.index

# reset index to the usual numeric form
neighborhood_scores.reset_index(drop=True, inplace=True)

# re-arrange columns to being Neighborhood to left-most position
neighborhood_scores = neighborhood_scores.loc[:, ['Neighborhood', 'Mean_Agg_Score']]

X_test.loc[:,'Neighborhood_Score'] = X_test.loc[:,'Neighborhood'].apply(get_score)

#display(X_train['Neighborhood_Score'])

#X_train.drop(columns=['Neighborhood'], inplace=True)
#X_test.drop(columns=['Neighborhood'], inplace=True)

Unnamed: 0,Neighborhood,Mean_Agg_Score
0,b'South_and_West_of_Iowa_State_University',127.785714
1,b'Iowa_DOT_and_Rail_Road',49.229167
2,b'Northridge_Heights',40.013699
3,b'Stone_Brook',38.758621
4,b'Old_Town',37.672269
5,b'Greens',37.333333
6,b'Northridge',36.96875
7,b'Bloomington_Heights',36.823529
8,b'Timberland',36.641026
9,b'Veenker',36.625


Creiamo una nuova feature Total_Qual che è pari alla somma di tutti i campi qualitativi
Creiamo una nuova feature Neighborhood_Score che si basa su Total_Qual per assegnargli un valore qualitativo al posto di nominale

In [93]:
porch = ['Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch', 'Three_season_porch', 'Screen_Porch']
X_train.loc[:,'Total_Porch_SF'] = X_train.loc[:, porch].sum(axis=1)
X_test.loc[:,'Total_Porch_SF'] = X_test.loc[:, porch].sum(axis=1)

display(X_train['Total_Porch_SF'])

0       272.0
1       396.0
2       170.0
3       226.0
4       192.0
        ...  
1457     24.0
1458      0.0
1459    120.0
1460    164.0
1461    112.0
Name: Total_Porch_SF, Length: 1462, dtype: float64

In [94]:
surface = ['Total_Finished_Bsmt_SF', 'First_Flr_SF', 'Second_Flr_SF', 'Garage_Area']

X_train['Total_Finished_Bsmt_SF'] = X_train['Total_Bsmt_SF'] - X_train['Bsmt_Unf_SF']
X_test['Total_Finished_Bsmt_SF'] = X_test['Total_Bsmt_SF'] - X_test['Bsmt_Unf_SF']

X_train.loc[:, 'Total_SF'] = X_train.loc[:, surface].sum(axis=1)
X_test.loc[:, 'Total_SF'] = X_test.loc[:, surface].sum(axis=1)
#vediamo se fare anche Total senza unfinished

display(X_train['Total_SF'])

0       2823.0
1       2676.0
2       2536.0
3       2049.0
4       2484.0
         ...  
1457    1630.0
1458    3894.0
1459    2410.0
1460    2011.0
1461    1307.0
Name: Total_SF, Length: 1462, dtype: float64

In [95]:
baths = ['Full_Bath', 'Half_Bath', 'Bsmt_Full_Bath', 'Bsmt_Half_Bath']

X_train.loc[:, 'Total_Baths'] = X_train.loc[:, baths].sum(axis=1)
X_test.loc[:, 'Total_Baths'] = X_test.loc[:, baths].sum(axis=1)

display(X_train['Total_Baths'])

0       2.0
1       3.0
2       3.0
3       2.0
4       3.0
       ... 
1457    2.0
1458    4.0
1459    2.0
1460    2.0
1461    2.0
Name: Total_Baths, Length: 1462, dtype: float64

In [96]:
X_train.loc[:,'Age at sale'] = X_train.loc[:,'Year_Sold'] - X_train.loc[:,'Year_Built']
X_test.loc[:,'Age at sale'] = X_test.loc[:,'Year_Sold'] - X_test.loc[:,'Year_Built']

display(X_train['Age at sale'])

0       50.0
1       12.0
2        9.0
3       18.0
4       20.0
        ... 
1457    36.0
1458    30.0
1459    22.0
1460    23.0
1461    14.0
Name: Age at sale, Length: 1462, dtype: float64

In [97]:
X_train = pd.get_dummies(X_train.loc[:,[col for col in X_train.columns]])
print('x_train_v2 shape after get_dummies: {}'.format(X_train.shape))
# possible that a nominal value is not present in one of the nominal columns, resulting in more/lesser columns generated
X_test = pd.get_dummies(X_test.loc[:,[col for col in X_test.columns]])
print('x_test_v2 shape after get_dummies: {}'.format(X_test.shape))

x_train_v2 shape after get_dummies: (1462, 223)
x_test_v2 shape after get_dummies: (1462, 220)


Effettuiamo one-hot-encoding sulle colonne "Stringa rimaste" non qualitative .

In [98]:
for col in X_train.columns:
    if col not in X_test.columns:
        print('\'{}\' found missing in x_test_v2, initialising new column with 0s.'.format(col))
        X_test.loc[:,col] = 0
print('\n')
for col in X_test.columns:
    if col not in X_train.columns:
        print('\'{}\' found missing in x_train_v2, initialising new column with 0s.'.format(col))
        X_train.loc[:,col] = 0
print('\n')
print('Check if both datasets have the same set of columns: {}'.format(set(X_train.columns) == set(X_test.columns)))
print('\n')
print('Current number of columns: {}'.format(X_train.shape[1]))

# ensure that column sequence is similar too
column_list = list(X_train.columns)
X_test = X_test.loc[:,column_list]

'MS_SubClass_b'One_Story_with_Finished_Attic_All_Ages'' found missing in x_test_v2, initialising new column with 0s.
'Neighborhood_b'Green_Hills'' found missing in x_test_v2, initialising new column with 0s.
'Neighborhood_b'Landmark'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_1st_b'ImStucc'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_1st_b'PreCast'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_1st_b'Stone'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_2nd_b'Other'' found missing in x_test_v2, initialising new column with 0s.
'Exterior_2nd_b'PreCast'' found missing in x_test_v2, initialising new column with 0s.
'Sale_Type_b'Con'' found missing in x_test_v2, initialising new column with 0s.


'MS_SubClass_b'One_and_Half_Story_PUD_All_Ages'' found missing in x_train_v2, initialising new column with 0s.
'Neighborhood_b'Blueste'' found missing in x_train_v2, initialising new column w

Se abbiamo un numero di feature differenti tra train e test risolviamo il problema ...  guardare file git

In [99]:
ss = StandardScaler()

cols = X_train.columns

# perform the normalization
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)

X_train = pd.DataFrame(data=X_train, columns=cols)
X_test = pd.DataFrame(data=X_test, columns=cols)

X_train.to_csv('x_train_preprocessed.csv', index=False)
y_train.to_csv('y_train_preprocessed.csv', index=False)
X_test.to_csv('x_test_preprocessed.csv', index=False)
y_test.to_csv('y_test_preprocessed.csv', index=False)