In [10]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

X = pd.read_csv('x_cleaned_pca.csv')
y = pd.read_csv('y_cleaned_pca.csv')

quality_and_conditions=[]

for columns in X.columns:
    if '_Qual' in columns or '_Cond' in columns:
        quality_and_conditions.append(columns)

quality_and_conditions.remove('Sale_Condition')
quality_and_conditions.remove('Low_Qual_Fin_SF')
quality_and_conditions.append('Bsmt_Exposure')
quality_and_conditions.append('Heating_QC')
quality_and_conditions.append('Fireplace_Qu')

X.loc[:,'All_Quality'] = X.loc[:, quality_and_conditions].sum(axis=1)

# perform groupby on `Neighborhood`, then aggregate via averaging
neighborhood_scores = X.loc[:, ['Neighborhood','All_Quality']].groupby('Neighborhood').agg(Mean_Agg_Score=('All_Quality', np.mean))

# sort dataframe by descending order of the mean score
neighborhood_scores.sort_values(by='Mean_Agg_Score', ascending=False, inplace=True)

# extract out neighborhood names from the index
neighborhood_scores.loc[:,'Neighborhood'] = neighborhood_scores.index

# reset index to the usual numeric form
neighborhood_scores.reset_index(drop=True, inplace=True)

# re-arrange columns to being Neighborhood to left-most position
neighborhood_scores = neighborhood_scores.loc[:, ['Neighborhood', 'Mean_Agg_Score']]


def get_score(x):
    return neighborhood_scores.loc[neighborhood_scores.loc[:,'Neighborhood']==x,'Mean_Agg_Score'].values[0]

X.loc[:,'Neighborhood_Score'] = X.loc[:,'Neighborhood'].apply(get_score)



Creiamo una nuova feature Total_Qual che è pari alla somma di tutti i campi qualitativi
Creiamo una nuova feature Neighborhood_Score che si basa su Total_Qual per assegnargli un valore qualitativo al posto di nominale

TODO

In [11]:
porch = ['Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch', 'Three_season_porch', 'Screen_Porch']
X.loc[:,'Total_External_SF'] = X.loc[:, porch].sum(axis=1)

display(X['Total_External_SF'])

0       272.0
1       260.0
2       429.0
3         0.0
4       246.0
        ...  
2919    120.0
2920    164.0
2921    112.0
2922    278.0
2923    238.0
Name: Total_External_SF, Length: 2924, dtype: float64

# Total_External_SF
Creating a new feature that represents the surface of all porch or decks outside the property, trying to combine similar information into a single feature.

In [12]:
surface = ['Total_Finished_Bsmt_SF', 'First_Flr_SF', 'Second_Flr_SF', 'Garage_Area']
X['Total_Finished_Bsmt_SF'] = X['Total_Bsmt_SF'] - X['Bsmt_Unf_SF']
X.loc[:, 'Total_SF'] = X.loc[:, surface].sum(axis=1)
#vediamo se fare anche Total senza unfinished

display(X['Total_SF'])

0       2823.0
1       2238.0
2       2564.0
3       3697.0
4       2902.0
         ...  
2919    2410.0
2920    2011.0
2921    1307.0
2922    3001.0
2923    3408.0
Name: Total_SF, Length: 2924, dtype: float64

# Total_SF
Creating a new feature that represents the surface of the inside of the property, trying to combine similar information into a single feature.

In [13]:
baths = ['Full_Bath', 'Half_Bath', 'Bsmt_Full_Bath', 'Bsmt_Half_Bath']
X.loc[:, 'Total_Baths'] = X.loc[:, baths].sum(axis=1)

display(X['Total_Baths'])

0       2.0
1       1.0
2       2.0
3       4.0
4       3.0
       ... 
2919    2.0
2920    2.0
2921    2.0
2922    2.0
2923    3.0
Name: Total_Baths, Length: 2924, dtype: float64

# Total_Baths
Creating a new feature that represents the sum of all the types of bathrooms, trying to add new information to the dataset.

In [14]:
X.loc[:,'Year_To_Sell'] = X.loc[:,'Year_Sold'] - X.loc[:,'Year_Built']

display(X['Year_To_Sell'])

0       50.0
1       49.0
2       52.0
3       42.0
4       13.0
        ... 
2919    22.0
2920    23.0
2921    14.0
2922    32.0
2923    13.0
Name: Year_To_Sell, Length: 2924, dtype: float64

# Year_To_Sell
Creating a new feature that represents the span of years required to sell the house, trying to add new information to the dataset.

In [15]:
from sklearn.decomposition import PCA

pca_train = PCA(n_components=215,random_state= 1)
pca_train.fit(X)
pca_train.components_

# project data onto the selected components
X_proj_train = pca_train.transform(X)
#df_train = pd.DataFrame(data=X_proj_train, columns=cols)
df_train = pd.DataFrame(X_proj_train)
display(df_train)



ValueError: could not convert string to float: "b'One_Story_1946_and_Newer_All_Styles'"

# PCA

In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=4, random_state=1, shuffle=True)

for train, test in skf.split(X, y):
    X_train = X.iloc[train]
    y_train = y.iloc[train]
    X_test = X.iloc[test]
    y_test = y.iloc[test]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# SPLIT AFTER PCA

In [None]:
X_train = pd.get_dummies(X_train.loc[:,[col for col in X_train.columns]])
print('x_train_v2 shape after get_dummies: {}'.format(X_train.shape))
X_test = pd.get_dummies(X_test.loc[:,[col for col in X_test.columns]])
print('x_test_v2 shape after get_dummies: {}'.format(X_test.shape))

for col in X_train.columns:
    if col not in X_test.columns:
        print('\'{}\' found missing in x_test_v2, initialising new column with 0s.'.format(col))
        X_test.loc[:,col] = 0
print('\n')
for col in X_test.columns:
    if col not in X_train.columns:
        print('\'{}\' found missing in x_train_v2, initialising new column with 0s.'.format(col))
        X_train.loc[:,col] = 0
print('\n')
print('Check if both datasets have the same set of columns: {}'.format(set(X_train.columns) == set(X_test.columns)))
print('\n')
print('Current number of columns: {}'.format(X_train.shape[1]))

# ensure that column sequence is similar too
column_list = list(X_train.columns)
X_test = X_test.loc[:,column_list]

# One Hot Encoding

Applying the one hot encoding using the get_dummies function of pandas.
Since it's possible that a nominal value is not present in one of the categorical columns, resulting in more or lesser columns generated in train split vs test split.
Assuring to reach the same of features by filling with 0 the ones that are missing.

In [None]:
ss = StandardScaler()

cols = X_train.columns

# perform the normalization
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)

X_train = pd.DataFrame(data=X_train, columns=cols)
X_test = pd.DataFrame(data=X_test, columns=cols)

X_train.to_csv('x_train_preprocessed_pca.csv', index=False)
y_train.to_csv('y_train_preprocessed_pca.csv', index=False)
X_test.to_csv('x_test_preprocessed_pca.csv', index=False)
y_test.to_csv('y_test_preprocessed_pca.csv', index=False)

# Standard Scaler
Standardizing features by removing the mean and scaling to unit variance (-1, 1). Work best with normal variance features

In [None]:
from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()

cols = X_train.columns

# perform the normalization
X_train = mm.fit_transform(X_train)
X_test = mm.fit_transform(X_test)

X_train = pd.DataFrame(data=X_train, columns=cols)
X_test = pd.DataFrame(data=X_test, columns=cols)

X_train.to_csv('x_train_preprocessed_minmax_pca.csv', index=False)
y_train.to_csv('y_train_preprocessed_minmax_pca.csv', index=False)
X_test.to_csv('x_test_preprocessed_minmax_pca.csv', index=False)
y_test.to_csv('y_test_preprocessed_minmax_pca.csv', index=False)

# Min Max Scaler
Transforming features by scaling each feature to a given range (0, 1).