In [1]:
import pandas as pd 
import numpy as np
import dill

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from dython.nominal import associations
from dython.nominal import identify_nominal_columns
from dython.data_utils import identify_columns_with_na

from functions import initialize
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import dill
from plotly.express.colors import sample_colorscale
from sklearn.preprocessing import minmax_scale



In [2]:
X_train,X_test,house_df=initialize()
 

In [3]:
X_train=X_train[sorted(X_train)]

# FEATURE ENGINEERING
- Per ogni feature che da la possibilità di estrapolare una nuova feature di tipo binaria cerchiamo di estrapolarla
e verifichiamo se può essere un predittore interessante per il nostro modello
- Inoltre cerchiamo di raggruppare della varianza frammentata in diverse features in una sola creando una nuova feature
- Visualizzazione grafica tramite plot

In [4]:
'''
Create new Feature Has_Pool based on Pool_QC
Has_Pool confirm if a House has a pool or not
'''
def transform_pool(df):
    df["Has_Pool"]=np.where(df["Pool_QC"].str.contains('No_Pool'), 0 , 1) #create new column, fill with 1 if house has Pool, else 0
    #print(len(house_df[house_df['Pool_QC']=='Excellent'])) #4 house have excellent pool
    #df.drop("Pool_QC", axis=1, inplace=True) #drop "Pool_Quality" feature, keep only "Has_Pool" feature

In [5]:
transform_pool(X_train)

In [6]:
fig=px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price", color="Pool_QC",width=1000,facet_col="Has_Pool")
fig.show()

In [7]:
'''
Merge all bath features in one quantitative feature
'''
def transform_bathroom(df):
    #get all bath features
    bath=[x for x in df.columns if 'Bath' in x] 
    #sum all feature values and save it inside total bath feature
    #df['TotBath'] = df['FullBath'] + (0.5* df['HalfBath']) + df['BsmtFullBath'] + (0.5*df['BsmtHalfBath']) 
    df['Total_Bath']=0
    for x in bath :
        if 'Half' in x :
            df['Total_Bath'] += df[x]*0.5
        else:
            df['Total_Bath'] += df[x]
            
    #drop old bath features
    #df.drop(['Full_Bath', 'Half_Bath', 'Bsmt_Full_Bath', 'Bsmt_Half_Bath'], axis=1, inplace=True)

In [8]:
transform_bathroom(X_train)

In [9]:
fig=px.box(X_train, x="Total_Bath", y="Sale_Price",color="Total_Bath", points='all', height=800, width=1100)
fig.show()
X_train['Total_Bath'] = X_train['Total_Bath'].astype(str)
fig1=px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price", color="Total_Bath", width=1100, marginal_y='box')
fig1.show()
X_train['Total_Bath'] = X_train['Total_Bath'].astype(float)

In [10]:
def transform_garage(df):
    df["Has_garage"]=np.where(df["Garage_Area"]==0, 0 , 1) #create new column, fill with 1 if house has Garage, else 0


In [11]:
transform_garage(X_train)

In [12]:

X_train['Has_garage'] = X_train['Has_garage'].astype(str)
fig=px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price", color="Has_garage", width=700,marginal_y="box")
fig.show()
X_train['Has_garage'] = X_train['Has_garage'].astype(float)
X_train['Has_garage'].value_counts()

1.0    2074
0.0     123
Name: Has_garage, dtype: int64

In [13]:
'''
Encode feature Central_Air, put 1 if Central_Air is present else 0
'''
def transform_CentralAir(df):
    df["Central_Air"]=np.where(df["Central_Air"].str.contains('N'), 0 , 1) 


In [14]:
transform_CentralAir(X_train)

In [15]:
X_train['Central_Air'] = X_train['Central_Air'].astype(str)
fig=px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price", color="Central_Air", width=700,marginal_y="box")
fig.show()
X_train['Central_Air'] = X_train['Central_Air'].astype(float)

X_train['Central_Air'].value_counts()

1.0    2045
0.0     152
Name: Central_Air, dtype: int64

In [16]:
def transform_Alley(df):
    df['Has_Alley'] = np.where(df['Alley'] == 'No_Alley_Access' , 0 , 1)

In [17]:
transform_Alley(X_train)

In [18]:
X_train['Has_Alley'] = X_train['Has_Alley'].astype(str)
fig =px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price",color="Has_Alley",marginal_y='box', height=600, width=1200)
fig.show()
X_train['Has_Alley'] = X_train['Has_Alley'].astype(float)
X_train['Has_Alley'].value_counts()

0.0    2043
1.0     154
Name: Has_Alley, dtype: int64

In [19]:
def transform_Basement(df):
    df['Has_Basement'] = np.where(df['Bsmt_Cond'] == 'No_Basement' , 0 , 1)

In [20]:
transform_Basement(X_train)

In [21]:
X_train['Has_Basement'] = X_train['Has_Basement'].astype(str)
fig =px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price",color="Has_Basement",marginal_y='box', height=600, width=1200)
fig.show()
X_train['Has_Basement'] = X_train['Has_Basement'].astype(float)
X_train['Has_Basement'].value_counts()

1.0    2142
0.0      55
Name: Has_Basement, dtype: int64

In [22]:
def transform_Fireplace(df):
    df['Has_Fireplace'] = np.where(df['Fireplaces'] == 0 , 0 , 1)
    

In [23]:
transform_Fireplace(X_train)

In [24]:
X_train['Has_Fireplace'] = X_train['Has_Fireplace'].astype(str)
fig =px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price",color="Has_Fireplace",marginal_y='box', trendline='ols',height=600, width=1200)
fig.show()
X_train['Has_Fireplace'] = X_train['Has_Fireplace'].astype(float)
X_train['Has_Fireplace'].value_counts()

1.0    1147
0.0    1050
Name: Has_Fireplace, dtype: int64

In [25]:
def transform_Fence(df):
    df['Has_Fence'] = np.where(df['Fence'] == 'No_Fence' , 0 , 1)
    

In [26]:
transform_Fence(X_train)

In [27]:
X_train['Has_Fence'] = X_train['Has_Fence'].astype(str)
fig =px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price",color="Has_Fence",marginal_y='box', trendline='ols',height=600, width=1200)
fig.show()
X_train['Has_Fence'] = X_train['Has_Fence'].astype(float)
X_train['Has_Fence'].value_counts()

0.0    1771
1.0     426
Name: Has_Fence, dtype: int64

In [28]:
def transform_Remodeled(df):
    df['Is_Remodeled'] = np.where(df['Year_Remod_Add'] == df['Year_Built'], 0, 1)
    df['House_Age'] = df['Year_Sold'] - df['Year_Built'] #fa riferimento all'età della casa quando è stata acquistata
    #df.drop('Year_Built', inplace=True)

In [29]:
transform_Remodeled(X_train)

In [30]:
X_train['Is_Remodelled'] = X_train['Is_Remodelled'].astype(str)
fig =px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price",color="Is_Remodelled", marginal_y='box',trendline='ols', height=600, width=1200)
fig.show()
X_train['Is_Remodelled'] = X_train['Is_Remodelled'].astype(float)

KeyError: 'Is_Remodelled'

In [None]:
X_train['Is_Remodelled'] = X_train['Is_Remodelled'].astype(str)
fig =px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price",color="Is_Remodelled",facet_col="Overall_Qual" , facet_col_wrap=3,trendline='ols',height=1600, width=1200)
fig.show()
X_train['Is_Remodelled'] = X_train['Is_Remodelled'].astype(float)

Sembra che il fatto che una casa sia stata restrutturata non abbia un impatto così significativo sul prezzo di vendita della casa stessa

In [None]:
'''
Create new feature that indicate the house's total Square Feet
'''
def transform_SF(df):
    SF_features=[x for x in df.columns if 'SF' in x] 
    df['Total_SF'] = df[SF_features].sum(axis=1) 

transform_SF(X_train)

In [None]:
fig=px.box(X_train, x="Neighborhood", y="Sale_Price",color="Neighborhood", height=800, width=1500)
fig.show()

X_train['Neighborhood'].value_counts()

North_Ames                                 340
College_Creek                              196
Old_Town                                   187
Edwards                                    147
Somerset                                   129
Gilbert                                    117
Northridge_Heights                         117
Sawyer                                     106
Northwest_Ames                             104
Sawyer_West                                 95
Brookside                                   83
Mitchell                                    83
Crawford                                    79
Iowa_DOT_and_Rail_Road                      71
Timberland                                  55
Northridge                                  53
South_and_West_of_Iowa_State_University     39
Stone_Brook                                 37
Clear_Creek                                 35
Meadow_Village                              28
Bloomington_Heights                         24
Veenker      

In [None]:
fig=px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price",color="House_Style", height=1000, width=1000, facet_row="House_Style")
fig.show()
X_train['House_Style'].value_counts()

One_Story           1109
Two_Story            638
One_and_Half_Fin     248
SLvl                  98
SFoyer                64
Two_and_Half_Unf      18
One_and_Half_Unf      16
Two_and_Half_Fin       6
Name: House_Style, dtype: int64

In [None]:
fig=px.box(X_train, x="Overall_Qual", y="Sale_Price",color="Overall_Qual", height=800, width=1200, points="all",
                 category_orders={"Overall_Qual":["Very_Poor", "Poor", "Fair", "Below_Average","Average","Above_Average","Good","Very_Good","Excellent","Very_Excellent" ]})
fig.show()

fig1 =px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price",color="Overall_Qual", height=600, width=1000)
fig1.show()

X_train['Overall_Qual'].value_counts()

Average           624
Above_Average     574
Good              436
Very_Good         261
Below_Average     167
Excellent          71
Fair               25
Very_Excellent     23
Poor               12
Very_Poor           4
Name: Overall_Qual, dtype: int64

In [None]:
fig =px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price",color="MS_Zoning",marginal_y='box', height=600, width=1200)
fig.show()
X_train['MS_Zoning'].value_counts()

Residential_Low_Density         1695
Residential_Medium_Density       364
Floating_Village_Residential      97
Residential_High_Density          21
C_all                             16
A_agr                              2
I_all                              2
Name: MS_Zoning, dtype: int64

In [None]:
fig =px.scatter(X_train, x="Gr_Liv_Area", y="Sale_Price",color="Condition_1",facet_col='Condition_1', facet_col_wrap=3, trendline='ols',height=1000, width=1000)
fig.show()
(X_train['Condition_1'] + X_train['Condition_1']).value_counts()

NormNorm        1874
FeedrFeedr       137
ArteryArtery      75
RRAnRRAn          37
PosNPosN          28
RRAeRRAe          20
PosAPosA          15
RRNnRRNn           6
RRNeRRNe           5
Name: Condition_1, dtype: int64

In [32]:
X_train.to_csv('dataset/train_data_FE.csv', index=False)

In [33]:
X_train_FE = pd.read_csv('dataset/train_data_FE.csv')

In [None]:
def save_session() :
    dill.dump_session('notebook_env.db')

def load_session():
    dill.load_session('notebook_env.db')

In [None]:
#save_session()

In [None]:
#load_session()