In [212]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import pickle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import r_regression,f_classif
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

## Loading and splitting data

In [213]:
df=pd.read_csv("../data/train.csv")
df_train=df.drop("SalePrice",axis=1)
dftest=pd.read_csv("../data/test.csv")
df_result=pd.read_csv("../data/sample_submission.csv")

In [214]:
X=df.iloc[:,:-1]
Y=df.iloc[:,-1]

In [215]:
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.30,random_state=42)

## Divide features by type

In [216]:
def divide_by_type(df):  
    categorical_features=[features for features in df.columns if df[features].dtype=="O"]
    numerical_features=[features for features in df.columns if df[features].dtype!="O"]
    date_features=[features for features in df.columns if "Yr" in features
                   or "Year" in features 
                   or "Mo" in features]
    features=[]
    for feature in numerical_features:
        if feature not in date_features:
            features.append(feature)
    numerical_features=features
    return categorical_features,numerical_features,date_features

categorical_features,numerical_features,date_features=divide_by_type(xtrain)

## Divide Ordinal features

In [217]:
def divide_ordinal_features(df):    
    ordinal_numerical_features=df[numerical_features].max()[df[numerical_features].max() <= 14].index.tolist()
    ordinal_features=[features for features in df.columns if re.search('Qu$',features)
                      or re.search('QC',features)
                      or re.search('Qual$',features)
                      or re.search('Cond$',features)]
    
    ordinal_categorical_features=[features for features in ordinal_features if df[features].dtype=="O"]
    return ordinal_features, ordinal_numerical_features, ordinal_categorical_features

ordinal_features, ordinal_numerical_features, ordinal_categorical_features=divide_ordinal_features(df)

In [218]:
def update_categorical_and_numerical_features(numerical_features,
                                              categorical_features,
                                              features_to_remove):
    
    update_numerical = []
    for feature in numerical_features:
        if feature not in (features_to_remove):
            update_numerical.append(feature)

    update_categorical = []
    for feature in categorical_features:
        if feature not in features_to_remove:
            update_categorical.append(feature)
    
    update_numerical.remove("MSSubClass")
    update_categorical.append("MSSubClass")
    return update_numerical,update_categorical


features_to_remove = ordinal_categorical_features+ordinal_numerical_features
numerical_features,categorical_features = update_categorical_and_numerical_features(numerical_features,
                                                                                    categorical_features,
                                                                                   features_to_remove)

## Preprocessing Numerical features

In [219]:
def fill_numerical_missing_values(df,numerical_features):
    df_numerical=df[numerical_features ].fillna(0)
    return df_numerical

In [220]:
def fit_scaler_min_max(df,numerical_features):
    scaler=MinMaxScaler()
    scaler.fit(df[numerical_features])
    return scaler

In [221]:
with open('../models/MinMax_scaler.pickle', mode='ab') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(fit_scaler_min_max(df,numerical_features), f, protocol=pickle.HIGHEST_PROTOCOL)

In [222]:
def transform_scaler_min_max(df,scaler):
    df[numerical_features] = scaler.transform(df[numerical_features])
    return df

In [223]:
def pipeline_train_numerical_features(df,numerical_features):
    
    df[numerical_features]=fill_numerical_missing_values(df,numerical_features)
    scaler=fit_scaler_min_max(df,numerical_features)
    df=transform_scaler_min_max(df,scaler)
    return df
    
pipeline_train_numerical_features(xtrain,numerical_features)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
135,0.092529,20,RL,0.255591,0.042534,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,,MnPrv,,0.0,5,2008,WD,Normal
1452,0.995202,180,RM,0.111821,0.011101,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,,,,0.0,5,2006,WD,Normal
762,0.522276,60,FV,0.230032,0.034308,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,,,,0.0,6,2010,Con,Normal
932,0.638794,20,RL,0.268371,0.048470,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,,,,0.0,3,2007,WD,Normal
435,0.298149,60,RL,0.137380,0.043782,Pave,,IR2,Lvl,AllPub,...,0.0,0.0,,,,0.0,4,2009,ConLw,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0.750514,20,RL,0.249201,0.037472,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,,,,0.0,3,2007,WD,Normal
1130,0.774503,50,RL,0.207668,0.030400,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,,MnPrv,,0.0,12,2009,WD,Normal
1294,0.886909,20,RL,0.191693,0.032120,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,,,,0.0,4,2006,WD,Normal
860,0.589445,50,RL,0.175719,0.029643,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,,GdPrv,,0.0,6,2007,WD,Normal


## Preprocessing Categorical features

In [224]:
def fill_missing_categorical_values(df,categorical_features):
    
    for feature in categorical_features:
        if df[feature].isnull().sum()==1: 
            df[feature]=df[feature].fillna(df[feature].mode())
        else:
            df[feature]=df[feature].fillna("Missing")
    return df[categorical_features]

In [225]:
def fit_one_hot_encoding(df,categorical_feature):
    
    enc=OneHotEncoder(handle_unknown="ignore",sparse=False)

    enc.fit(df[categorical_features])
    return enc

In [226]:
with open('../models/OneHotEncoder.pickle', mode='ab') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(fit_one_hot_encoding(df,categorical_features), f, protocol=pickle.HIGHEST_PROTOCOL)

In [227]:
def transform_one_hot(df,enc):
    values=enc.transform(df[categorical_features])
    names=enc.get_feature_names_out(df[categorical_features].columns)
    df1=pd.DataFrame(columns=names)
    df=pd.concat([df,df1],axis=1)
    df[names]=values
    df=df.drop(categorical_features,axis=1)
    return df

In [228]:
def pipeline_categorical_feature(df,categorical_features):
    
    df[categorical_features]=fill_missing_categorical_values(df,categorical_features)
    enc=fit_one_hot_encoding(df,categorical_features)
    df=transform_one_hot(df,enc)
    return df
    
pipeline_categorical_feature(xtrain,categorical_features)

80
228
34
274


Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190
135,0.092529,0.255591,0.042534,7,6,1970,1970,0.208999,TA,TA,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1452,0.995202,0.111821,0.011101,5,5,2005,2005,0.058055,TA,TA,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
762,0.522276,0.230032,0.034308,7,5,2009,2009,0.000000,TA,TA,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
932,0.638794,0.268371,0.048470,9,5,2006,2006,0.219158,Ex,TA,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
435,0.298149,0.137380,0.043782,7,6,1996,1996,0.000000,Gd,TA,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0.750514,0.249201,0.037472,6,5,2006,2006,0.000000,Gd,TA,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1130,0.774503,0.207668,0.030400,4,3,1928,1950,0.000000,TA,TA,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1294,0.886909,0.191693,0.032120,5,7,1955,1990,0.000000,TA,TA,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.589445,0.175719,0.029643,7,8,1918,1998,0.000000,Gd,TA,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
xtrain.shape

(1022, 80)

## Outliers for future testing

In [13]:
dict_outlier={}
for feature in numerical_features:
    # calculate summary statistics
    data_mean, data_std = xtrain[feature].mean(), xtrain[feature].std()
    # define outliers
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off
    dict_outlier[feature]=xtrain[(xtrain[feature] < lower) | (xtrain[feature] > upper) ].index.tolist()


## Feature Selection

### Numerical Features 

In [14]:
corr=r_regression(xtrain[numerical_features],ytrain)
result=np.where(corr > 0.5)[0].tolist()

In [15]:
numerical_features

['Id',
 'LotFrontage',
 'LotArea',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal']

In [16]:
selected_numerical_features=[]
for feature in numerical_features:
    print(feature)
    if numerical_features.index(feature) in result:
        selected_numerical_features.append(feature)
selected_numerical_features

Id
LotFrontage
LotArea
MasVnrArea
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
GarageArea
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
MiscVal


['TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'GarageArea']

For ordinal_numerical_features we could have used the spearman correlation because it'is based on ranks

In [17]:
corr=r_regression(xtrain[ordinal_numerical_features],ytrain)
result1=np.where(corr > 0.5)[0].tolist()


In [18]:
selected_numerical_ordinal_features=[]
for feature in ordinal_numerical_features:
    if ordinal_numerical_features.index(feature) in result1:
        selected_numerical_ordinal_features.append(feature)
selected_numerical_ordinal_features

['OverallQual', 'FullBath', 'TotRmsAbvGrd', 'GarageCars']

In [19]:
corr=r_regression(xtrain[date_features],ytrain)
result2=np.where(corr > 0.5)[0].tolist()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
selected_date_features=[]
for feature in date_features:
    if date_features.index(feature) in result2:
        selected_date_features.append(feature)
selected_date_features

## Categorical Features

In [None]:
dict_enc1={}
for feature in ordinal_categorical_features:
    enc = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=100)
    enc.fit(xtrain[[feature]])
    dict_enc1[feature]=enc


In [None]:
dict_enc={}
for feature in categorical_features:
    enc = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=100)
    enc.fit(xtrain[[feature]])
    dict_enc[feature]=enc

In [None]:
def transform_cat(df1):
    list_name=[]
    for feature,enc in dict_enc.items():
        x=enc.transform(df1[[feature]])
        df1[feature]=x
        
    return df1
cat=transform_cat(xtrain)

In [None]:
cat["ExterCond"]

In [None]:
def transform_ord(df1):
    for feature,enc1 in dict_enc1.items():

        x=enc1.transform(df1[[feature]])
        df1[feature]=x
    return df1
ordinal=transform_ord(xtrain)


In [None]:
xtrain[ordinal_categorical_features]


In [None]:
xtrain[categorical_features].isnull().sum()

## Feature Engineering

In [None]:
classif=f_classif(cat[categorical_features],ytrain)
result2=np.where(classif[1] < 0.007)[0].tolist()
select_categ=[]
for feature in categorical_features:
    if categorical_features.index(feature) in result2:
        select_categ.append(feature)
select_categ


In [None]:
classif=f_classif(xtrain[ordinal_categorical_features],ytrain)

result3=np.where(classif[1] < 0.001)[0].tolist()
select_ord=[]
for feature in ordinal_categorical_features:
    if ordinal_categorical_features.index(feature) in result3:
        select_ord.append(feature)
select_ord

In [None]:
selected_features=["Neighborhood"]+["Utilities"]+selected_date_features+selected_numerical_features+selected_numerical_ordinal_features

In [None]:
selected_features=["Neighborhood","YearBuilt","OverallQual","GarageArea","GrLivArea","TotalBsmtSF","Utilities"]

In [None]:
xtrain=xtrain[selected_features]

In [None]:
scaler=MinMaxScaler()
scaler.fit(xtrain)
xtrain.loc[:,selected_features]=scaler.transform(xtrain)

In [None]:
xtrain[(xtrain["OverallQual"]==1) | (xtrain["OverallQual"]==0)].index

In [None]:
le=LinearRegression()
le.fit(xtrain,ytrain)

## Test Preprocessing

In [None]:
xtest=delete_columns(xtest)

In [None]:

categorical_features=[features for features in xtest.columns if xtest[features].dtype=="O"]
numerical_features=[features for features in xtest.columns if xtest[features].dtype!="O"]
xtest[categorical_features]=xtest[categorical_features].fillna("Missing")
xtest[numerical_features]=xtest[numerical_features].fillna(0)
xtest=transform_cat(xtest)
xtest=transform_ord(xtest)
xtest=xtest[selected_features]
xtest.shape
x=scaler.transform(xtest)
# xtest


In [None]:
xtest[selected_features]=x
ypred=le.predict(xtest)

In [None]:
le.score(xtest,ytest)

In [None]:
ids=dftest["Id"]
dftest=delete_columns(dftest)
categorical_features=[features for features in dftest.columns if dftest[features].dtype=="O"]
numerical_features=[features for features in dftest.columns if dftest[features].dtype!="O"]
dftest[categorical_features]=dftest[categorical_features].fillna("Missing")
dftest[numerical_features]=dftest[numerical_features].fillna(0)
dftest=transform_cat(dftest)
dftest=transform_ord(dftest)
dftest=dftest[selected_features]
dftest[selected_features]=scaler.transform(dftest)






In [None]:
df_result["Id"]=ids
df_result["SalePrice"]=le.predict(dftest)

In [None]:
df_result.to_csv("../data/result1.csv",index=False)

In [None]:
dftest

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)
compute_rmsle(ytest,ypred)