In [383]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import pickle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import r_regression,f_classif
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

## Loading and Splitting Data

In [384]:
df=pd.read_csv("../data/train.csv")
df_train=df.drop("SalePrice",axis=1)
dftest=pd.read_csv("../data/test.csv")
df_result=pd.read_csv("../data/sample_submission.csv")

In [385]:
X=df.iloc[:,:-1]
Y=df.iloc[:,-1]

In [386]:
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.30,random_state=42)

In [387]:
xtrain

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
135,136,20,RL,80.0,10400,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,5,2008,WD,Normal
1452,1453,180,RM,35.0,3675,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2006,WD,Normal
762,763,60,FV,72.0,8640,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2010,Con,Normal
932,933,20,RL,84.0,11670,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,3,2007,WD,Normal
435,436,60,RL,43.0,10667,Pave,,IR2,Lvl,AllPub,...,0,0,,,,0,4,2009,ConLw,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,1096,20,RL,78.0,9317,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,3,2007,WD,Normal
1130,1131,50,RL,65.0,7804,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,12,2009,WD,Normal
1294,1295,20,RL,60.0,8172,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Normal
860,861,50,RL,55.0,7642,Pave,,Reg,Lvl,AllPub,...,0,0,,GdPrv,,0,6,2007,WD,Normal


## Divide Features By Type

In [388]:
def divide_by_type(df):  
    categorical_features=[features for features in df.columns if df[features].dtype=="O"]
    numerical_features=[features for features in df.columns if df[features].dtype!="O"]
    date_features=[features for features in df.columns if "Yr" in features
                   or "Year" in features 
                   or "Mo" in features]
    features=[]
    for feature in numerical_features:
        if feature not in date_features:
            features.append(feature)
    numerical_features=features
    return categorical_features,numerical_features,date_features

categorical_features,numerical_features,date_features=divide_by_type(xtrain)

## Divide Ordinal Features

In [389]:
def divide_ordinal_features(df):    
    ordinal_numerical_features=df[numerical_features].max()[df[numerical_features].max() <= 14].index.tolist()
    ordinal_features=[features for features in df.columns if re.search('Qu$',features)
                      or re.search('QC',features)
                      or re.search('Qual$',features)
                      or re.search('Cond$',features)]
    
    ordinal_categorical_features=[features for features in ordinal_features if df[features].dtype=="O"]
    return ordinal_features, ordinal_numerical_features, ordinal_categorical_features

ordinal_features, ordinal_numerical_features, ordinal_categorical_features=divide_ordinal_features(df)

In [390]:
def update_categorical_and_numerical_features(numerical_features,
                                              categorical_features,
                                              features_to_remove):
    
    update_numerical = []
    for feature in numerical_features:
        if feature not in (features_to_remove):
            update_numerical.append(feature)

    update_categorical = []
    for feature in categorical_features:
        if feature not in features_to_remove:
            update_categorical.append(feature)
    
    update_numerical.remove("MSSubClass")
    update_categorical.append("MSSubClass")
    return update_numerical,update_categorical


features_to_remove = ordinal_categorical_features+ordinal_numerical_features
numerical_features,categorical_features = update_categorical_and_numerical_features(numerical_features,
                                                                                    categorical_features,
                                                                                   features_to_remove)

## Preprocessing Numerical Features

In [391]:
def fill_numerical_missing_values(df,numerical_features):
    df_numerical=df[numerical_features ].fillna(0)
    return df_numerical

In [392]:
def fit_scaler_min_max(df,numerical_features):
    scaler=MinMaxScaler()
    scaler.fit(df[numerical_features])
    return scaler

In [393]:
with open('../models/MinMax_Numerical_scaler.pickle', mode='ab') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(fit_scaler_min_max(df,numerical_features), f, protocol=pickle.HIGHEST_PROTOCOL)

In [394]:
def transform_scaler_min_max(df,scaler):
    df[numerical_features] = scaler.transform(df[numerical_features])
    return df

In [395]:
def pipeline_train_numerical_features(df,numerical_features):
    
    df[numerical_features]=fill_numerical_missing_values(df,numerical_features)
    scaler=fit_scaler_min_max(df,numerical_features)
    df=transform_scaler_min_max(df,scaler)
    return df
    
xtrain=pipeline_train_numerical_features(xtrain,numerical_features)

## Preprocessing Categorical Features

In [396]:
def fill_missing_categorical_values(df,categorical_features):
    
    for feature in categorical_features:
        if df[feature].isnull().sum()==1: 
            df[feature]=df[feature].fillna(df[feature].mode())
        else:
            df[feature]=df[feature].fillna("Missing")
    return df[categorical_features]

In [397]:
def fit_one_hot_encoding(df,categorical_feature):
    
    enc=OneHotEncoder(handle_unknown="ignore",sparse=False)

    enc.fit(df[categorical_features])
    return enc

In [398]:
with open('../models/One_Hot_Encoder.pickle', mode='ab') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(fit_one_hot_encoding(df,categorical_features), f, protocol=pickle.HIGHEST_PROTOCOL)

In [399]:
def transform_one_hot(df,enc):
    values=enc.transform(df[categorical_features])
    names=enc.get_feature_names_out(df[categorical_features].columns)
    df1=pd.DataFrame(columns=names)
    df=pd.concat([df,df1],axis=1)
    df[names]=values
    df=df.drop(categorical_features,axis=1)
    return df

In [400]:
def pipeline_categorical_feature(df,categorical_features):
    
    df[categorical_features]=fill_missing_categorical_values(df,categorical_features)
    enc=fit_one_hot_encoding(df,categorical_features)
    df=transform_one_hot(df,enc)
    return df
    
xtrain=pipeline_categorical_feature(xtrain,categorical_features)

## Preprocessing Ordinal Numerical Features

In [401]:
def fill_missing_ordinal_numericals_values(df,ordinal_numerical_features):
    if np.sum(df[ordinal_numerical_features].isnull().sum() > 0):
        df[ordinal_numerical_features]=df[ordinal_numerical_features].fillna(0)
    return df[ordinal_numerical_features]
    
    

In [402]:
def fit_scaler_ordinal_numerical(df,ordinal_numerical_features):
    scaler=MinMaxScaler()
    scaler.fit(df[ordinal_numerical_features])
    return scaler


In [403]:
with open('../models/MinMax_Ordinal_Numerical_scaler.pickle', mode='ab') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(fit_scaler_min_max(df,ordinal_numerical_features), f, protocol=pickle.HIGHEST_PROTOCOL)

In [404]:
def transform_ordina_numerical_features(df,scaler):
    df[ordinal_numerical_features]=scaler.transform(df[ordinal_numerical_features])
    return df

In [405]:
def pipeline_train_ordinal_numerical_features(df,ordinal_numerical_features):
    
    df[ordinal_numerical_features]=fill_missing_ordinal_numericals_values(df,ordinal_numerical_features)
    scaler=fit_scaler_ordinal_numerical(df,ordinal_numerical_features)
    df=transform_ordina_numerical_features(df,scaler)
    return df
    
xtrain=pipeline_train_ordinal_numerical_features(xtrain,ordinal_numerical_features)

## Preprocessing Ordinal Categorical Features

In [406]:
def fill_missing_ordinal_categorical_values(df,ordinal_categorical_features):
    
    for feature in ordinal_categorical_features:
        if df[feature].isnull().sum()==1: 
            df[feature]=df[feature].fillna(df[feature].mode())
        else:
            df[feature]=df[feature].fillna("Missing")
    return df[ordinal_categorical_features]

In [407]:
def fit_ordinal_categorical(df,ordinal_categorical_features):
    enc=OrdinalEncoder(
                       handle_unknown="use_encoded_value",
                       unknown_value=6
    )
    enc.fit(df[ordinal_categorical_features])
    return enc

In [408]:
with open('../models/Ordinal_Encoder.pickle', mode='ab') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(fit_ordinal_categorical(df,ordinal_categorical_features), f, protocol=pickle.HIGHEST_PROTOCOL)

In [409]:
def transform_ordinal_categorical_features(df,enc):
    df[ordinal_categorical_features]=enc.transform(df[ordinal_categorical_features])
    return df

In [410]:
def fit_scaler_ordinal_categorical(df,ordinal_categorical_features):
    scaler=MinMaxScaler()
    scaler.fit(df[ordinal_categorical_features])
    return scaler

In [411]:
with open('../models/MinMax_Ordinal_Categorical_scaler.pickle', mode='ab') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(fit_ordinal_categorical(df,ordinal_categorical_features), f, protocol=pickle.HIGHEST_PROTOCOL)

In [412]:
def transform_scaler_ordinal_categorical(df,scaler):
    df[ordinal_categorical_features]=scaler.transform(df[ordinal_categorical_features])
    return df

In [413]:
def pipeline_train_ordinal_categorical_features(df,ordinal_categorical_features):
    
    df[ordinal_categorical_features]=fill_missing_ordinal_categorical_values(df
                                                                             ,ordinal_categorical_features)
    enc=fit_ordinal_categorical(df,ordinal_categorical_features)
    df=transform_ordinal_categorical_features(df,enc)
    scaler=scale_ordinal_categorical(df,ordinal_categorical_features)
    df=transform_scaler_ordinal_categorical(df,scaler)
    return df
    
xtrain=pipeline_train_ordinal_categorical_features(xtrain,ordinal_categorical_features)

In [414]:
xtrain

Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190
135,0.092529,0.255591,0.042534,0.666667,0.625,1970,1970,0.208999,1.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1452,0.995202,0.111821,0.011101,0.444444,0.500,2005,2005,0.058055,1.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
762,0.522276,0.230032,0.034308,0.666667,0.500,2009,2009,0.000000,1.000000,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
932,0.638794,0.268371,0.048470,0.888889,0.500,2006,2006,0.219158,0.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
435,0.298149,0.137380,0.043782,0.666667,0.625,1996,1996,0.000000,0.666667,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0.750514,0.249201,0.037472,0.555556,0.500,2006,2006,0.000000,0.666667,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1130,0.774503,0.207668,0.030400,0.333333,0.250,1928,1950,0.000000,1.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1294,0.886909,0.191693,0.032120,0.444444,0.750,1955,1990,0.000000,1.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.589445,0.175719,0.029643,0.666667,0.875,1918,1998,0.000000,0.666667,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
