In [36]:

import re
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_log_error
pd.options.mode.chained_assignment = None


def split_train_test(df):
    """Split a frame into train/test features and targets.

    Every column except the last is used as a feature and the last column
    is used as the target. 30% of the rows go to the test split and the
    split is seeded with ``random_state=42``.

    Returns:
        A ``(xtrain, xtest, ytrain, ytest)`` tuple.
    """
    predictors = df.iloc[:, :-1]
    target = df.iloc[:, -1]
    parts = train_test_split(predictors,
                             target,
                             test_size=0.30,
                             random_state=42)
    return tuple(parts)


def feature_selection(df):
    """Return ``df`` restricted to the hand-picked modelling columns.

    The columns come back in the fixed order listed below, regardless of
    their order in ``df``.
    """
    keep = [
        "Neighborhood", "LotArea", "Utilities", "OverallQual", "YearBuilt",
        "GrLivArea", "ExterCond", "1stFlrSF", "TotRmsAbvGrd", "KitchenQual",
    ]
    return df[keep]


def divide_by_type(df):
    """Partition the column names of ``df`` into three groups.

    Returns:
        ``(cat_features, num_features, date_features)`` where

        * ``cat_features`` are the columns whose dtype compares equal to ``"O"``,
        * ``date_features`` are the columns whose *name* contains ``"Yr"``,
          ``"Year"`` or ``"Mo"`` (dtype is not considered),
        * ``num_features`` are the remaining non-object columns that are not
          in ``date_features``.

        Each list keeps the column order of ``df``.
    """
    date_markers = ("Yr", "Year", "Mo")

    cat_features = []
    non_object = []
    date_features = []
    for column in df.columns:
        if df[column].dtype == "O":
            cat_features.append(column)
        else:
            non_object.append(column)
        # Date detection is purely name-based, so an object column can be
        # both categorical and a date feature.
        if any(marker in column for marker in date_markers):
            date_features.append(column)

    num_features = [column for column in non_object
                    if column not in date_features]
    return cat_features, num_features, date_features


def divide_ord_features(df, cat_features, num_features, max_ordinal_value=15):
    """Identify the ordinal columns of ``df``.

    Args:
        df: Frame whose columns are inspected.
        cat_features: Accepted for signature compatibility; not used here.
        num_features: Numerical column labels to test against
            ``max_ordinal_value``.
        max_ordinal_value: A numerical column is treated as ordinal when its
            maximum is less than or equal to this value. Defaults to 15.

    Returns:
        ``(ordinal_features, ord_num_features, ord_cat_features)`` where

        * ``ordinal_features`` are the columns whose name ends in ``Qu``,
          ``Qual`` or ``Cond``, or contains ``QC``,
        * ``ord_num_features`` are the ``num_features`` whose maximum does
          not exceed ``max_ordinal_value``,
        * ``ord_cat_features`` are the ``ordinal_features`` whose dtype
          compares equal to ``"O"``.
    """
    column_max = df[num_features].max()
    ord_num_features = column_max[column_max <= max_ordinal_value].index.tolist()

    # Single alternation equivalent to searching for each suffix in turn.
    quality_name = re.compile(r'Qu$|QC|Qual$|Cond$')
    ordinal_features = [column for column in df.columns
                        if quality_name.search(column)]

    ord_cat_features = [column for column in ordinal_features
                        if df[column].dtype == "O"]
    return (ordinal_features,
            ord_num_features,
            ord_cat_features)


def update_cat_and_num_features(num_features,
                                cat_features,
                                features_to_rmv):
    """Drop ``features_to_rmv`` from the numerical and categorical lists.

    Returns:
        ``(update_numerical, update_categorical)``: new lists with the
        original ordering, minus every entry found in ``features_to_rmv``.
    """
    def keep(candidates):
        return [name for name in candidates if name not in features_to_rmv]

    return keep(num_features), keep(cat_features)


def fill_numerical_missing_values(df, num_features):
    """Return the ``num_features`` columns of ``df`` with missing values set to 0."""
    numeric_part = df[num_features]
    return numeric_part.fillna(0)


def fit_scaler_min_max(df, num_features):
    """Fit a ``MinMaxScaler`` on the numerical columns and persist it.

    Args:
        df: Frame containing the columns listed in ``num_features``.
        num_features: Column labels the scaler is fitted on.

    Returns:
        The fitted scaler. A pickled copy is also written to
        ``../models/MinMax_Numerical_scaler.pickle``.
    """
    scaler = MinMaxScaler()
    scaler.fit(df[num_features])
    # The context manager closes the file even if pickling raises.
    with open('../models/MinMax_Numerical_scaler.pickle', 'wb') as handle:
        pickle.dump(scaler, handle)
    return scaler


def transform_scaler_min_max(df, num_features):
    """Scale the numerical columns with the previously pickled scaler.

    Args:
        df: Frame to transform; its ``num_features`` columns are overwritten
            in place.
        num_features: Column labels passed to the scaler.

    Returns:
        The same ``df`` object with the scaled columns.
    """
    # SECURITY: unpickling executes code embedded in the file; only load
    # artefacts written by this pipeline.
    with open('../models/MinMax_Numerical_scaler.pickle', 'rb') as handle:
        scaler = pickle.load(handle)
    df[num_features] = scaler.transform(df[num_features])

    return df


def fill_missing_categorical_values(df, cat_features):
    """Fill missing values in the categorical columns.

    A column with exactly one missing value gets that value replaced by the
    column's most frequent value; any other column has its missing values
    replaced by the literal ``"Missing"``.

    Args:
        df: Frame to fill; the listed columns are overwritten in place.
        cat_features: Categorical column labels.

    Returns:
        ``df[cat_features]`` after filling.
    """
    for feature in cat_features:
        column = df[feature]
        if column.isnull().sum() == 1:
            # ``mode()`` returns a Series; passing it to ``fillna`` would
            # align on index labels and usually fill nothing, so take the
            # first mode as a scalar instead.
            modes = column.mode()
            fill_value = modes.iloc[0] if not modes.empty else "Missing"
        else:
            fill_value = "Missing"
        df[feature] = column.fillna(fill_value)
    return df[cat_features]


def fit_one_hot_encoding(df, cat_features):
    """Fit a dense ``OneHotEncoder`` on the categorical columns and persist it.

    Categories not seen during fitting are ignored at transform time
    (``handle_unknown="ignore"``).

    Args:
        df: Frame containing the columns listed in ``cat_features``.
        cat_features: Column labels the encoder is fitted on.

    Returns:
        The fitted encoder. A pickled copy is also written to
        ``../models/One_Hot_Encoder.pickle``.
    """
    try:
        # Newer scikit-learn releases renamed ``sparse`` to ``sparse_output``.
        enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Older releases only know the original keyword.
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    enc.fit(df[cat_features])
    # The context manager closes the file even if pickling raises.
    with open('../models/One_Hot_Encoder.pickle', 'wb') as handle:
        pickle.dump(enc, handle)

    return enc


def transform_one_hot(df, cat_features):
    """Replace the categorical columns with their one-hot encoding.

    The encoder is loaded from ``../models/One_Hot_Encoder.pickle``.

    Args:
        df: Frame containing the columns listed in ``cat_features``; it is
            not modified.
        cat_features: Column labels to encode and then drop.

    Returns:
        A new frame holding the remaining columns of ``df`` followed by the
        encoded columns, on the same index as ``df``.
    """
    # SECURITY: unpickling executes code embedded in the file; only load
    # artefacts written by this pipeline.
    with open('../models/One_Hot_Encoder.pickle', 'rb') as handle:
        enc = pickle.load(handle)
    values = enc.transform(df[cat_features])
    names = enc.get_feature_names_out(df[cat_features].columns)
    # Build the encoded block directly on df's index instead of concatenating
    # an empty placeholder frame and overwriting it afterwards.
    encoded = pd.DataFrame(values, columns=names, index=df.index)
    return pd.concat([df.drop(cat_features, axis=1), encoded], axis=1)


def fill_missing_ord_num_values(df, ord_num_features):
    """Return the ordinal-numerical columns with missing values set to 0.

    The fill is only performed when at least one of the selected columns
    actually contains a missing value.
    """
    selected = df[ord_num_features]
    has_missing = selected.isnull().any().any()
    return selected.fillna(0) if has_missing else selected


def fit_scaler_ordinal_numerical(df, ord_num_features):
    """Fit a ``MinMaxScaler`` on the ordinal-numerical columns and persist it.

    Args:
        df: Frame containing the columns listed in ``ord_num_features``.
        ord_num_features: Column labels the scaler is fitted on.

    Returns:
        The fitted scaler. A pickled copy is also written to
        ``../models/MinMax_Ordinal_Numerical_scaler.pickle``.
    """
    file = '../models/MinMax_Ordinal_Numerical_scaler.pickle'
    scaler = MinMaxScaler()
    scaler.fit(df[ord_num_features])
    # The context manager closes the file even if pickling raises.
    with open(file, 'wb') as handle:
        pickle.dump(scaler, handle)
    return scaler


def transform_ordina_num_features(df, ord_num_features):
    """Scale the ordinal-numerical columns with the previously pickled scaler.

    Args:
        df: Frame to transform; its ``ord_num_features`` columns are
            overwritten in place.
        ord_num_features: Column labels passed to the scaler.

    Returns:
        The same ``df`` object with the scaled columns.
    """
    file = '../models/MinMax_Ordinal_Numerical_scaler.pickle'
    # SECURITY: unpickling executes code embedded in the file; only load
    # artefacts written by this pipeline.
    with open(file, mode='rb') as handle:
        scaler = pickle.load(handle)
    df[ord_num_features] = scaler.transform(df[ord_num_features])
    return df


def fill_missing_ord_cat_values(df, ord_cat_features):
    """Fill missing values in the ordinal-categorical columns.

    A column with exactly one missing value gets that value replaced by the
    column's most frequent value; any other column has its missing values
    replaced by the literal ``"Missing"``.

    Args:
        df: Frame to fill; the listed columns are overwritten in place.
        ord_cat_features: Ordinal-categorical column labels.

    Returns:
        ``df[ord_cat_features]`` after filling.
    """
    for feature in ord_cat_features:
        column = df[feature]
        if column.isnull().sum() == 1:
            # ``mode()`` returns a Series; passing it to ``fillna`` would
            # align on index labels and usually fill nothing, so take the
            # first mode as a scalar instead.
            modes = column.mode()
            fill_value = modes.iloc[0] if not modes.empty else "Missing"
        else:
            fill_value = "Missing"
        df[feature] = column.fillna(fill_value)
    return df[ord_cat_features]


def fit_ordinal_categorical(df, ord_cat_features):
    """Fit an ``OrdinalEncoder`` on the ordinal-categorical columns and persist it.

    Categories not seen during fitting are encoded as ``6`` at transform time.
    NOTE(review): scikit-learn requires ``unknown_value`` to differ from every
    fitted category code, so this presumably assumes fewer than seven
    categories per column; confirm against the data.

    Args:
        df: Frame containing the columns listed in ``ord_cat_features``.
        ord_cat_features: Column labels the encoder is fitted on.

    Returns:
        The fitted encoder. A pickled copy is also written to
        ``../models/Ordinal_Encoder.pickle``.
    """
    enc = OrdinalEncoder(
                       handle_unknown="use_encoded_value",
                       unknown_value=6
    )
    enc.fit(df[ord_cat_features])
    # The context manager closes the file even if pickling raises.
    with open('../models/Ordinal_Encoder.pickle', "wb") as handle:
        pickle.dump(enc, handle)
    return enc


def transform_ord_cat_features(df, ord_cat_features):
    """Encode the ordinal-categorical columns with the previously pickled encoder.

    Args:
        df: Frame to transform; its ``ord_cat_features`` columns are
            overwritten in place.
        ord_cat_features: Column labels passed to the encoder.

    Returns:
        The same ``df`` object with the encoded columns.
    """
    # SECURITY: unpickling executes code embedded in the file; only load
    # artefacts written by this pipeline.
    with open('../models/Ordinal_Encoder.pickle', "rb") as handle:
        enc = pickle.load(handle)
    df[ord_cat_features] = enc.transform(df[ord_cat_features])
    return df


def fit_scaler_ordinal_categorical(df, ord_cat_features):
    """Fit a ``MinMaxScaler`` on the ordinal-categorical columns and persist it.

    Args:
        df: Frame containing the columns listed in ``ord_cat_features``.
        ord_cat_features: Column labels the scaler is fitted on.

    Returns:
        The fitted scaler. A pickled copy is also written to
        ``../models/MinMax_Ordinal_Categorical_scaler.pickle``.
    """
    scaler = MinMaxScaler()
    file = '../models/MinMax_Ordinal_Categorical_scaler.pickle'
    scaler.fit(df[ord_cat_features])
    # The context manager closes the file even if pickling raises.
    with open(file, mode='wb') as handle:
        pickle.dump(scaler, handle)
    return scaler


def transform_scaler_ordinal_categorical(df, ord_cat_features):
    """Scale the ordinal-categorical columns with the previously pickled scaler.

    Args:
        df: Frame to transform; its ``ord_cat_features`` columns are
            overwritten in place.
        ord_cat_features: Column labels passed to the scaler.

    Returns:
        The same ``df`` object with the scaled columns.
    """
    file = '../models/MinMax_Ordinal_Categorical_scaler.pickle'
    # SECURITY: unpickling executes code embedded in the file; only load
    # artefacts written by this pipeline.
    with open(file, mode='rb') as handle:
        scaler = pickle.load(handle)
    df[ord_cat_features] = scaler.transform(df[ord_cat_features])
    return df


def fill_missing_dates_values(df, date_features):
    """Fill missing values in the date columns with each column's mode.

    Args:
        df: Frame to fill; the listed columns are overwritten in place.
        date_features: Date column labels.

    Returns:
        ``df[date_features]`` after filling. A column with no non-missing
        value has no mode and is left unchanged.
    """
    for features in date_features:
        modes = df[features].mode()
        if modes.empty:
            continue
        # ``mode()`` returns a Series; passing it to ``fillna`` would align on
        # index labels and usually fill nothing, so use the first mode as a
        # scalar instead.
        df[features] = df[features].fillna(modes.iloc[0])

    return df[date_features]


def fit_scaler_dates(df, date_features):
    """Fit a ``MinMaxScaler`` on the date columns and persist it.

    Args:
        df: Frame containing the columns listed in ``date_features``.
        date_features: Column labels the scaler is fitted on.

    Returns:
        The fitted scaler. A pickled copy is also written to
        ``../models/MinMax_dates_scaler.pickle``.
    """
    scaler = MinMaxScaler()
    file = '../models/MinMax_dates_scaler.pickle'
    scaler.fit(df[date_features])
    # The context manager closes the file even if pickling raises.
    with open(file, mode='wb') as handle:
        pickle.dump(scaler, handle)
    return scaler


def transform_dates(df, date_features):
    """Scale the date columns with the previously pickled scaler.

    Args:
        df: Frame to transform; its ``date_features`` columns are overwritten
            in place.
        date_features: Column labels passed to the scaler.

    Returns:
        The same ``df`` object with the scaled columns.
    """
    file = '../models/MinMax_dates_scaler.pickle'
    # SECURITY: unpickling executes code embedded in the file; only load
    # artefacts written by this pipeline.
    with open(file, mode='rb') as handle:
        scaler = pickle.load(handle)
    df[date_features] = scaler.transform(df[date_features])
    return df


def compute_rmsle(y_test, y_pred, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.
        precision: Number of decimal places kept in the result.
    """
    squared_log_error = mean_squared_log_error(y_test, y_pred)
    root = np.sqrt(squared_log_error)
    return round(root, precision)


def fill_missing_values(df,
                        num_features,
                        cat_features,
                        ord_num_features,
                        ord_cat_features,
                        date_features):
    """Fill missing values for every feature group, one group at a time.

    Groups are processed in the order numerical, categorical,
    ordinal-numerical, ordinal-categorical, dates; each group's filled
    columns are written back into ``df`` before the next group runs.

    Returns:
        The same ``df`` object with the filled columns.
    """
    fill_steps = (
        (num_features, fill_numerical_missing_values),
        (cat_features, fill_missing_categorical_values),
        (ord_num_features, fill_missing_ord_num_values),
        (ord_cat_features, fill_missing_ord_cat_values),
        (date_features, fill_missing_dates_values),
    )
    for columns, filler in fill_steps:
        df[columns] = filler(df, columns)
    return df


def fit_scaler_features(df,
                        num_features,
                        ord_num_features,
                        date_features,
                        ord_cat_features):
    """Fit one scaler per feature group.

    Scalers are fitted for the numerical, ordinal-numerical, date and
    ordinal-categorical groups, in that order. Nothing is returned; the
    fitted scalers are not kept by this function.
    """
    fit_steps = (
        (fit_scaler_min_max, num_features),
        (fit_scaler_ordinal_numerical, ord_num_features),
        (fit_scaler_dates, date_features),
        (fit_scaler_ordinal_categorical, ord_cat_features),
    )
    for fit_scaler, columns in fit_steps:
        fit_scaler(df, columns)


def fit_encode(df,
               cat_features,
               ord_cat_features):
    """Fit the one-hot encoder, then the ordinal encoder.

    Nothing is returned; the fitted encoders are not kept by this function.
    """
    fit_steps = (
        (fit_one_hot_encoding, cat_features),
        (fit_ordinal_categorical, ord_cat_features),
    )
    for fit_encoder, columns in fit_steps:
        fit_encoder(df, columns)


def transform_encode(df,
                     cat_features,
                     ord_cat_features):
    """Apply the one-hot transform, then the ordinal-categorical transform.

    Returns:
        The frame produced by the second transform.
    """
    one_hot_encoded = transform_one_hot(df, cat_features)
    return transform_ord_cat_features(one_hot_encoded, ord_cat_features)


def transform_scaler(df,
                     num_features,
                     ord_num_features,
                     date_features):
    """Scale the numerical, ordinal-numerical and date columns, in that order.

    Returns:
        The frame produced by the last scaling step.
    """
    scaling_steps = (
        (transform_scaler_min_max, num_features),
        (transform_ordina_num_features, ord_num_features),
        (transform_dates, date_features),
    )
    for apply_scaler, columns in scaling_steps:
        df = apply_scaler(df, columns)
    return df
    

def splitting_types(df):
    """Select the modelling columns and sort them into feature groups.

    Returns:
        ``(num_features, cat_features, ordinal_features, ord_num_features,
        ord_cat_features, date_features, df)`` where ``df`` is the
        feature-selected frame and the ordinal columns have been removed
        from the plain numerical and categorical lists.
    """
    selected = feature_selection(df)
    categorical, numerical, dates = divide_by_type(selected)
    ordinal, ordinal_numerical, ordinal_categorical = divide_ord_features(
        selected, categorical, numerical)

    # Ordinal columns get their own treatment, so take them out of the
    # plain numerical/categorical groups.
    ordinal_to_exclude = ordinal_categorical + ordinal_numerical
    numerical, categorical = update_cat_and_num_features(numerical,
                                                         categorical,
                                                         ordinal_to_exclude)
    return (numerical,
            categorical,
            ordinal,
            ordinal_numerical,
            ordinal_categorical,
            dates,
            selected)


def pipeline(df):
    """Run the preprocessing pipeline in training or inference mode.

    Training mode is selected when ``df`` has a ``"SalePrice"`` column: the
    frame is split, the encoders and scalers are fitted on the training
    features, and the training features are transformed. Otherwise ``df`` is
    only transformed, using the transform helpers (which load previously
    pickled encoders and scalers).

    Args:
        df: Input frame. NOTE(review): in training mode ``split_train_test``
            uses the LAST column as the target, so this presumably expects
            ``SalePrice`` to be the last column; confirm.

    Returns:
        Training mode: ``(xtrain, xtest, ytrain, ytest)``. Only ``xtrain`` is
        feature-selected and transformed; ``xtest`` is returned exactly as
        produced by the split.
        Inference mode: the transformed frame.
    """
    is_training = "SalePrice" in df.columns

    if is_training:
        features, xtest, ytrain, ytest = split_train_test(df)
    else:
        features = df

    # NOTE(review): the feature groups are re-derived from whichever frame is
    # being processed, so at inference time they depend on that frame's
    # dtypes and maxima rather than on what was seen when fitting.
    (num_features,
     cat_features,
     _ordinal_features,
     ord_num_features,
     ord_cat_features,
     date_features,
     features) = splitting_types(features)

    features = fill_missing_values(features,
                                   num_features,
                                   cat_features,
                                   ord_num_features,
                                   ord_cat_features,
                                   date_features)

    if is_training:
        fit_encode(features,
                   cat_features,
                   ord_cat_features)
    features = transform_encode(features,
                                cat_features,
                                ord_cat_features)

    if is_training:
        # Scalers are fitted after encoding so the ordinal-categorical scaler
        # sees the encoded numeric codes.
        fit_scaler_features(features,
                            num_features,
                            ord_num_features,
                            date_features,
                            ord_cat_features)
    features = transform_scaler(features,
                                num_features,
                                ord_num_features,
                                date_features)
    # Applied in BOTH modes so the ordinal-categorical columns are on the same
    # scale at inference time as during training.
    features = transform_scaler_ordinal_categorical(features, ord_cat_features)

    if is_training:
        return features, xtest, ytrain, ytest
    return features

In [37]:
# Run the preprocessing pipeline; the tuple displayed below is the
# training-mode result.
# NOTE(review): `df` is not defined in the cells visible here; confirm it is
# loaded earlier in the notebook so Restart & Run All succeeds.
pipeline(df)

(       LotArea  OverallQual  YearBuilt  GrLivArea  ExterCond  1stFlrSF  \
 135   0.042534     0.666667   0.710145   0.253956        1.0  0.309316   
 1452  0.011101     0.444444   0.963768   0.139035        1.0  0.169344   
 762   0.034308     0.666667   0.992754   0.228523        1.0  0.098669   
 932   0.048470     0.888889   0.971014   0.295968        1.0  0.360486   
 435   0.043782     0.666667   0.898551   0.250000        1.0  0.113125   
 ...        ...          ...        ...        ...        ...       ...   
 1095  0.037472     0.555556   0.971014   0.184627        1.0  0.224874   
 1130  0.030400     0.333333   0.405797   0.310286        1.0  0.228086   
 1294  0.032120     0.444444   0.601449   0.099849        1.0  0.121615   
 860   0.029643     0.666667   0.333333   0.205727        1.0  0.132630   
 1126  0.011143     0.666667   0.978261   0.230030        1.0  0.280174   
 
       TotRmsAbvGrd  KitchenQual  Neighborhood_Blmngtn  Neighborhood_Blueste  \
 135       0.41666