#### 1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Modelling
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

#### Import the CSV Data as Pandas DataFrame

In [2]:
# Load data
df = pd.read_csv('C:/Users/Prujith/Documents/AI/home-data-for-ml-course/train.csv')
# X = df.copy()
# y = X.pop('SalePrice')

#### Drop ID

In [3]:
df.drop("Id", axis = 1, inplace = True)

#### Shape of the dataset

In [4]:
print("df : " + str(df.shape))

df : (1460, 80)


#### 2. Seperate Numeric & Categorical features, Temporal (Dates)

In [5]:
# Separate temporal features
feature_with_year = []
for feature in df.columns:
    if "Yr" in feature or "Year" in feature:
        feature_with_year.append(feature)


In [6]:
# Separate numerical (Discrete, Continuous) and categorial features
categorical_features = []
numerical_features = []

for feature in df.columns:
    if feature not in feature_with_year:
        if df[feature].dtypes == "O":
            categorical_features.append(feature)
        else:
            numerical_features.append(feature)
            
print(len(feature_with_year), 'Temporal Features ', feature_with_year)
print("\n\n", len(numerical_features), "Numerical Features ", numerical_features)
print("\n\n",len(categorical_features),"Categorical Features ", categorical_features)

4 Temporal Features  ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']


 33 Numerical Features  ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'SalePrice']


 43 Categorical Features  ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQ

#### 2.1 Some numerical features (maybe discrete) are categories - 'int' to 'string'

In [7]:
df2 = df.copy()

In [8]:
# Some numerical features are actually really categories
df2 = df.replace({"MSSubClass" : {20 : "SC20", 30 : "SC30", 40 : "SC40", 45 : "SC45", 50 : "SC50", 60 : "SC60", 70 : "SC70", 75 : "SC75", 80 : "SC80", 85 : "SC85", 90 : "SC90", 120 : "SC120", 150 : "SC150", 160 : "SC160", 180: "SC180", 190 : "SC190"},
                    "MoSold" : {1 : "Jan", 2 : "Feb", 3 : "Mar", 4 :"Apr", 5 : "May", 6 : "Jun", 7 : "Jul", 8 : "Aug", 9 : "Sep", 10 :"Oct", 11 : "Nov", 12 : "Dec"}
                      })

# Removing from Numeriacal & Discrete features
numerical_features.remove("MSSubClass")
numerical_features.remove("MoSold")

# Appending to Categorical Features
categorical_features.append("MSSubClass")
categorical_features.append("MoSold")

#### 2.2  Separate numericals further (Discrete, Continuous)

In [9]:
# 1. Separate numerical (Discrete, Continuous) based on type
int_dis_ft = []
continuous_features = []
for feature in numerical_features:
    if df[feature].dtype in ['int64'] and feature not in feature_with_year:
        int_dis_ft.append(feature)
    elif df[feature].dtype in ['float64'] and feature not in feature_with_year:
        continuous_features.append(feature)
            

In [10]:
print(len(int_dis_ft), 'Disct Features ', int_dis_ft)
print("\n\n", len(continuous_features), "Cont Features ", continuous_features)

29 Disct Features  ['LotArea', 'OverallQual', 'OverallCond', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice']


 2 Cont Features  ['LotFrontage', 'MasVnrArea']


In [11]:
# 2. Finally further seperating Continuous from integer Discrete
discrete_features = []
continuous_features = []

for feature in int_dis_ft:
    if len(df[feature].unique()) <= 20 and feature not in feature_with_year:
        discrete_features.append(feature)
    else:
        continuous_features.append(feature)
            
print(len(discrete_features), "Discrete Features ", discrete_features)
print("\n\n",len(continuous_features), "Continuous Features ", continuous_features)

13 Discrete Features  ['OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', '3SsnPorch', 'PoolArea']


 16 Continuous Features  ['LotArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', 'MiscVal', 'SalePrice']


### 3. Temporal features

#### Handle Temporal Features so that it will belong to Numeric or Categorical Features

In [12]:
# For temporal variables, we will change years to "age" meaning how many years old.
def handle_temporal_variable(X):
    for feature in feature_with_year:
        if feature != "YrSold":
            X[feature] = X["YrSold"] - X[feature]
    return X

# X_train = handle_temporal_variable(X_train) # Handled Temporal Variables

In [13]:
df3 = df2.copy()

In [14]:
df3 = handle_temporal_variable(df3)

## Handle Missing Values: Part 1 - Description based filling Missing Values

In [15]:
# a)
# Numerical Features with description mentioned NA's
num_with_Desc_Na = ["BedroomAbvGr","BsmtFullBath","BsmtHalfBath","BsmtUnfSF","EnclosedPorch","Fireplaces","GarageArea","GarageCars","HalfBath","KitchenAbvGr","LotFrontage","MasVnrArea","MiscVal","OpenPorchSF","PoolArea","ScreenPorch","TotRmsAbvGrd","WoodDeckSF"]
# b)
# Similarly categorical Features with description mentioned NA's
cat_with_Desc_Na = ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Fence','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','MiscFeature','PoolQC']
# b.1) "Alley" - "None", "Functional" - "Typ"
# c) Ordinals from
# c.1) cat_with_Desc_Na = ["BsmtExposure","BsmtFinType1","BsmtFinType2","BsmtCond","BsmtQual","FireplaceQu","GarageCond","GarageQual","PoolQC"]
# c.2) excluding cat_with_Desc_Na ["Alley","ExterCond","ExterQual","HeatingQC","KitchenQual","Functional","LandSlope","LotShape","Street","Utilities","PavedDrive"]

# c)
Ordinals = ["BsmtExposure","BsmtFinType1","BsmtFinType2","BsmtCond","BsmtQual","FireplaceQu","GarageCond","GarageQual","PoolQC","Alley","ExterCond","ExterQual","HeatingQC","KitchenQual","Functional","LandSlope","LotShape","Street","Utilities","PavedDrive"]


#### Numerical Features - Filling zero's because description says so (Explicitly or Implicitly)

In [16]:
df4 = df3.copy()

In [17]:
df4[num_with_Desc_Na].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4[num_with_Desc_Na].fillna(0, inplace=True)


In [18]:
df5 = df4.copy()

In [19]:
# Categorical
# Replace "Na" to "No","None",etc of Desc_for_Na because description says so (Explicitly or Implicitly)

df5[cat_with_Desc_Na].fillna("No", inplace=True)

# Alley : data description says NA means "no alley access"
df5.loc[:, "Alley"] = df5.loc[:, "Alley"].fillna("None")

# Functional : data description says NA means typical
df5.loc[:, "Functional"] = df5.loc[:, "Functional"].fillna("Typ")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5[cat_with_Desc_Na].fillna("No", inplace=True)


## Encoding Ordinal Features

In [20]:
df6 = df5.copy()

In [21]:
df6 = df6.replace({"Alley" : {"None" : 0, "Grvl" : 1, "Pave" : 2},
                   "ExterCond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                   "ExterQual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                   "HeatingQC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                 "KitchenQual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                  "Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5, "Min2" : 6, "Min1" : 7, "Typ" : 8},
                   "LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
                    "LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
                      "Street" : {"Grvl" : 1, "Pave" : 2},
                   "Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4},

                "BsmtExposure" : {"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3},
                "BsmtFinType1" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, "ALQ" : 5, "GLQ" : 6},
                "BsmtFinType2" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, "ALQ" : 5, "GLQ" : 6},
                    "BsmtCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                    "BsmtQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5},
                 "FireplaceQu" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                  "GarageCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                  "GarageQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                  "PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
                      "PoolQC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4}}
                    )

# Outliers

In [22]:
def handle_outliers(X):
    for feature in X.columns:
        if X[feature].dtypes != "O":
            q1 = np.percentile(X[feature], 25, interpolation='midpoint')
            median = np.percentile(X[feature], 50, interpolation='midpoint')
            q3 = np.percentile(X[feature], 75, interpolation='midpoint')
            iqr = q3 - q1
           
            upper_limit = (q3 + 1.5*iqr)
            lower_limit = (q1 - 1.5*iqr)
           
            X[feature] = np.where(X[feature] > upper_limit, median, X[feature])
            X[feature] = np.where(X[feature] < lower_limit, median, X[feature])
       
    return X

In [23]:
df7 = df6.copy()

In [24]:
df7 = handle_outliers(df7)

Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  q1 = np.percentile(X[feature], 25, interpolation='midpoint')
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  median = np.percentile(X[feature], 50, interpolation='midpoint')
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  q3 = np.percentile(X[feature], 75, interpolation='midpoint')


## Handle Missing Values: Part 2

### Seperate Features

In [25]:
# Again Separate numerical and categorial features to do round 2 of handling missing Values

In [26]:
# Separate numerical and categorial features
new_categorical_features = []
new_numerical_features = []

# Differentiate numerical features (minus the target) and categorical features
new_categorical_features = df7.select_dtypes(include = ["object"]).columns
new_numerical_features = df7.select_dtypes(exclude = ["object"]).columns
# Remove SalesPrice
new_numerical_features = new_numerical_features.drop("SalePrice")

print("Numerical features : " + str(len(new_numerical_features)))
print("Categorical features : " + str(len(new_categorical_features)))

train_num = df7[new_numerical_features]
train_cat = df7[new_categorical_features]
            
# print(len(feature_with_year), 'Temporal Features ', feature_with_year)
# print("\n\n", len(new_numerical_features), "Numerical Features ", new_numerical_features)
# print("\n\n",len(new_categorical_features),"Categorical Features ", new_categorical_features)

Numerical features : 54
Categorical features : 25


### Numerical Features

In [27]:
# 5.2 Impute by using median as replacement
print("NAs for numerical features in train : " + str(train_num.isnull().values.sum()))
train_num = train_num.fillna(train_num.median())
print("Remaining NAs for numerical features in train : " + str(train_num.isnull().values.sum()))

NAs for numerical features in train : 2840
Remaining NAs for numerical features in train : 0


### Nominal Features - One Hot Encoding

In [28]:
# Create dummy features for categorical values via one-hot encoding
print("NAs for categorical features in train : " + str(train_cat.isnull().values.sum()))
train_cat = pd.get_dummies(train_cat)
print("Remaining NAs for categorical features in train : " + str(train_cat.isnull().values.sum()))

NAs for categorical features in train : 2756
Remaining NAs for categorical features in train : 0


In [29]:
# Join categorical and numerical features
train = pd.concat([train_num, train_cat], axis = 1)
print("New number of features : " + str(train.shape[1]))

New number of features : 250


# Modeling

#### Preparing X and Y variables

In [30]:
X = train.copy()
y = df.SalePrice

In [31]:
X.shape

(1460, 250)

In [32]:
# Partition the dataset in train + validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
print("X_train : " + str(X_train.shape))
print("X_test : " + str(X_test.shape))
print("y_train : " + str(y_train.shape))
print("y_test : " + str(y_test.shape))

X_train : (1022, 250)
X_test : (438, 250)
y_train : (1022,)
y_test : (438,)


#### Create an Evaluate Function to give all metrics after model Training

In [33]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [34]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 22831.7783
- Mean Absolute Error: 15758.0130
- R2 Score: 0.9145
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 219771774.5165
- Mean Absolute Error: 26709234.3484
- R2 Score: -7114403.7380




  model = cd_fast.enet_coordinate_descent(


Lasso
Model performance for Training set
- Root Mean Squared Error: 22835.1004
- Mean Absolute Error: 15781.8630
- R2 Score: 0.9145
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 41342.4499
- Mean Absolute Error: 23000.0405
- R2 Score: 0.7482


Ridge
Model performance for Training set
- Root Mean Squared Error: 23355.0703
- Mean Absolute Error: 16024.6690
- R2 Score: 0.9106
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 38508.2651
- Mean Absolute Error: 21965.4246
- R2 Score: 0.7816


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 39530.7316
- Mean Absolute Error: 25014.0149
- R2 Score: 0.7438
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 54849.7996
- Mean Absolute Error: 30580.4064
- R2 Score: 0.5569


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.000

### Results

In [35]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
5,Random Forest Regressor,0.8295005
6,AdaBoost Regressor,0.7877688
2,Ridge,0.7815748
1,Lasso,0.7482398
4,Decision Tree,0.6207815
3,K-Neighbors Regressor,0.556856
0,Linear Regression,-7114404.0


## Random Forest Regressor

In [36]:
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X_train, y_train)
y_pred = forest_model.predict(X_test)
score = r2_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

 Accuracy of the model is 83.28
