#### Table of contents
- Importing libraries
- Loading datasets
- Feature Engineering
- Cross validation
- Model training
- Model evaluation
- Summary

#### Importing libraries


In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import model_selection
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_absolute_error,r2_score
from scipy import stats
from sklearn.model_selection import StratifiedKFold
import warnings
from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor,RandomForestClassifier
from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb
warnings.filterwarnings('ignore')
from sklearn.decomposition import PCA
from scipy.stats import boxcox
import os
%matplotlib inline
import optuna
from functools import partial
import warnings
warnings.filterwarnings("ignore")
np.random.seed(365)

#### Loading datasets

In [50]:
#load datasets
data_path_train = r"C:\Users\User\Desktop\Blessing_AI\Free_AI_Classes_2023\Data\Housing_dataset_train.csv"
df_train = pd.read_csv(data_path_train)
data_path_test = r"C:\Users\User\Desktop\Blessing_AI\Free_AI_Classes_2023\Data\Housing_dataset_test.csv"
df_test = pd.read_csv(data_path_test)
test = df_test.copy()

In [51]:
df_train.shape

(14000, 7)

#### Feature Engineering

##### Treating missing values

In [52]:
df_train.isnull().sum()

ID                  0
loc              1813
title            1722
bedroom          1799
bathroom         1805
parking_space    1811
price               0
dtype: int64

In [53]:
df_train.dropna(subset=["loc","title","bedroom"],inplace=True)
df_train.shape

(8856, 7)

In [54]:
#Taking square root, cube root and logging output feature
df_train["price_log"] = np.log(df_train["price"] + 1)
df_train["price_sqrt"] = np.sqrt(df_train["price"])
df_train["price_cube"] = np.cbrt(df_train["price"])

In [55]:

        # Separate the dataset into features (X) and target (y)
def predict_nan_by_feature(df_train,x,y_):
            X = df_train[x]
            y = df_train[y_]
            
            # Identify rows with missing values (assuming NaN represents missing values)
            rows_with_missing = df_train.isnull().any(axis=1)
            
            # Create a copy of the dataset for imputation
            imputation_data = df_train.copy()
            
            # Separate the data into rows with missing values and rows without
            data_with_missing = imputation_data[rows_with_missing]
            data_without_missing = imputation_data[~rows_with_missing]
            
            # Separate the data without missing values into features and target
            X_train = np.array(data_without_missing[x]).reshape(-1,1)
            y_train = np.array(data_without_missing[y_]).reshape(-1,1)
            
            # Train a model (using RandomForestRegressor as an example)
            model = RandomForestRegressor()
            model.fit(X_train, y_train)
            
            # Drop the target column from the data with missing values
            data_with_missing = np.array(data_with_missing[x]).reshape(-1,1)
            
            # Predict the missing values using the trained model
            predicted_values = model.predict(data_with_missing)
            
            # Impute the predicted values back into the original dataset
            imputation_data.loc[rows_with_missing, y_] = predicted_values
            return imputation_data
        
    
#predict missing values in bedroom  by price
df_train = predict_nan_by_feature(df_train=df_train,x="price_log",y_="bedroom")
    
#predict missing values in bedroom  by price
df_train = predict_nan_by_feature(df_train=df_train,x="price_log",y_= "bathroom")

#predict missing values in bedroom  by price
df_train = predict_nan_by_feature(df_train=df_train,x="price_log",y_="parking_space")

In [56]:
df_train.isnull().sum()

ID               0
loc              0
title            0
bedroom          0
bathroom         0
parking_space    0
price            0
price_log        0
price_sqrt       0
price_cube       0
dtype: int64

In [57]:
df_train.shape

(8856, 10)

##### Filling missing values

In [37]:

def group_feature_by_feature_based_on_mode(by_feature,feature,df):
    modes_values = []
    titles = list(df[by_feature].unique())
    for title in titles:
        new_df = df[df[by_feature] == title]
        mode_value =  new_df[feature].mode()[0]
        modes_values.append(mode_value)
    mode_dict = dict(zip(titles, modes_values))
    print(mode_dict)

    return mode_dict
#Fill missing values in bathroon by mode value of house title
mode_values = group_feature_by_feature_based_on_mode(by_feature = "title",feature="bathroom",df=df_train)
#fill missing values by mode house title
def fill_missing_by_mode(cols,mode_dict=mode_values):
    col1 = cols[0]
    col2 = cols[1]
    if pd.isnull(col2):
        return mode_dict[col1]
    else:
        return col2


df_train["bathroom"] = df_train[["title","bathroom"]].apply(fill_missing_by_mode,axis = 1)

#Fill missing values in parking by mode value of house titl
mode_values = group_feature_by_feature_based_on_mode(by_feature = "title",feature="parking_space",df=df_train)
df_train["parking_space"] = df_train[["title","parking_space"]].apply(fill_missing_by_mode,axis = 1)

print(f"Total missing data in train data is {df_train.isnull().sum().sum()}")
print(f"Total missing data in test data  is {df_test.isnull().sum().sum()}")

{'Semi-detached duplex': 2.0, 'Apartment': 1.0, 'Detached duplex': 1.0, 'Terrace duplex': 1.0, 'Mansion': 1.0, 'Bungalow': 2.0, 'Penthouse': 1.0, 'Townhouse': 1.0, 'Flat': 2.0, 'Cottage': 1.0}
{'Semi-detached duplex': 2.0, 'Apartment': 4.0, 'Detached duplex': 2.0, 'Terrace duplex': 4.0, 'Mansion': 2.0, 'Bungalow': 4.0, 'Penthouse': 4.0, 'Townhouse': 1.0, 'Flat': 2.0, 'Cottage': 4.0}
Total missing data in train data is 0
Total missing data in test data  is 0


##### Creating new features

In [58]:
#create new feature to inducate geopolitical zone
geo_states = {"North_central":["Benue","Kogi", "Kwara", "Nasarawa", "Niger", "Plateau"],
"North_East":["Adamawa", "Bauchi", "Borno", "Gombe", "Taraba", "Yobe"],
"North_West":["Kaduna", "Katsina", "Kano", "Kebbi", "Sokoto", "Jigawa","Zamfara"],
"South_East":["Abia", "Anambra", "Ebonyi", "Enugu", "Imo"],
"South":["Akwa Ibom", "Bayelsa", "Cross River", "Delta", "Edo", "Rivers"],
"South_West":["Ekiti", "Lagos", "Osun", "Ondo", "Ogun", "Oyo"]}

def add_geo_zone(df_train):
        df_train["Geo_zone"] = df_train["loc"]
        df_train.loc[df_train["loc"].isin(geo_states["North_central"]),"Geo_zone"] = "North_central"
        df_train.loc[df_train["loc"].isin(geo_states["North_East"]),"Geo_zone"] = "North_East"
        df_train.loc[df_train["loc"].isin(geo_states["North_West"]),"Geo_zone"] = "North_West"
        df_train.loc[df_train["loc"].isin(geo_states["South_East"]),"Geo_zone"] = "South_East"
        df_train.loc[df_train["loc"].isin(geo_states["South"]),"Geo_zone"] = "South"
        df_train.loc[df_train["loc"].isin(geo_states["South_West"]),"Geo_zone"] = "South_West"
        return df_train
df_train = add_geo_zone(df_train = df_train)
df_test = add_geo_zone(df_train = df_test)

##### Encoding categorical features

In [59]:
#Taking square root, cube root and logging output feature
df_train["price_log"] = np.log(df_train["price"] + 1)
df_train["price_sqrt"] = np.sqrt(df_train["price"])
df_train["price_cube"] = np.cbrt(df_train["price"])

In [60]:
#Encode house location based mean houe price ranking
#avergae pricing based on location
location_ranks = list(df_train.groupby(["loc"])["price_log"].mean().sort_values(ascending=False).index)
location_ranks_vals = list(df_train.groupby(["loc"])["price_log"].mean().sort_values(ascending=False).values)
location_ranks_dict = {}
for i,j in zip(location_ranks,location_ranks_vals):
    location_ranks_dict[i] = j
print(location_ranks_dict)
# Use the map() function to encode the data
categories_train = df_train["loc"]
categories_test = df_test["loc"]
encoded_data_train = categories_train.map(location_ranks_dict)
encoded_data_test = categories_test.map(location_ranks_dict)
df_train["loc"] = encoded_data_train
df_test["loc"] = encoded_data_test

{'Lagos': 15.164201750440764, 'Bayelsa': 14.886269812374607, 'Rivers': 14.807672123027194, 'Akwa Ibom': 14.743598986159991, 'Delta': 14.7148367100054, 'Ogun': 14.688530952997048, 'Cross River': 14.66378009139607, 'Anambra': 14.586379546363371, 'Oyo': 14.583855080167194, 'Edo': 14.578549286833502, 'Enugu': 14.569031980098126, 'Ondo': 14.55682652657201, 'Osun': 14.51749451121393, 'Ekiti': 14.513425575326508, 'Kano': 14.481509449571682, 'Nasarawa': 14.456433601921779, 'Imo': 14.442751966809734, 'Katsina': 14.41534982377515, 'Plateau': 14.389817938226313, 'Benue': 14.385893546358306, 'Kwara': 14.379143188461923, 'Adamawa': 14.375183993010024, 'Taraba': 14.365381784406637, 'Niger': 14.365256701446995, 'Kaduna': 14.36002146533926, 'Gombe': 14.33417112850962, 'Kogi': 14.30886528345092, 'Bauchi': 14.283238132256606, 'Yobe': 14.28075491158785, 'Jigawa': 14.27540458344726, 'Borno': 14.272336809801947, 'Abia': 14.271971038099514, 'Zamfara': 14.268366027934789, 'Sokoto': 14.247858492020116, 'Ebony

In [61]:
#Encode house location based mean houe price ranking
#avergae pricing based on location
location_ranks = list(df_train.groupby(["title"])["price_log"].mean().sort_values(ascending=False).index)
location_ranks_vals = list(df_train.groupby(["title"])["price_log"].mean().sort_values(ascending=False).values)
location_ranks_dict = {}
for i,j in zip(location_ranks,location_ranks_vals):
    location_ranks_dict[i] = j
print(location_ranks_dict)
# Use the map() function to encode the data
categories_train = df_train["title"]
categories_test = df_test["title"]
encoded_data_train = categories_train.map(location_ranks_dict)
encoded_data_test = categories_test.map(location_ranks_dict)
df_train["title"] = encoded_data_train
df_test["title"] = encoded_data_test

{'Mansion': 15.038973416880724, 'Penthouse': 14.728088824321503, 'Detached duplex': 14.556294050503396, 'Townhouse': 14.46983941963301, 'Terrace duplex': 14.401258616672163, 'Semi-detached duplex': 14.39968496038179, 'Flat': 14.301541574931527, 'Bungalow': 14.299494067095424, 'Apartment': 14.207836267874152, 'Cottage': 14.010785242580935}


In [62]:
#Encode house geopolotical zone  based mean houe price ranking
#avergae pricing based on title
location_ranks = list(df_train.groupby(["Geo_zone"])["price_log"].mean().sort_values(ascending=False).index)
location_ranks_vals = list(df_train.groupby(["Geo_zone"])["price_log"].mean().sort_values(ascending=False).values)
location_ranks_dict = {}
for i in location_ranks:
    location_ranks_dict[i] = location_ranks.index(i) + 1
print(location_ranks_dict)
# Use the map() function to encode the data
categories_train = df_train["Geo_zone"]
categories_test = df_test["Geo_zone"]
encoded_data_train = categories_train.map(location_ranks_dict)
encoded_data_test = categories_test.map(location_ranks_dict)
df_train["Geo_zone"] = encoded_data_train
df_test["Geo_zone"] = encoded_data_test

{'South': 1, 'South_West': 2, 'South_East': 3, 'North_central': 4, 'North_West': 5, 'North_East': 6}


#### Adding more informative features

In [63]:
#Converting bedroom,bathroom and parking space to discrete variables
df_train[["bedroom","bathroom","parking_space"]] = df_train[["bedroom","bathroom","parking_space"]].astype(int)
df_test[["bedroom","bathroom","parking_space"]] = df_test[["bedroom","bathroom","parking_space"]].astype(int)

#Adding total number of bedrooms,bathrooms and parking space
df_train["bed_bath_paking"] =  df_train["bedroom"] + df_train["bathroom"] + df_train["parking_space"]
df_test["bed_bath_paking"] =  df_test["bedroom"] + df_test["bathroom"] + df_test["parking_space"]

#compare ratio of parking_space to bedroom
df_train["parking_bedroom_ratio"] =  df_train["parking_space"] /  df_train["bedroom"]
df_test["parking_bedroom_ratio"] =  df_test["parking_space"] / df_test["bedroom"]

#Adding ranking of loaction and house title as a feature
df_train["Rank_loc"] = df_train["loc"] + df_train["title"]
df_test["Rank_loc"] = df_test["loc"] +  df_test["title"]
df_train.head()

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,price,price_log,price_sqrt,price_cube,Geo_zone,bed_bath_paking,parking_bedroom_ratio,Rank_loc
0,3583,14.41535,14.399685,2,2,1,1149999.565,13.955273,1072.380327,104.768942,5,5,0.5,28.815035
3,2224,14.58638,14.556294,5,2,4,2410306.756,14.695265,1552.516266,134.077974,3,11,0.8,29.142674
7,3003,14.41535,14.728089,3,3,5,2043107.592,14.529983,1429.373147,126.890881,5,11,1.666667,29.143439
10,12573,14.688531,14.299494,1,2,6,1330213.036,14.10085,1153.348619,109.978316,2,9,6.0,28.988025
11,2624,14.88627,14.207836,3,4,2,1891772.069,14.453025,1375.417053,123.677188,1,9,0.666667,29.094106


#### Cross Validation

In [64]:
#Calculate the number of bins using the Sturges method
bins = int(np.ceil(np.log2(len(df_train)) + 1))
#Bin the data using the Sturges method
binned_data = pd.cut(df_train["price_log"], bins=bins,labels=False)
df_train["Bin_value"] = binned_data

In [65]:
#dictionary of models to be used
models = {"Linear_Regression":LinearRegression(),
          "light_gradient_boistingt":lgb.LGBMRegressor(random_state=0,verbose=0),"XGboost":xgb.XGBRegressor(random_state=0),
          "Catboost":CatBoostRegressor(random_state=0,silent=True),"Gradient_boosting":GradientBoostingRegressor(random_state=0)}

#Divid data into dependent and independent variables
X = df_train.drop(["Bin_value","price","price_log","ID","price_sqrt","price_cube"],axis=1)
scale_features = ["Rank_loc","parking_bedroom_ratio","bed_bath_paking"]
#use_cols = ['Rank_loc','title','bedroom','bed_bath_paking','loc','bathroom','parking_bedroom_ratio','Geo_zone']
X = df_train[["title","bedroom","bathroom","loc","bed_bath_paking","Rank_loc","parking_space","parking_bedroom_ratio"]]
cols = X.columns
y = df_train["Bin_value"]
target = "price_log"
#skf = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

#Custom cross validation function
def run(models):
    #create empty dic for model and scores
    scores = {}
    for model in models.keys():
        scores[model] = [] #create empty lsit to store model scores at on each fold
    for name,model in models.items():
        print(f"Running -- {name}")
        print("-------------------------")
        for i,(train_index, test_index) in enumerate(skf.split(X, y)):
            xtrain, xvalid = X.iloc[train_index], X.iloc[test_index]
            ytrain, yvalid = df_train[target].iloc[train_index], df_train[target].iloc[test_index]
            model.fit(xtrain, ytrain)
            yvalid = np.exp(yvalid) - 1
            #make predictions on validation data
            preds_valid =  np.exp(model.predict(xvalid)) - 1
            rmse = mean_squared_error(yvalid, preds_valid,squared=False)
            print(f"Fold {i} score : ", rmse)
            scores[name].append(rmse)
        print(f"{model} -- mean rmse {np.mean(scores[name])}")
        print()

    #take the mean of scores for every model
    for name, model in models.items():
        scores[name] = np.mean(np.array(scores[name]))
    model_names = scores.keys()
    model_scores = scores.values()
    results = {"Model":model_names,"Rmse_score":model_scores}
    results = pd.DataFrame.from_dict(results) #create dictionary of model and corresponding mean rmse score
    return results

In [66]:
results = run(models = models)

Running -- Linear_Regression
-------------------------
Fold 0 score :  551115.6516341345
Fold 1 score :  578676.3161392485
Fold 2 score :  600049.9988608193
Fold 3 score :  508258.79962035915
Fold 4 score :  417506.1104045649
Fold 5 score :  446529.47901438875
Fold 6 score :  435114.6874244511
Fold 7 score :  399868.53548630205
Fold 8 score :  415727.7871992823
Fold 9 score :  481124.5412731771
LinearRegression() -- mean rmse 483397.1907056727

Running -- light_gradient_boistingt
-------------------------
You can set `force_col_wise=true` to remove the overhead.
Fold 0 score :  462512.04573357117
You can set `force_col_wise=true` to remove the overhead.
Fold 1 score :  525906.6457154682
You can set `force_col_wise=true` to remove the overhead.
Fold 2 score :  497703.583229485
You can set `force_col_wise=true` to remove the overhead.
Fold 3 score :  481538.2731271902
You can set `force_col_wise=true` to remove the overhead.
Fold 4 score :  405025.41991009086
You can set `force_col_wise=

In [68]:
486000 - 471000

15000

In [69]:
sorted_df = results.sort_values(by="Rmse_score",ascending=True)
sorted_df.head()

Unnamed: 0,Model,Rmse_score
4,Gradient_boosting,443360.542112
3,Catboost,444046.078772
1,light_gradient_boistingt,451858.632717
2,XGboost,471739.32338
0,Linear_Regression,483397.190706


#### Training and predicting on test data

In [70]:
def train_predict(model):
    scores = []
    final_predictions = []
    for i,(train_index, test_index) in enumerate(skf.split(X, y)):
        xtrain, xvalid = X.iloc[train_index], X.iloc[test_index]
        ytrain, yvalid = df_train[target].iloc[train_index], df_train[target].iloc[test_index]
        xtest = df_test[cols].copy()

        #Fit model
        model.fit(xtrain, ytrain)
        yvalid = np.exp(yvalid) - 1
        #make predictions on validation data
        preds_valid =  np.exp(model.predict(xvalid)) - 1


        #make prediction on test data
        test_preds = np.exp(model.predict(xtest))  - 1
        final_predictions.append(test_preds)
        rmse = mean_squared_error(yvalid, preds_valid,squared=False)
        print(f"Fold {i} score : ", rmse)
        scores.append(rmse)

    print(f"Mean rmse :{np.array(scores).mean()}")
    return final_predictions
model_cbr = CatBoostRegressor(silent=True,random_state=0)

preds = train_predict(model_cbr);
preds_new= np.mean(np.column_stack(preds),axis=1)
sub = test[["ID"]]
sub["price_cat"] = preds_new

#Traing and predict with gradient boosting
gb_model_1 = GradientBoostingRegressor(random_state=0,criterion="squared_error")
preds_gb = train_predict(gb_model_1);
preds_gb= np.mean(np.column_stack(preds_gb),axis=1)
sub["price_gb"] = preds_gb

#Take mean of the two models
sub["price"] = (sub["price_cat"] + sub["price_gb"])/2
sub = sub.drop(["price_gb","price_cat"],axis=1)
sub.head()

Fold 0 score :  392911.0837119051
Fold 1 score :  529883.0477552931
Fold 2 score :  426515.6591020588
Fold 3 score :  482230.15651107416
Fold 4 score :  405426.83256739005
Fold 5 score :  479793.4863974008
Fold 6 score :  443361.8593024264
Fold 7 score :  383113.08637933346
Fold 8 score :  415784.2159047825
Fold 9 score :  481441.3600868243
Mean rmse :444046.0787718488
Fold 0 score :  442501.32767711993
Fold 1 score :  500329.26314067806
Fold 2 score :  490412.1791852337
Fold 3 score :  474967.72342950024
Fold 4 score :  398832.17576577055
Fold 5 score :  503091.85823464143
Fold 6 score :  429658.0681393302
Fold 7 score :  376487.58670485136
Fold 8 score :  397261.99780997296
Fold 9 score :  420063.2410313996
Mean rmse :443360.54211184976


Unnamed: 0,ID,price
0,845,2264470.0
1,1924,1081700.0
2,10718,1279374.0
3,12076,8748906.0
4,12254,1886399.0


In [71]:
#save submission file
path = r"C:\Users\User\Desktop\Blessing_AI\Free_AI_Classes_2023\Submissions"
os.chdir(path)
sub.to_csv("xgb_cat_mean.csv",index=False)