### Importing necessary libraries

In [447]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import model_selection
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error
from scipy import stats
from sklearn.model_selection import StratifiedKFold
import warnings
from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb
warnings.filterwarnings('ignore')
from sklearn.decomposition import PCA
from scipy.stats import boxcox
import os
%matplotlib inline
np.random.seed(365)

### Loading data 

In [448]:
# Load the train/test datasets.
# DATA_DIR defaults to the original author's local folder; set the
# HOUSING_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
DATA_DIR = os.environ.get(
    "HOUSING_DATA_DIR",
    r"C:\Users\User\Desktop\Blessing_AI\Free_AI_Classes_2023\Data",
)
data_path_train = os.path.join(DATA_DIR, "Housing_dataset_train.csv")
df_train = pd.read_csv(data_path_train)

data_path_test = os.path.join(DATA_DIR, "Housing_dataset_test.csv")
df_test = pd.read_csv(data_path_test)

In [449]:
# Keep an untouched copy of the test frame: "ID" is dropped from df_test before
# modelling, while the submission frames are built from test[["ID"]].
test = df_test.copy()

In [414]:
# Preview the first rows of the raw training data.
df_train.head()

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,price
0,3583,Katsina,Semi-detached duplex,2.0,2.0,1.0,1149999.565
1,2748,Ondo,Apartment,,2.0,4.0,1672416.689
2,9261,Ekiti,,7.0,5.0,,3364799.814
3,2224,Anambra,Detached duplex,5.0,2.0,4.0,2410306.756
4,10300,Kogi,Terrace duplex,,5.0,6.0,2600700.898


### Feature Engineering

#### Filling missing values

In [450]:
# Report the total number of missing cells (summed over all columns) in each
# frame before any imputation is applied.
print(f"Total missing data in train data is {df_train.isnull().sum().sum()}")
print(f"Total missing data in test data  is {df_test.isnull().sum().sum()}")

Total missing data in train data is 8950
Total missing data in test data  is 0


In [451]:
# Treat a missing location or house title as its own category ("Unknown"),
# in both the train and the test frame.
for frame in (df_train, df_test):
    for column in ("loc", "title"):
        frame[column] = frame[column].fillna("Unknown")

# Build a lookup of the most frequent `feature` value within each `by_feature` group.
def group_feature_by_feature_based_on_mode(by_feature,feature,df):
    """Map every distinct value of ``by_feature`` to the mode of ``feature``.

    Parameters
    ----------
    by_feature : str
        Name of the grouping column (e.g. "title").
    feature : str
        Name of the column whose most frequent value is wanted (e.g. "bedroom").
    df : pandas.DataFrame
        Frame containing both columns. It is not modified.

    Returns
    -------
    dict
        ``{group value: mode of feature within that group}``, in order of first
        appearance of the group values. When several values tie, the smallest
        one is used (``Series.mode`` returns them sorted). A group that has no
        non-null ``feature`` value at all falls back to the mode of the whole
        column, or NaN when the column itself has no non-null value.
    """
    # Column-wide mode, used only for groups whose own values are all missing.
    overall_mode = df[feature].mode()
    fallback = overall_mode.iloc[0] if not overall_mode.empty else np.nan

    mode_dict = {}
    for group_key in df[by_feature].unique():
        group_mode = df.loc[df[by_feature] == group_key, feature].mode()
        # Series.mode() ignores NaN and is empty for an all-NaN group; indexing
        # it with [0] would raise, so use the fallback instead.
        mode_dict[group_key] = group_mode.iloc[0] if not group_mode.empty else fallback

    return mode_dict

# Row-wise helper for mode-based imputation (used with DataFrame.apply(axis=1)).
def fill_missing_by_mode(cols, mode_dict=None):
    """Return the value to store for one row during mode-based imputation.

    Parameters
    ----------
    cols : sequence of length 2
        ``(group_key, value)`` for one row, e.g. the row Series produced by
        ``df[["title", "bedroom"]].apply(fill_missing_by_mode, axis=1)``.
    mode_dict : dict, optional
        Maps each group key to its fill value. When omitted, the module-level
        ``mode_values`` dict is looked up at call time.

    Returns
    -------
    ``value`` unchanged when it is not null, otherwise ``mode_dict[group_key]``.

    Raises
    ------
    KeyError
        If ``value`` is null and ``group_key`` is not a key of ``mode_dict``.
    NameError
        If ``mode_dict`` is omitted and no global ``mode_values`` exists.
    """
    if mode_dict is None:
        # Late binding on purpose: a ``mode_dict=mode_values`` default is
        # evaluated once, when the ``def`` statement runs. That raises
        # NameError on a fresh kernel (``mode_values`` is assigned only after
        # this definition) and otherwise freezes whichever dict existed at
        # that moment, so later fills would reuse a stale mapping.
        mode_dict = mode_values
    # Unpacking works for lists, tuples and label-indexed row Series alike and
    # avoids positional ``cols[0]`` lookups on a string-indexed Series.
    group_key, value = cols
    if pd.isnull(value):
        return mode_dict[group_key]
    return value
# Impute the numeric room/parking counts with the most frequent value observed
# for the row's house title. Each column gets its own mode table, and that
# table is handed to fill_missing_by_mode explicitly so the fill never depends
# on whichever dict the function's default refers to.
for feature in ("bedroom", "bathroom", "parking_space"):
    mode_values = group_feature_by_feature_based_on_mode(by_feature="title", feature=feature, df=df_train)
    # Extra keyword arguments given to DataFrame.apply are forwarded to the
    # applied function.
    df_train[feature] = df_train[["title", feature]].apply(
        fill_missing_by_mode, axis=1, mode_dict=mode_values
    )

# Re-check the totals after imputation.
print(f"Total missing data in train data is {df_train.isnull().sum().sum()}")
print(f"Total missing data in test data  is {df_test.isnull().sum().sum()}")

Total missing data in train data is 0
Total missing data in test data  is 0


#### Creating new features 

In [452]:
# Geopolitical zone -> member states; used to derive the "Geo_zone" feature.
geo_states = {
    "North_central": ["Benue", "Kogi", "Kwara", "Nasarawa", "Niger", "Plateau"],
    "North_East": ["Adamawa", "Bauchi", "Borno", "Gombe", "Taraba", "Yobe"],
    "North_West": ["Kaduna", "Katsina", "Kano", "Kebbi", "Sokoto", "Jigawa", "Zamfara"],
    "South_East": ["Abia", "Anambra", "Ebonyi", "Enugu", "Imo"],
    "South": ["Akwa Ibom", "Bayelsa", "Cross River", "Delta", "Edo", "Rivers"],
    "South_West": ["Ekiti", "Lagos", "Osun", "Ondo", "Ogun", "Oyo"],
}


def add_geo_zone(df_train):
    """Add a "Geo_zone" column naming the geopolitical zone of each row's state.

    The column starts out as a copy of "loc", so a location that is not listed
    in ``geo_states`` keeps its original value. The frame is modified in place
    and the same object is returned.
    """
    df_train["Geo_zone"] = df_train["loc"]
    for zone, states in geo_states.items():
        in_zone = df_train["loc"].isin(states)
        df_train.loc[in_zone, "Geo_zone"] = zone
    return df_train
# add_geo_zone writes the new column in place and returns the same frame; its
# only parameter is named df_train even when the test frame is passed.
df_train = add_geo_zone(df_train = df_train)
df_test = add_geo_zone(df_train=df_test)

#### Encoding categorical features


In [453]:
# log(price + 1): this column is the regression target (target = "price_log");
# run() maps predictions back to prices with exp(x) - 1.
df_train["price_log"] = np.log(df_train["price"] + 1)
# Square-root variant of the price; it is dropped from the feature matrix
# before modelling and is not used as a target in the cells shown here.
df_train["price_sqrt"] = np.sqrt(df_train["price"])

In [454]:
# Ordinal-encode "loc" by average price: the location with the highest mean
# training price gets rank 1, the next one rank 2, and so on.
location_ranks = list(df_train.groupby(["loc"])["price"].mean().sort_values(ascending=False).index)
# enumerate() hands out the ranks in a single pass instead of calling
# list.index() once per category.
location_ranks_dict = {category: rank for rank, category in enumerate(location_ranks, start=1)}
print(location_ranks_dict)
# Apply the train-derived ranks to both frames.
# NOTE(review): a test location that never occurs in the training data has no
# rank and is mapped to NaN - confirm that cannot happen for this dataset.
categories_train = df_train["loc"]
categories_test = df_test["loc"]
encoded_data_train = categories_train.map(location_ranks_dict)
encoded_data_test = categories_test.map(location_ranks_dict)
df_train["loc"] = encoded_data_train
df_test["loc"] = encoded_data_test

{'Lagos': 1, 'Bayelsa': 2, 'Rivers': 3, 'Akwa Ibom': 4, 'Delta': 5, 'Ogun': 6, 'Cross River': 7, 'Anambra': 8, 'Edo': 9, 'Oyo': 10, 'Ondo': 11, 'Enugu': 12, 'Osun': 13, 'Unknown': 14, 'Ekiti': 15, 'Kano': 16, 'Imo': 17, 'Nasarawa': 18, 'Katsina': 19, 'Plateau': 20, 'Benue': 21, 'Adamawa': 22, 'Kwara': 23, 'Niger': 24, 'Gombe': 25, 'Taraba': 26, 'Kaduna': 27, 'Bauchi': 28, 'Kogi': 29, 'Yobe': 30, 'Jigawa': 31, 'Borno': 32, 'Abia': 33, 'Zamfara': 34, 'Sokoto': 35, 'Ebonyi': 36, 'Kebbi': 37}


In [455]:
# Ordinal-encode the house "title" by average price: the title with the highest
# mean training price gets rank 1. (The location_* variable names are reused
# from the location-encoding cell; here they hold title ranks.)
location_ranks = list(df_train.groupby(["title"])["price"].mean().sort_values(ascending=False).index)
# enumerate() hands out the ranks in a single pass instead of calling
# list.index() once per category.
location_ranks_dict = {category: rank for rank, category in enumerate(location_ranks, start=1)}

print(location_ranks_dict)
# Apply the train-derived ranks to both frames.
# NOTE(review): a test title that never occurs in the training data has no rank
# and is mapped to NaN - confirm that cannot happen for this dataset.
categories_train = df_train["title"]
categories_test = df_test["title"]
encoded_data_train = categories_train.map(location_ranks_dict)
encoded_data_test = categories_test.map(location_ranks_dict)
df_train["title"] = encoded_data_train
df_test["title"] = encoded_data_test

{'Mansion': 1, 'Penthouse': 2, 'Detached duplex': 3, 'Unknown': 4, 'Townhouse': 5, 'Terrace duplex': 6, 'Semi-detached duplex': 7, 'Bungalow': 8, 'Flat': 9, 'Apartment': 10, 'Cottage': 11}


In [456]:
# Ordinal-encode "Geo_zone" by average price: the zone with the highest mean
# training price gets rank 1. (The location_* variable names are reused from
# the location-encoding cell; here they hold geopolitical-zone ranks.)
location_ranks = list(df_train.groupby(["Geo_zone"])["price"].mean().sort_values(ascending=False).index)
# enumerate() hands out the ranks in a single pass instead of calling
# list.index() once per category.
location_ranks_dict = {category: rank for rank, category in enumerate(location_ranks, start=1)}
print(location_ranks_dict)
# Apply the train-derived ranks to both frames.
# NOTE(review): a test zone that never occurs in the training data has no rank
# and is mapped to NaN - confirm that cannot happen for this dataset.
categories_train = df_train["Geo_zone"]
categories_test = df_test["Geo_zone"]
encoded_data_train = categories_train.map(location_ranks_dict)
encoded_data_test = categories_test.map(location_ranks_dict)
df_train["Geo_zone"] = encoded_data_train
df_test["Geo_zone"] = encoded_data_test

{'South': 1, 'South_West': 2, 'Unknown': 3, 'South_East': 4, 'North_central': 5, 'North_East': 6, 'North_West': 7}


In [457]:
# Bedroom, bathroom and parking-space counts are discrete quantities: cast the
# three columns to int in both the train and the test frame.
count_cols = ["bedroom", "bathroom", "parking_space"]
for frame in (df_train, df_test):
    frame[count_cols] = frame[count_cols].astype(int)
df_train.head()

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,price,Geo_zone,price_log,price_sqrt
0,3583,19,7,2,2,1,1149999.565,7,13.955273,1072.380327
1,2748,11,10,4,2,4,1672416.689,2,14.329781,1293.219505
2,9261,15,4,7,5,4,3364799.814,2,15.028879,1834.339067
3,2224,8,3,5,2,4,2410306.756,4,14.695265,1552.516266
4,10300,29,6,2,5,6,2600700.898,5,14.771292,1612.668874


In [458]:
# Combined size feature: bedrooms + bathrooms + parking spaces for each house,
# added to both the train and the test frame.
for frame in (df_train, df_test):
    frame["bed_bath_paking"] = frame["bedroom"] + frame["bathroom"] + frame["parking_space"]

### Model training and validation

In [356]:
# Preview the training frame after feature engineering.
df_train.head()

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,price,Geo_zone,price_log,price_sqrt,bed_bath_paking
0,3583,19,7,2,2,1,1149999.565,4,13.955273,1072.380327,5
1,2748,12,9,4,2,4,1672416.689,2,14.329781,1293.219505,10
2,9261,14,6,7,5,3,3364799.814,2,15.028879,1834.339067,15
3,2224,8,3,5,2,4,2410306.756,3,14.695265,1552.516266,11
4,10300,28,5,2,5,6,2600700.898,5,14.771292,1612.668874,13


In [459]:
# Number of bins from Sturges' rule: ceil(log2(n) + 1), n = number of training rows.
bins = int(np.ceil(np.log2(len(df_train)) + 1))
# Cut the price into that many bins; labels=False stores the integer bin index
# instead of an interval label.
binned_data = pd.cut(df_train["price"], bins=bins,labels=False)
# Bin_value is later used as the stratification label for the CV folds; it is
# dropped from the feature matrix.
df_train["Bin_value"] = binned_data

In [176]:
"""# Step 1: Split the data into training and testing sets
#Divid data into dependent and independent variables
X = df_train.drop(["Bin_value","price","price_log","ID","price_sqrt"],axis=1)
numerical_cols = X.columns
#df_test = df_test.drop("ID",axis=1)
y = df_train["price_log"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Fit a regression model (You can replace LinearRegression with any other model)
cbr = CatBoostRegressor(random_state=0,loss_function="RMSE",silent=True)
model = LinearRegression()
cbr.fit(X_train, y_train)

# Step 3: Make predictions on the test set
y_pred = np.exp(cbr.predict(X_test))-1
y_test = np.exp(y_test)-1

# Step 4: Calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(((y_true - y_pred) ** 2).mean())

rmse_score = rmse(y_test, y_pred)
print("Root Mean Squared Error (RMSE):", rmse_score)"""

Root Mean Squared Error (RMSE): 601853.7131475515


In [403]:
# Preview the training frame including the new Bin_value column.
df_train.head()

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,price,Geo_zone,price_log,price_sqrt,bed_bath_paking,Bin_value
0,3583,19,7,2,2,1,1149999.565,7,13.955273,1072.380327,5,0
1,2748,12,9,4,2,4,1672416.689,2,14.329781,1293.219505,10,1
2,9261,14,6,7,5,3,3364799.814,2,15.028879,1834.339067,15,2
3,2224,8,3,5,2,4,2410306.756,4,14.695265,1552.516266,11,1
4,10300,29,5,2,5,6,2600700.898,5,14.771292,1612.668874,13,2


In [464]:
# Split the training frame into model inputs (X) and the fold-stratification
# label (y). Targets and identifiers are removed from the features.
X = df_train.drop(["Bin_value","price","price_log","ID","price_sqrt"],axis=1)
numerical_cols = X.columns
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because "ID" has already been removed from df_test.
df_test = df_test.drop("ID", axis=1, errors="ignore")
# y is only used to stratify the folds; the regression target is the column
# named by `target`.
y = df_train["Bin_value"]
target = "price_log"
# Alternative splitter, currently disabled:
#skf = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
scaler = MinMaxScaler()
def run(model):
    """Cross-validate ``model`` and collect its test-set predictions per fold.

    For every split produced by the module-level ``skf`` (stratified on ``y``),
    the features are scaled with the module-level ``scaler`` fitted on the
    training part only, ``model`` is fitted on the ``target`` column of
    ``df_train``, and all predictions are mapped back to the price scale with
    ``exp(x) - 1``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        The same object is fitted again in every fold.

    Returns
    -------
    list of numpy.ndarray
        One array of price-scale predictions for ``df_test`` per fold.

    Side effects
    ------------
    Prints the validation RMSE of each fold and their mean, and leaves
    ``scaler`` fitted on the training part of the last fold.
    """
    scores = []
    final_predictions = []
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        # .copy() gives each fold its own frames, so writing the scaled values
        # below never assigns into a slice of X or df_test.
        xtrain = X.iloc[train_index].copy()
        xvalid = X.iloc[test_index].copy()
        ytrain = df_train[target].iloc[train_index]
        yvalid = df_train[target].iloc[test_index]
        # Select the training columns explicitly so the test frame always has
        # the same features, in the same order, as X.
        xtest = df_test[numerical_cols].copy()

        # Fit the scaler on the training part only, then reuse its statistics
        # for the validation and test frames.
        xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
        xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
        xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

        model.fit(xtrain, ytrain)

        # Back-transform the target and the predictions to the price scale.
        yvalid = np.exp(yvalid) - 1
        preds_valid = np.exp(model.predict(xvalid)) - 1
        test_preds = np.exp(model.predict(xtest)) - 1
        final_predictions.append(test_preds)

        # RMSE as sqrt(MSE): the ``squared=False`` shortcut of
        # mean_squared_error is deprecated in recent scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
        print(f"Fold {i} score : ", rmse)
        scores.append(rmse)

    print(f"Mean rmse :{np.array(scores).mean()}")
    return final_predictions

In [465]:
# Baseline: linear regression evaluated with the cross-validation loop in run();
# preds_new receives the list of per-fold test predictions.
model = LinearRegression()
preds_new = run(model);

Fold 0 score :  662131.9638980895
Fold 1 score :  659129.2872594572
Fold 2 score :  652966.6228907097
Fold 3 score :  683077.4472933303
Fold 4 score :  672537.5064442197
Fold 5 score :  671365.6151665957
Fold 6 score :  675739.1677735692
Fold 7 score :  664672.8229665444
Fold 8 score :  731121.4899000871
Fold 9 score :  733314.0692944645
Mean rmse :680605.5992887067


In [234]:
# Average the per-fold test predictions into one price per test row.
# run() returns a list (one array per fold). The isinstance guard makes this
# cell safe to re-execute: a second pass over the already averaged array would
# otherwise collapse it to a single value.
if isinstance(preds_new, list):
    preds_new = np.mean(np.column_stack(preds_new), axis=1)
# .assign builds a new frame, so the price column is never written into a
# slice of `test`.
sub = test[["ID"]].assign(price=preds_new)
sub.head()

Unnamed: 0,ID,price
0,845,2371974.0
1,1924,977241.9
2,10718,1161319.0
3,12076,6273581.0
4,12254,1843415.0


In [235]:
# Switch the working directory to the submissions folder. The later
# sub.to_csv(...) cells pass bare file names, so they rely on this chdir
# having been executed first.
path = r"C:\Users\User\Desktop\Blessing_AI\Free_AI_Classes_2023\submissions"
os.chdir(path)
sub.to_csv("Baseline_lreg.csv",index=False)

In [466]:
# CatBoost: cross-validate, then average the per-fold test predictions.
cbr = CatBoostRegressor(random_state=0,loss_function="RMSE",silent=True)
preds_new = run(cbr);
preds_new= np.mean(np.column_stack(preds_new),axis=1)
# .assign builds a new frame, so the price column is never written into a
# slice of `test`.
sub = test[["ID"]].assign(price=preds_new)
sub.head()

Fold 0 score :  568957.4041088577
Fold 1 score :  477776.36863818555
Fold 2 score :  538982.5742320182
Fold 3 score :  543260.5939830546
Fold 4 score :  561906.7569989029
Fold 5 score :  527476.7521859676
Fold 6 score :  574830.5820552923
Fold 7 score :  570572.6086555828
Fold 8 score :  618900.2047777285
Fold 9 score :  539117.5803402871
Mean rmse :552178.1425975878


Unnamed: 0,ID,price
0,845,2384614.0
1,1924,1032201.0
2,10718,1468068.0
3,12076,8823767.0
4,12254,1875181.0


In [467]:
# Bare file name: written relative to the current working directory (see the
# os.chdir call in the linear-regression submission cell).
sub.to_csv("Baseline_cbt_log_latest_now.csv",index=False)

In [262]:
# XGBoost: cross-validate, then average the per-fold test predictions.
xgb_r = xgb.XGBRegressor(random_state=0)
preds_new = run(xgb_r);
preds_new= np.mean(np.column_stack(preds_new),axis=1)
# .assign builds a new frame, so the price column is never written into a
# slice of `test`.
sub = test[["ID"]].assign(price=preds_new)
sub.head()

Fold 0 score :  588864.6566043296
Fold 1 score :  539208.849864948
Fold 2 score :  490213.9131798452
Fold 3 score :  617181.1192674245
Fold 4 score :  554990.9677057235
Fold 5 score :  495226.72694395314
Fold 6 score :  486758.95826003194
Fold 7 score :  564277.565220934
Fold 8 score :  554563.6902205719
Fold 9 score :  630663.5994796933
Fold 10 score :  614461.31354569
Fold 11 score :  544784.4508017757
Fold 12 score :  599444.1283603649
Fold 13 score :  544272.5103345743
Fold 14 score :  609784.0482371405
Mean rmse :562313.0998684667


Unnamed: 0,ID,price
0,845,2308672.0
1,1924,1019366.0
2,10718,1427042.0
3,12076,8804663.0
4,12254,1862379.0


In [263]:
# Bare file name: written relative to the current working directory (see the
# os.chdir call in the linear-regression submission cell).
sub.to_csv("Baseline_xgb.csv",index=False)

In [430]:
# Gradient boosting with scikit-learn's GradientBoostingRegressor. Despite the
# `lgm` variable name, this is not a LightGBM model.
lgm = GradientBoostingRegressor(random_state=0)
lgm_preds = run(lgm);

Fold 0 score :  568102.0458240584
Fold 1 score :  495157.40113839007
Fold 2 score :  551172.5224883802
Fold 3 score :  557527.781643128
Fold 4 score :  561302.7242180089
Fold 5 score :  545167.2712150593
Fold 6 score :  567020.7367283601
Fold 7 score :  550775.5489949263
Fold 8 score :  584938.5932937949
Fold 9 score :  548729.4960458819
Mean rmse :552989.4121589989


In [383]:
# Average the per-fold gradient-boosting predictions into one price per test row.
preds_new= np.mean(np.column_stack(lgm_preds),axis=1)
# .assign builds a new frame, so the price column is never written into a
# slice of `test`.
sub = test[["ID"]].assign(price=preds_new)
sub.head()

Unnamed: 0,ID,price
0,845,2324500.0
1,1924,1070627.0
2,10718,1459951.0
3,12076,9126634.0
4,12254,1860693.0


In [384]:
# Bare file name: written relative to the current working directory (see the
# os.chdir call in the linear-regression submission cell).
sub.to_csv("gbr_baseline.csv",index=False)