In [1]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor,DMatrix
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.preprocessing import LabelEncoder

In [2]:
path = "/Users/thrilok/Desktop/mantra_collab_job/work_files/latest_broko_code"
df = pd.read_csv(path  + "/Dataset/ML_CLEAN_DATA__Bdv2.3_RES.csv")
df.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180356 entries, 0 to 180355
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ML_Number       180356 non-null  object 
 1   Postal_Code     180332 non-null  object 
 2   Postal_Short    180332 non-null  object 
 3   Style           180356 non-null  object 
 4   Type            180356 non-null  object 
 5   Cluster         180356 non-null  object 
 6   List_Price      180356 non-null  float64
 7   Cluster_Price   180356 non-null  float64
 8   Taxes           180356 non-null  float64
 9   Cluster_Tax     180356 non-null  float64
 10  Bedrooms        180356 non-null  int64  
 11  Washrooms       180356 non-null  int64  
 12  Basement1       180356 non-null  object 
 13  Days_On_Market  180356 non-null  int64  
 14  Exterior1       180356 non-null  object 
 15  Garage_Type     152749 non-null  object 
 16  lat             180341 non-null  float64
 17  lng       

In [3]:
drop_cols = ["ML_Number","Postal_Code","Sold_Price","Month_Year","HPI_for_Month"]
data = df.drop(drop_cols, axis=1)
data.isnull().sum()

Postal_Short         24
Style                 0
Type                  0
Cluster               0
List_Price            0
Cluster_Price         0
Taxes                 0
Cluster_Tax           0
Bedrooms              0
Washrooms             0
Basement1             0
Days_On_Market        0
Exterior1             0
Garage_Type       27607
lat                  15
lng                  15
HPI_Sold_Price        0
dtype: int64

In [4]:
data = data.dropna(axis = 0, how ='any') 

In [5]:
q1 = data['HPI_Sold_Price'].quantile(0.25)
q3 = data['HPI_Sold_Price'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
lower_bound_outliers = data[data['HPI_Sold_Price'] < lower_bound] 
upper_bound_outliers = data[data['HPI_Sold_Price'] > upper_bound]
lower_bound_outliers.to_csv(path + '/Dataset/lower_bound_outliers.csv')
upper_bound_outliers.to_csv(path +'/Dataset/upper_bound_outliers.csv')    
dataset = data[(data['HPI_Sold_Price'] >= lower_bound) & (data['HPI_Sold_Price'] <= upper_bound)]

In [6]:
dataset.isnull().sum()

Postal_Short      0
Style             0
Type              0
Cluster           0
List_Price        0
Cluster_Price     0
Taxes             0
Cluster_Tax       0
Bedrooms          0
Washrooms         0
Basement1         0
Days_On_Market    0
Exterior1         0
Garage_Type       0
lat               0
lng               0
HPI_Sold_Price    0
dtype: int64

In [7]:
X = dataset.drop('HPI_Sold_Price',axis=1)
y = dataset['HPI_Sold_Price']

In [8]:
categorical_features = [column for column, dtype in X.dtypes.items() if dtype==object]


In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
best_params = pd.read_csv(path + "/Dataset/best.csv")
best_params

Unnamed: 0,model_type,learning_rate,depth,l2_leaf_reg,boosting_type,max_ctr_complexity
0,catboost,0.02846,10,1.218826,Plain,6


In [11]:
# Define the XGBoost model
def xgb_model():
    params = {
        'learning_rate': best_params['learning_rate'][0],
        'max_depth':best_params['max_depth'][0] ,
        'n_estimators': best_params['n_estimators'][0],
        'reg_lambda':best_params['reg_lambda'][0],
        'reg_alpha':best_params['reg_alpha'][0]
    }
    

    model = XGBRegressor(**params)

    x_valid = X_valid


    # Encode categorical features before training
    label_encoders = {}
    for feature in categorical_features:
        le = LabelEncoder()
        X_train[feature] = le.fit_transform(X_train[feature])
        X_valid[feature] = le.fit_transform(X_valid[feature])
        label_encoders[feature] = le

    model.fit(
        X_train,y_train,
        eval_set=[(X_valid,y_valid)],
        early_stopping_rounds=10,
        verbose=False,
    )

    y_pred = model.predict(X_valid)

    return MAPE(y_valid, y_pred),x_valid,y_pred,model,y_valid


In [12]:
# Define the CatBoost model

def catboost_model():
    params = {
        'learning_rate': best_params['learning_rate'][0],
        'depth':best_params['depth'][0] ,
        'l2_leaf_reg': best_params['l2_leaf_reg'][0],
        'boosting_type':best_params['boosting_type'][0],
        'max_ctr_complexity': best_params['max_ctr_complexity'][0]
        }

    model = CatBoostRegressor(**params)
    
    x_valid = X_valid

    cat_train_pool = Pool(X_train, y_train, cat_features=categorical_features)
    cat_val_pool = Pool(X_valid, y_valid, cat_features=categorical_features)

    model.fit(cat_train_pool, eval_set=cat_val_pool, early_stopping_rounds=10)
    
    y_pred = model.predict(cat_val_pool)
    
    return MAPE(y_valid, y_pred),x_valid,y_pred,model,y_valid

In [13]:
# Define the LightGBM model
def lightgbm_model(trial):
    params = {
        'learning_rate': best_params['learning_rate'][0],
        'max_depth':best_params['max_depth'][0] ,
        'n_estimators': best_params['n_estimators'][0],
        'reg_lambda':best_params['reg_lambda'][0],
        'reg_alpha':best_params['reg_alpha'][0]
        }

    x_valid = X_valid

    # Encode categorical features before training
    label_encoders = {}
    for feature in categorical_features:
        le = LabelEncoder()
        X_train[feature] = le.fit_transform(X_train[feature])
        X_valid[feature] = le.fit_transform(X_valid[feature])
        label_encoders[feature] = le

    model = LGBMRegressor(**params)

    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        categorical_feature=categorical_features)
    

    y_pred = model.predict(X_valid)

    return MAPE(y_valid, y_pred),x_valid,y_pred,model,y_valid


In [14]:
best_params['model_type'][0]

'catboost'

In [15]:
if  best_params['model_type'][0] == "xgboost":
      xgb_mape,x_valid,y_pred,model,y_valid= xgb_model()
      print(f"xgb Mape: {xgb_mape}")
elif best_params['model_type'][0] == "catboost":
     cb_mape,x_valid,y_pred,model,y_valid= catboost_model()
     print(f"catboost Mape: {cb_mape}")
else:
      lgb_mape,x_valid,y_pred,model,y_valid = lightgbm_model()
      print(f"lgb Mape: {lgb_mape}")


0:	learn: 390412.7619225	test: 392036.3152519	best: 392036.3152519 (0)	total: 138ms	remaining: 2m 17s
1:	learn: 380900.8570778	test: 382452.8214511	best: 382452.8214511 (1)	total: 207ms	remaining: 1m 43s
2:	learn: 371755.3367731	test: 373268.9309823	best: 373268.9309823 (2)	total: 241ms	remaining: 1m 20s
3:	learn: 362813.1927109	test: 364273.1629436	best: 364273.1629436 (3)	total: 293ms	remaining: 1m 13s
4:	learn: 353891.0360023	test: 355269.8846373	best: 355269.8846373 (4)	total: 366ms	remaining: 1m 12s
5:	learn: 345246.7064667	test: 346578.2967549	best: 346578.2967549 (5)	total: 429ms	remaining: 1m 11s
6:	learn: 337109.2729715	test: 338380.9159571	best: 338380.9159571 (6)	total: 486ms	remaining: 1m 8s
7:	learn: 329233.4872748	test: 330451.9137381	best: 330451.9137381 (7)	total: 548ms	remaining: 1m 7s
8:	learn: 321404.1070808	test: 322572.7631516	best: 322572.7631516 (8)	total: 603ms	remaining: 1m 6s
9:	learn: 313846.6061473	test: 314974.2284986	best: 314974.2284986 (9)	total: 666ms	r

In [16]:
model.feature_importances_

array([ 2.57085775,  0.95038911,  0.7455494 ,  0.6129825 , 60.24181172,
        7.81986183,  7.54255608,  1.96975939,  1.08224331,  2.35537338,
        0.53760453,  6.80527282,  0.91290814,  0.99250101,  2.29917376,
        2.56115526])

In [17]:
for feature,weights in zip(X.columns,model.feature_importances_):
    print(f"{feature} : {weights}")

Postal_Short : 2.5708577542129314
Style : 0.9503891099680422
Type : 0.7455494022484341
Cluster : 0.6129825013576335
List_Price : 60.2418117214345
Cluster_Price : 7.819861829330761
Taxes : 7.542556077434884
Cluster_Tax : 1.969759394850091
Bedrooms : 1.0822433070505941
Washrooms : 2.3553733845603215
Basement1 : 0.5376045287917004
Days_On_Market : 6.805272820709363
Exterior1 : 0.9129081399819995
Garage_Type : 0.9925010080151823
lat : 2.2991737580780445
lng : 2.5611552619754585


In [18]:
predictions = x_valid
predictions['ML_Number'] = df['ML_Number']
predictions['HPI_Sold_Price'] = y_valid.astype(int)
predictions['Predicted_HPI_Price']  = y_pred.astype(int).astype(int)
predictions['Diff']  = abs(predictions['HPI_Sold_Price'] - predictions['Predicted_HPI_Price']).astype(int)
predictions['Accuracy'] = (abs(1 - (abs(predictions['HPI_Sold_Price'] - predictions['Predicted_HPI_Price']))/predictions['HPI_Sold_Price']) * 100)
predictions['MAPE'] = (predictions['Diff']/predictions['HPI_Sold_Price'] * 100)
predictions

Unnamed: 0,Postal_Short,Style,Type,Cluster,List_Price,Cluster_Price,Taxes,Cluster_Tax,Bedrooms,Washrooms,...,Exterior1,Garage_Type,lat,lng,ML_Number,HPI_Sold_Price,Predicted_HPI_Price,Diff,Accuracy,MAPE
113126,l4a,2-storey,Detached,l4a Detached,1098000.0,1.607666e+06,4828.32,5754.071501,4,4,...,Brick,Attached,43.983135,-79.234388,N5360178,1365132,1346225,18907,98.615006,1.384994
11686,k8n,1 1/2 storey,Detached,k8n Detached,199900.0,6.727084e+05,2015.67,5195.213053,2,1,...,Vinyl Siding,Other,44.159867,-77.381395,X6557189,273041,310889,37848,86.138346,13.861654
33054,l3b,2-storey,Detached,l3b Detached,514900.0,6.809488e+05,3056.99,3072.023164,3,2,...,Brick Front,Detached,42.987975,-79.234474,X5313997,533801,519163,14638,97.257780,2.742220
152663,l0a,2-storey,Detached,l0a Detached,1099900.0,1.030156e+06,6324.06,4412.148112,4,4,...,Stone,Attached,44.160512,-78.453949,X6012047,1126096,1157237,31141,97.234605,2.765395
93634,l6s,Backsplit,Detached,l6s Detached,920000.0,1.101814e+06,4364.47,4574.549105,3,2,...,Brick,Attached,43.731869,-79.724575,W5714276,923836,1012625,88789,90.389095,9.610905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25981,l9r,2-storey,Att/Row/Twnhouse,l9r Att/Row/Twnhouse,649000.0,7.738573e+05,2810.39,2956.917636,3,2,...,Brick,Attached,44.164753,-79.878333,N5370932,727081,659853,67228,90.753712,9.246288
88571,l9c,3 storey,Att/Row/Twnhouse,l9c Att/Row/Twnhouse,779900.0,8.042118e+05,4259.72,2994.477059,3,3,...,Stone,Attached,43.239029,-79.917920,X6025547,857978,826904,31074,96.378229,3.621771
47386,l6m,3 storey,Att/Row/Twnhouse,l6m Att/Row/Twnhouse,949000.0,1.198666e+06,3429.90,3957.696983,3,3,...,Brick,Built-In,43.447420,-79.762695,W5431216,1050160,1080937,30777,97.069304,2.930696
53592,l0r,bungalow,Detached,l0r Detached,799900.0,1.165328e+06,3569.88,4850.322393,2,2,...,Brick,Detached,43.194571,-79.934089,X5389094,966121,861568,104553,89.178064,10.821936


In [19]:
Avg_MAPE = (predictions['MAPE'].sum() / len(predictions['MAPE'])).astype(int)
print(f'The Average MAPE Error is: {Avg_MAPE}')

The Average MAPE Error is: 11


In [20]:
Avg_Accuracy = (predictions['Accuracy'].sum()/len(predictions['Accuracy'])).astype(int)
print(f'The Average Accuracy is: {Avg_Accuracy}')

The Average Accuracy is: 98


In [21]:
# save the dataframe as a csv file
predictions.to_csv(path + "/Dataset/best_Predictions_residential.csv",index=False)

In [22]:
# import pickle

# pickle.dump(model, open(path + '/models/' + best_params['model_type'][0]+'.pkl', 'wb'))