In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_percentage_error,r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.inspection import PartialDependenceDisplay
import category_encoders as ce
import scikitplot as skplt

import copy
import time
import os
import threading
from pathlib import Path
from joblib import dump, load
import re

In [2]:
target = 'price'
categorical_columns = ['city','layout_type','property_type','furnish_type','locality','seller_type']
continuous_columns = ['bedroom','area','bathroom']

In [3]:
data = pd.read_csv('./data/data.csv')

In [4]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer_target = Pipeline(steps=[
    ('encoder', ce.CatBoostEncoder())])

In [5]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_columns),
        ('cat', categorical_transformer_target, categorical_columns)])

In [10]:
regressors = [LinearRegression(n_jobs=-1),
              DecisionTreeRegressor(),
              RandomForestRegressor(n_jobs=-1),
              AdaBoostRegressor(),
              #SVR(),
              XGBRegressor(n_jobs=-1)]

In [11]:
pipelines = []
        
for regressor in regressors:
    pipelines.append(make_pipeline(preprocessor,regressor,memory='./cache'))

In [12]:
def fit_pipeline(pipe,xtrain,xtest,ytrain,ytest,overwrite_saved_models=False):
    model_folder = './saved_models/'
    file_name = re.sub('[^a-zA-Z0-9 \n\.]', '', list(pipe.named_steps.keys())[-1])
    print(file_name)
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    if not Path(model_folder+file_name+'.joblib').is_file() or overwrite_saved_models:
        start_time = time.time()
        logs = 'Model '+file_name+'\n'
        print('Training started')
        pipe.fit(xtrain, ytrain)
        predictions = pipe.predict(xtest)
        execution_time = (time.time() - start_time)
        logs = logs + f'train duration {execution_time} \n'
        r2 = r2_score(ytest, predictions)
        logs = logs + f'r2:  {r2}\n '
        print(logs)
        with open('./logs'+file_name+'.txt', 'w') as f:
            f.write(logs)
        dump(pipe, model_folder+file_name+'.joblib')
    print('Model trained')

In [13]:
X = data.drop(target,axis=1)
y = data[target]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [15]:
for pipe in pipelines:
    fit_pipeline(pipe,X_train, X_test, y_train, y_test,True)

linearregression
Training started
Model linearregression
train duration 2.690599203109741 
r2:  0.4067735668570026
 
Model trained
decisiontreeregressor
Training started
Model decisiontreeregressor
train duration 4.651044130325317 
r2:  0.3429276193333586
 
Model trained
randomforestregressor
Training started
Model randomforestregressor
train duration 13.120703220367432 
r2:  0.6560176154570969
 
Model trained
adaboostregressor
Training started
Model adaboostregressor
train duration 9.27309250831604 
r2:  0.3267355158087105
 
Model trained
xgbregressor
Training started
Model xgbregressor
train duration 7.150604248046875 
r2:  0.6079513596594885
 
Model trained
