In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_percentage_error,r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.inspection import PartialDependenceDisplay
import category_encoders as ce
import scikitplot as skplt

import copy
import time
import os
import threading
from pathlib import Path
from joblib import dump, load
import re
import random

In [70]:
# Column the regressors learn to predict.
target = 'price'

# Categorical feature columns. 'locality' is deliberately last: the
# preprocessing step slices it off with [-1:] to encode it separately
# from the remaining categorical columns.
categorical_columns = [
    'city',
    'layout_type',
    'property_type',
    'furnish_type',
    'seller_type',
    'locality',
]

# Numeric feature columns.
continuous_columns = [
    'bedroom',
    'area',
    'bathroom',
]

In [71]:
# Read the raw listings table from the project's data folder.
data = pd.read_csv(filepath_or_buffer='./data/data.csv')
# Optional log-transform of the target, currently disabled:
#data[target] = np.log(data[target])

In [72]:
# Show the loaded frame; the notebook renders a truncated view (first and last rows).
data

Unnamed: 0,seller_type,bedroom,layout_type,property_type,locality,price,area,furnish_type,bathroom,city
0,OWNER,2.0,BHK,Apartment,Bodakdev,20000.0,1450.0,Furnished,2.0,Ahmedabad
1,OWNER,1.0,RK,Studio Apartment,CG Road,7350.0,210.0,Semi-Furnished,1.0,Ahmedabad
2,OWNER,3.0,BHK,Apartment,Jodhpur,22000.0,1900.0,Unfurnished,3.0,Ahmedabad
3,OWNER,2.0,BHK,Independent House,Sanand,13000.0,1285.0,Semi-Furnished,2.0,Ahmedabad
4,OWNER,2.0,BHK,Independent House,Navrangpura,18000.0,1600.0,Furnished,2.0,Ahmedabad
...,...,...,...,...,...,...,...,...,...,...
98776,OWNER,1.0,BHK,Apartment,Chinchwad,13250.0,650.0,Semi-Furnished,1.0,Pune
98777,OWNER,1.0,RK,Studio Apartment,Wadgaon Sheri,7000.0,350.0,Unfurnished,1.0,Pune
98778,OWNER,1.0,BHK,Apartment,Kothrud,8500.0,389.0,Semi-Furnished,1.0,Pune
98779,AGENT,1.0,BHK,Apartment,Wakad,16500.0,670.0,Semi-Furnished,2.0,Pune


In [73]:
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Single-step pipeline wrapping category_encoders' CatBoostEncoder.
catboost_transformer = Pipeline([
    ('encoder', ce.CatBoostEncoder()),
])

# NOTE: despite its name, this pipeline applies OrdinalEncoder (integer codes),
# not OneHotEncoder.
one_hot_transformer = Pipeline([
    ('encoder', OrdinalEncoder()),
])

In [74]:
# Route each column group through its own sub-pipeline:
#   'num'  -> all continuous columns
#   'cat1' -> the last categorical column only
#   'cat2' -> every categorical column except the last
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, continuous_columns),
    ('cat1', catboost_transformer, categorical_columns[-1:]),
    ('cat2', one_hot_transformer, categorical_columns[:-1]),
])

In [87]:
# Candidate models to compare. Every estimator that uses randomness gets a fixed
# random_state so the reported scores can be reproduced on a fresh kernel
# (123 matches the seed used for the train/test split).
regressors = [
    LinearRegression(n_jobs=-1),
    DecisionTreeRegressor(random_state=123),
    RandomForestRegressor(n_jobs=-1, random_state=123),
    AdaBoostRegressor(random_state=123),
    #SVR(),
    XGBRegressor(n_jobs=-1, random_state=123),
]

In [88]:
# One end-to-end pipeline (preprocessing + estimator) per candidate regressor.
# `memory='./cache'` asks scikit-learn to cache fitted transformers on disk.
pipelines = [
    make_pipeline(preprocessor, regressor, memory='./cache')
    for regressor in regressors
]

In [89]:
def fit_pipeline(pipe,xtrain,xtest,ytrain,ytest,overwrite_saved_models=False):
    """Train ``pipe``, score it on the test split, log the result and save it.

    The name of the pipeline's last step (e.g. ``'xgbregressor'``), stripped of
    unusual characters, is used as the file stem for both the saved model
    (``./saved_models/<name>.joblib``) and its log (``./logs/<name>.txt``).

    Parameters
    ----------
    pipe : object exposing ``named_steps``, ``fit`` and ``predict``
        The pipeline to train.
    xtrain, ytrain :
        Data passed to ``pipe.fit``.
    xtest, ytest :
        Data used to compute the R² score written to the log.
    overwrite_saved_models : bool, default False
        When False and a saved model already exists, training is skipped and
        the saved model is loaded instead.

    Returns
    -------
    The fitted pipeline: ``pipe`` itself after training, or the model loaded
    from disk when training was skipped.
    """
    model_folder = './saved_models/'
    log_folder = './logs/'
    # Raw string: '\.' is a regex escape, not a Python string escape.
    file_name = re.sub(r'[^a-zA-Z0-9 \n\.]', '', list(pipe.named_steps.keys())[-1])
    print(file_name)
    model_path = Path(model_folder + file_name + '.joblib')

    # Reuse the persisted model instead of claiming an untrained pipe was trained.
    if model_path.is_file() and not overwrite_saved_models:
        print('Saved model found, training skipped')
        return load(model_path)

    os.makedirs(model_folder, exist_ok=True)
    os.makedirs(log_folder, exist_ok=True)

    logs = 'Model '+file_name+'\n'
    print('Training started')
    # Time only the fit so that "train duration" excludes prediction time.
    start_time = time.time()
    pipe.fit(xtrain, ytrain)
    execution_time = (time.time() - start_time)

    predictions = pipe.predict(xtest)
    logs = logs + f'train duration {execution_time} \n'
    r2 = r2_score(ytest, predictions)
    logs = logs + f'r2:  {r2}\n '
    print(logs)

    # Logs go inside ./logs/ (the old path lacked the separator and wrote
    # 'logs<name>.txt' into the working directory).
    with open(log_folder + file_name + '.txt', 'w') as f:
        f.write(logs)
    dump(pipe, model_path)
    print('Model trained')
    return pipe

In [90]:
# Feature matrix: every column except the target.
X = data.drop(columns=[target])
# Label vector: the target column on its own.
y = data[target]

In [91]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [92]:
# Train, evaluate and persist every candidate pipeline, retraining even when a
# saved model already exists.
for pipe in pipelines:
    fit_pipeline(
        pipe,
        X_train,
        X_test,
        y_train,
        y_test,
        overwrite_saved_models=True,
    )

linearregression
Training started
Model linearregression
train duration 1.3933022022247314 
r2:  0.7185866218984773
 
Model trained
decisiontreeregressor
Training started
Model decisiontreeregressor
train duration 1.6883883476257324 
r2:  0.6534644921037569
 
Model trained
randomforestregressor
Training started
Model randomforestregressor
train duration 3.1575446128845215 
r2:  0.8052847471528121
 
Model trained
adaboostregressor
Training started
Model adaboostregressor
train duration 4.31696629524231 
r2:  0.2815369025436222
 
Model trained
xgbregressor
Training started
Model xgbregressor
train duration 3.5307908058166504 
r2:  0.8212923056378048
 
Model trained


XGBoost obtuvo el score R² más alto (≈ 0.82), seguido de cerca por RandomForest (≈ 0.81).

In [95]:
# Search space for the randomized search. Keys use the '<step name>__<parameter>'
# form so they reach the estimator inside the pipeline.
param = {'xgbregressor__max_depth': [3,7, 10], 
         'xgbregressor__n_estimators':[50,150,300],         
        }

# Select the model for cross-validation
modelname = 'xgbregressor'

reg = load(f'./saved_models/{modelname}.joblib') 


# NOTE(review): the cache file name does not depend on `modelname` or `param`;
# delete it after changing either, otherwise the stale search result is reloaded.
cf_file = './saved_models/cv.joblib'

if not Path(cf_file).is_file():
    # random_state fixes which 5 parameter combinations are sampled, so the
    # search (and the cached result) is reproducible; 123 matches the seed
    # used for the train/test split.
    CV = RandomizedSearchCV(reg, param, n_jobs=2, n_iter=5, cv=5, random_state=123)
    start_time = time.time()
    CV.fit(X_train, y_train)  
    print(f'training duration{(time.time() - start_time)} s')
    dump(CV, cf_file)
else:
    CV = load(cf_file)

print(CV.best_params_)    
print(CV.best_score_)

training duration66.15009379386902 s
{'xgbregressor__n_estimators': 50, 'xgbregressor__max_depth': 7}
0.8160851132190361
