In [1]:
from sklearn import set_config
set_config(display="text")
import warnings
warnings.simplefilter('ignore')

In [2]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import dill
%matplotlib inline

In [3]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,Name,Location,Kms_driven,Fuel_type,Owner,Year,Price
0,Ford Figo Duratec,Bangalore,35056,Petrol,0,2015,380000
1,Maruti Suzuki Wagon,Bangalore,44000,Petrol,0,2016,465000
2,Hyundai Creta 1.6,Bangalore,42917,Petrol,0,2018,1350000
3,Hyundai Venue,Chennai,16112,Petrol,2,2019,1019699
4,Honda Jazz,Pune,30988,Petrol,2,2017,713499


In [11]:
data['Year'].unique()

array([2015, 2016, 2018, 2019, 2017, 2013, 2012, 2020, 2014, 2011, 2021,
       2010, 2001, 2022, 2000, 2008, 2005, 2007, 2006, 2004, 2002, 2009,
       2003], dtype=int64)

In [6]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score

In [8]:
models = {
    'LinearRegression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'KNeighborsRegressor':KNeighborsRegressor(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'RandomForestRegressor':RandomForestRegressor(),
    'XGBRegressor':XGBRegressor(),
    'CatBoostRegressor':CatBoostRegressor(verbose=False),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'SVR':SVR()
}

In [9]:
params={
    'LinearRegression':{},
    'Ridge':{'alpha':[0.1, 0.2, 0.5, 0.7,1,5,10,20]},
    'Lasso':{'alpha':[0.1, 0.2, 0.5, 0.7,1,5,10,20]},
    'KNeighborsRegressor':{
        'n_neighbors' : [5,7,9,11,13,15],
        #'weights' : ['uniform','distance'],
        #'metric' : ['minkowski','euclidean','manhattan'],
        'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    'DecisionTreeRegressor':{
        'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        #The function to measure the quality of a split.
        #'splitter':['best','random'],
        #The strategy used to choose the split at each node. 
        'max_depth':range(2,25,1),
        #the maximum depth of tree
        'min_samples_split':range(2,20,1),
        #minimum samples required for splitting further
        'min_samples_leaf':range(1,15,1),
        #minimum samples required to be a leaf
        'max_features':['sqrt','log2']
        #The number of features to consider when looking for the best split:
    },
    'RandomForestRegressor':{
#        'n_estimators':range(10, 100, 10),
        'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
#         'max_depth':range(3, 25, 1),
#         'min_samples_split':range(2, 15, 1),
#         'min_samples_leaf':range(1, 15, 1),
        'max_features':['sqrt','log2']
    },
    'GradientBoostingRegressor':{
#        'n_estimators':range(25, 500, 25),
        'loss':['squared_error', 'absolute_error', 'huber','quantile'],
        'criterion':['friedman_mse','squared_error'],
#         'max_depth':range(3, 25, 1),
#         'min_samples_split':range(2, 15, 1),
#         'min_samples_leaf':range(1, 15, 1),
#         'learning_rate': [1,0.5,.1, .01, .05, .001],
#         'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'max_features':['sqrt','log2']
    },
    'XGBRegressor':{
#        'n_estimators':range(25, 500, 25),
        'learning_rate': [1,0.5,.1, .01, .05, .001]
    },
    'CatBoostRegressor':{
        'depth': [6, 8, 10],
        'learning_rate': [1,0.5,.1, .01, .05, .001],
        'iterations': [30, 50, 100]
    },
    'SVR':{
        'C': [0.1, 1, 10, 100, 1000], 
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf', 'sigmoid']
    }
}

In [10]:
def transformer_pipe():
    numeric_features = ['Kms_driven', 'Owner', 'Year']
    categorical_features = ['Name', 'Location', 'Fuel_type']
    num_pipe = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler()
    )
    cat_pipe = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        OneHotEncoder()
    )
    transformer_obj = ColumnTransformer([
        ('numerical', num_pipe, numeric_features),
        ('categorical', cat_pipe, categorical_features)
    ])
    return transformer_obj

In [11]:
target = 'Price'

In [12]:
transformer_obj = transformer_pipe()
processed_data = transformer_obj.fit_transform(data.drop(target, axis=1)).toarray()
with open('preprocessor.pkl', 'wb') as file:
    dill.dump(transformer_obj, file)

In [13]:
raw_array = pd.DataFrame(np.column_stack((processed_data, data[[target]])))
raw_array.to_csv('processed_array.csv', index=None, header=True)
x = raw_array.iloc[:, :-1]
y = raw_array.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=61)

In [14]:
for i in models:
    model = models[i]
    param = params[i]
    
    model.fit(x, y)
    y_pred_train = model.predict(x)
    y_pred_test = model.predict(x_train)
    print(i, r2_score(y, y_pred_train)*100, r2_score(y_train, y_pred_test)*100)

LinearRegression 94.40389372528644 95.09278949885231
Ridge 84.72419979156565 86.23172806285407
Lasso 94.40381860836112 95.09298575075152
KNeighborsRegressor 59.34010384821696 59.4929879231405
DecisionTreeRegressor 99.63778428708689 99.99412265594052
RandomForestRegressor 92.97311066157907 93.49900522609596
XGBRegressor 96.57236032154582 97.02322678769532
CatBoostRegressor 92.65619298498062 93.26307915484212
GradientBoostingRegressor 82.13110704830036 82.77832661547107
SVR -5.548068695499109 -5.166799199381833


In [15]:
mymodel = DecisionTreeRegressor()
mymodel.fit(x, y)
with open('model.pkl', 'wb') as file:
    dill.dump(mymodel, file)

In [16]:
with open('model.pkl', 'rb') as file:
    loader = dill.load(file)

In [17]:
y_pred = loader.predict(x_test)

In [18]:
r2_score(y_test, y_pred)*100

98.29230541895816

In [19]:
with open('preprocessor.pkl', 'rb') as file:
    processor = dill.load(file)

In [20]:
new_data = processor.transform(df_test_x)

In [21]:
loader.predict(new_data)

array([ 380000.,  465000., 1350000., 1019699.,  713499.,  391099.,
        474299., 1252999.,  393699.,  730899.])