### Data Preprocessing

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('All cars data.csv')

In [3]:
df.shape

(105251, 10)

In [4]:
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,company name
0,A1,2017,12500.0,Manual,15735.0,Petrol,150.0,55.4,1.4,audi
1,A6,2016,16500.0,Automatic,36203.0,Diesel,20.0,64.2,2.0,audi
2,A1,2016,11000.0,Manual,29946.0,Petrol,30.0,55.4,1.4,audi
3,A4,2017,16800.0,Automatic,25952.0,Diesel,145.0,67.3,2.0,audi
4,A3,2019,17300.0,Manual,1998.0,Petrol,145.0,49.6,1.0,audi


In [5]:
print(df['company name'].unique())

['audi' 'bmw' 'cclass' 'focus' 'ford' 'hyundi' 'merc' 'skoda' 'toyota'
 'vauxhall' 'vw']


In [6]:
df.transmission.unique()

array(['Manual', 'Automatic', 'Semi-Auto'], dtype=object)

In [7]:
df.fuelType.unique()

array(['Petrol', 'Diesel', 'Hybrid'], dtype=object)

In [8]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

df.model=le.fit_transform(df.model)
mapping_model = dict(zip(le.classes_, range(len(le.classes_))))

df['company name']=le.fit_transform(df['company name'])
mapping_companyName = dict(zip(le.classes_, range(len(le.classes_))))

df.fuelType=le.fit_transform(df.fuelType)
mapping_fuelType = dict(zip(le.classes_, range(len(le.classes_))))

In [24]:
print(mapping_model)
print('\n')
print(mapping_companyName)
print('\n')
print(mapping_fuelType)
print('\n')

{'1 Series': 0, '180': 1, '2 Series': 2, '200': 3, '220': 4, '3 Series': 5, '4 Series': 6, '5 Series': 7, '6 Series': 8, '7 Series': 9, 'A Class': 10, 'A1': 11, 'A3': 12, 'A4': 13, 'A5': 14, 'A6': 15, 'A7': 16, 'A8': 17, 'Adam': 18, 'Agila': 19, 'Amarok': 20, 'Antara': 21, 'Arteon': 22, 'Astra': 23, 'Auris': 24, 'Avensis': 25, 'Aygo': 26, 'B Class': 27, 'B-MAX': 28, 'Beetle': 29, 'C Class': 30, 'C-HR': 31, 'C-MAX': 32, 'CC': 33, 'CL Class': 34, 'CLA Class': 35, 'CLS Class': 36, 'Caddy': 37, 'Caddy Life': 38, 'Caddy Maxi': 39, 'Caddy Maxi Life': 40, 'Camry': 41, 'Caravelle': 42, 'Cascada': 43, 'Citigo': 44, 'Combo Life': 45, 'Corolla': 46, 'Corsa': 47, 'Crossland X': 48, 'E Class': 49, 'EcoSport': 50, 'Edge': 51, 'Eos': 52, 'Fabia': 53, 'Fiesta': 54, 'Focus': 55, 'GL Class': 56, 'GLA Class': 57, 'GLB Class': 58, 'GLC Class': 59, 'GLE Class': 60, 'GLS Class': 61, 'GT86': 62, 'GTC': 63, 'Galaxy': 64, 'Golf': 65, 'Golf SV': 66, 'Grand C-MAX': 67, 'Grand Tourneo Connect': 68, 'Grandland X':

In [10]:
# renaming transmission values

transmition_map = {'Manual':1, 'Semi-Auto':2, 'Automatic':3}

df['transmission'] = df['transmission'].map(transmition_map)
df.transmission.unique()

array([1, 3, 2], dtype=int64)

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
cols=['year','mileage','tax','mpg', 'engineSize','model','transmission','fuelType','company name']
df[cols]=scaler.fit_transform(df[cols])
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,company name
0,-1.258274,-0.181791,12500.0,-0.766775,-0.291621,0.836998,0.469565,0.065162,-0.417644,-1.607517
1,-1.160059,-0.78045,16500.0,1.78783,1.009277,-1.217358,-1.512089,1.030445,0.862212,-1.607517
2,-1.258274,-0.78045,11000.0,-0.766775,0.611597,0.836998,-1.359654,0.065162,-0.417644,-1.607517
3,-1.209166,-0.181791,16800.0,1.78783,0.357747,-1.217358,0.393347,1.370488,0.862212,-1.607517
4,-1.23372,1.015526,17300.0,-0.766775,-1.164713,0.836998,0.393347,-0.571048,-1.270881,-1.607517


In [12]:
from sklearn.model_selection import train_test_split

x = df.drop('price', axis=1)
y = df[['price']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.15)
x_train.shape

(89463, 9)

## Modeling 

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import r2_score

In [14]:
models = {'LR' : LinearRegression(), 
          'DT' : DecisionTreeRegressor(), 
          'RF' : RandomForestRegressor(), 
          'XGB' : XGBRegressor(), 
          'KNN' : KNeighborsRegressor()}

In [15]:
for name, model in models.items() :
    print(name)
    print('-'*10)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    ## score 
    print('Train Score', r2_score(y_train, model.predict(x_train)))
    print('Test Score', r2_score(y_test, y_pred))
    print('='*50)

LR
----------
Train Score 0.7630044064232121
Test Score 0.7636123894212299
DT
----------
Train Score 0.9992258787849159
Test Score 0.9348811635974065
RF
----------


  model.fit(x_train, y_train)


Train Score 0.9936067254949833
Test Score 0.959258056751898
XGB
----------
Train Score 0.9579389091072523
Test Score 0.9523785771342558
KNN
----------
Train Score 0.960959425539658
Test Score 0.9413450074460541


## parameter tuning

### I made it before on cloud but it took too long so i will run this code again

In [16]:
# RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
#                    param_distributions={'max_depth': [100, 50, 20, 10],
#                                         'min_samples_leaf': [2, 5, 7, 11],
#                                         'n_estimators': [600, 400, 250]},

# {'n_estimators': 250, 'min_samples_leaf': 2, 'max_depth': 100}

# 0.9857227615285351
# 0.9559854036971698
# #####################################################################
# RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
#  			param_distributions={'max_depth': [5, 6, 8, 10],
#  			'n_estimators': [100, 200, 250]}, scoring='r2')

# {'n_estimators': 250, 'max_depth': 10}
# #######################################################################3
# RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
#                    param_distributions={'max_depth': [100, 200, 250],
#                                         'min_samples_leaf': [2, 4],
#                                         'n_estimators': [260, 240, 250]},

# {'n_estimators': 260, 'min_samples_leaf': 2, 'max_depth': 200}

# 0.9857151681468798
# 0.9558541509235499
# ###########################################################################3
# model = RandomForestRegressor(n_estimators=260, max_depth= 170, min_samples_leaf=2)
# model.fit(x_train, y_train)
# r2_score(y_train, model.predict(x_train))

# 0.9857383090011874
# 0.9559186526267716

### Random forest

In [17]:
model = RandomForestRegressor(n_estimators=250, max_depth= 100, min_samples_leaf=2)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
r2_score(y_train, model.predict(x_train))

  model.fit(x_train, y_train)


0.9859287697778796

In [18]:
r2_score(y_test, y_pred)

0.9588149799543981

## Exporting model & scaler

In [19]:
import joblib 

In [20]:
joblib.dump(model, 'model.h5') 

['model.h5']

In [21]:
joblib.dump(scaler, 'scaler.h5') 

['scaler.h5']

In [22]:
x.columns

Index(['model', 'year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg',
       'engineSize', 'company name'],
      dtype='object')