In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import io
from google.colab import files

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

In [None]:
# Read the car listings CSV into a DataFrame and preview its first rows.
csv_path = '/car.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [None]:
# Discard the 'name' column; every other column is kept unchanged.
df = df.drop(columns=['name'])
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [None]:
# Dummy variables: one-hot encode the categorical columns.
# drop_first=True leaves out the first level of each category.
categorical_cols = ['fuel', 'seller_type', 'transmission', 'owner']
numeric_cols = ['year', 'selling_price', 'km_driven']

df_cat = pd.get_dummies(df[categorical_cols], drop_first=True)

# Re-attach the encoded columns to the numeric ones (aligned on the row index).
df = df[numeric_cols].join(df_cat)
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,60000,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,135000,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,600000,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,250000,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,450000,141000,1,0,0,0,1,0,1,0,1,0,0


In [None]:
# Split the frame into the dependent variable and the independent variables.
# y - selling price

target_col = 'selling_price'
X = df.drop(columns=[target_col])
y = df[target_col]

In [None]:
# Build 2 models:
#   - a decision tree
#   - a linear regression

from sklearn.linear_model import LinearRegression
from sklearn import tree

# Linear regression: fitted on the full data set and used to predict that same data.
reg_model = LinearRegression().fit(X, y)
reg_pred = reg_model.predict(X)

# Decision tree: same procedure, with a fixed seed and a minimum of
# 400 samples required before a node may be split.
tree_model = tree.DecisionTreeRegressor(
    min_samples_split=400,
    random_state=111,
).fit(X, y)
tree_pred = tree_model.predict(X)

In [None]:
# Check how well the models perform using the following metrics.
# R2

reg_r2 = round(r2_score(y, reg_pred), 2)
tree_r2 = round(r2_score(y, tree_pred), 2)

print(f'Linear Regression R2 score: {reg_r2!s}')
print(f'Decision Tree R2 score: {tree_r2!s}')

Linear Regression R2 score: 0.46
Decision Tree R2 score: 0.5


In [None]:
# MAE (mean absolute error), rounded to two decimals for display.
reg_mae = round(mae(y, reg_pred), 2)
tree_mae = round(mae(y, tree_pred), 2)

print(f'Linear Regression MAE score: {reg_mae!s}')
print(f'Decision Tree MAE score: {tree_mae!s}')

Linear Regression MAE score: 229254.77
Decision Tree MAE score: 198304.84


In [None]:
# RMSE (root mean squared error).
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) gives the same value on every version.
reg_rmse = float(np.sqrt(mse(y, reg_pred)))
tree_rmse = float(np.sqrt(mse(y, tree_pred)))

print('Linear Regression RMSE score: '+str(round(reg_rmse,2)))
print('Decision Tree RMSE score: '+str(round(tree_rmse,2)))

Linear Regression RMSE score: 425388.24
Decision Tree RMSE score: 410557.26


In [None]:
# Check how well the (already fitted) models perform on petrol-powered cars only.

# Build the row filter once from the 'fuel_Petrol' dummy column and reuse it
# for both the target and the feature matrix.
petrol_mask = df['fuel_Petrol'] == 1
y_petrol = df.loc[petrol_mask, 'selling_price']
X_petrol = df.loc[petrol_mask].drop('selling_price', axis=1)

reg_pred_petrol = reg_model.predict(X_petrol)
tree_pred_petrol = tree_model.predict(X_petrol)

print('Linear Regression R2 score: '+ str(round(r2_score(y_petrol, reg_pred_petrol),2)))
print('Decision Tree R2 score: '+ str(round(r2_score(y_petrol, tree_pred_petrol),2)))
print('')
print('Linear Regression MAE score: '+ str(round(mae(y_petrol, reg_pred_petrol),2)))
print('Decision Tree MAE score: '+ str(round(mae(y_petrol, tree_pred_petrol),2)))
print('')
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
reg_rmse_petrol = float(np.sqrt(mse(y_petrol, reg_pred_petrol)))
tree_rmse_petrol = float(np.sqrt(mse(y_petrol, tree_pred_petrol)))
print('Linear Regression RMSE score: '+ str(round(reg_rmse_petrol,2)))
print('Decision Tree RMSE score: '+ str(round(tree_rmse_petrol,2)))

Linear Regression R2 score: 0.11
Decision Tree R2 score: 0.3

Linear Regression MAE score: 188267.63
Decision Tree MAE score: 134524.7

Linear Regression RMSE score: 343680.95
Decision Tree RMSE score: 303619.18


In [None]:
# Split the data set into a training part and a test part.
from sklearn.model_selection import train_test_split

# 20% of the rows are held out for testing; random_state=111 makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=111,
    test_size=0.2,
)

In [None]:
# Choose the decision-tree hyperparameters using the training set only.

from sklearn.model_selection import GridSearchCV 

# Seed the estimator: a decision tree permutes the features at every split, so
# without a fixed random_state ties between equally good splits can be broken
# differently from run to run and the selected parameters may change.
model=DecisionTreeRegressor(random_state=111)

# Exhaustive search over tree depth (1..19) and min_samples_split (50..90, step 10),
# scored by negated mean absolute error with 10-fold cross-validation.
gs=GridSearchCV(model,
                param_grid={'max_depth':range(1,20),
                            'min_samples_split':range(50,100,10)},
                cv=10,
                scoring='neg_mean_absolute_error')
gs.fit(X_train,y_train)

In [None]:
# Show the hyperparameter combination stored in the fitted grid search's best_params_.
print(gs.best_params_)

{'max_depth': 7, 'min_samples_split': 50}


In [None]:
# Retrain the model with the selected hyperparameters.

# Take the hyperparameters directly from the fitted grid search instead of
# copying them by hand from its printed output, so this cell stays in sync
# if the grid or the data changes.
tree_model=tree.DecisionTreeRegressor(random_state=111,**gs.best_params_)
tree_model.fit(X_train,y_train)

# Score on the held-out test set; the MAE is the cell's displayed value.
tree_pred=tree_model.predict(X_test)
mae(y_test,tree_pred)

172579.00489160747

In [None]:
# Check the results on the test set