# Decision Tree
Predicting price of the cars

# Reading the dataset

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

import pandas as pd
A = pd.read_csv("C:/Users/Shruti/Downloads/Cars93.csv")

In [2]:
A.columns=['id', 'Manufacturer', 'Model', 'Type', 'MinPrice', 'Price',
       'MaxPrice', 'MPGcity', 'MPGhighway', 'AirBags', 'DriveTrain',
       'Cylinders', 'EngineSize', 'Horsepower', 'RPM', 'Revpermile',
       'Mantransavail', 'Fueltankcapacity', 'Passengers', 'Length',
       'Wheelbase', 'Width', 'Turncircle', 'Rearseatroom', 'Luggageroom',
       'Weight', 'Origin', 'Make']

# Missing Data Treatment

In [3]:
from myFuctions import replacer
replacer(A)

# Dropping Unwanted Columns

In [4]:
A = A.drop(labels=["id","Make","Model"],axis=1)

# Defining X and Y

In [5]:
X = A.drop(labels=["Price","MinPrice","MaxPrice"],axis=1)
Y = A[["Price"]]

# Feature selection

In [6]:
def ANOVA(df,cat,con):
    from pandas import DataFrame
    from statsmodels.api import OLS
    from statsmodels.formula.api import ols
    rel = con + " ~ " + cat
    model = ols(rel,df).fit()
    from statsmodels.stats.anova import anova_lm
    anova_results = anova_lm(model)
    Q = DataFrame(anova_results)
    a = Q['PR(>F)'][cat]
    return round(a,3)

In [7]:
A.corr()[["Price"]].sort_values(by="Price")

Unnamed: 0,Price
MPGcity,-0.594562
MPGhighway,-0.56068
Revpermile,-0.426395
RPM,-0.004955
Passengers,0.05786
Rearseatroom,0.301888
Luggageroom,0.354635
Turncircle,0.39259
Width,0.456028
Wheelbase,0.500864


In [8]:
imp_cols = []
for i in X.columns:
    if(X[i].dtypes=="object"):
        x = ANOVA(A,i,"Price")
        if(x < 0.05):
            imp_cols.append(i)

In [9]:
imp_cols.append("Horsepower")

# Preprocessing

In [10]:
from myFuctions import preprocessing
Xnew = preprocessing(X)

# Split the Data in training and testing set

In [11]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

# Create Tree based model

In [12]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=21)
model = dtr.fit(xtrain,ytrain)
pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import mean_absolute_error
ts_err = mean_absolute_error(ytest,pred_ts)
tr_err = mean_absolute_error(ytrain,pred_tr)

# Plot the tree

In [None]:
from sklearn.tree import export_graphviz
export_graphviz(dtr,out_file="D:dataScience/notes/dtr.out")
#a notes file will be created in the given location open the file and copy the content from the file paste it in the link below and see the grah
#http://webgraphviz.com/

In [14]:
tr_err

0.0

In [15]:
ts_err

6.647368421052631

# Pruning Tree

In [16]:
for i in range(2,10,1):
    from sklearn.tree import DecisionTreeRegressor
    dtr = DecisionTreeRegressor(random_state=21,max_depth=i)
    model = dtr.fit(xtrain,ytrain)
    pred_tr = model.predict(xtrain)
    pred_ts = model.predict(xtest)

    from sklearn.metrics import mean_absolute_error
    ts_err = mean_absolute_error(ytest,pred_ts)
    ts_err = round(ts_err,3)
    tr_err = mean_absolute_error(ytrain,pred_tr)
    tr_err = round(tr_err,3)
    print(i,tr_err,ts_err)

2 3.687 3.844
3 2.74 4.999
4 1.835 4.645
5 1.244 4.919
6 0.756 4.754
7 0.505 4.913
8 0.213 5.881
9 0.048 5.972


In [17]:
for i in range(2,10,1):
    from sklearn.tree import DecisionTreeRegressor
    dtr = DecisionTreeRegressor(random_state=21,min_samples_leaf=i)
    model = dtr.fit(xtrain,ytrain)
    pred_tr = model.predict(xtrain)
    pred_ts = model.predict(xtest)

    from sklearn.metrics import mean_absolute_error
    ts_err = mean_absolute_error(ytest,pred_ts)
    ts_err = round(ts_err,3)
    tr_err = mean_absolute_error(ytrain,pred_tr)
    tr_err = round(tr_err,3)
    print(i,tr_err,ts_err)

2 0.929 6.067
3 1.99 5.421
4 2.239 5.072
5 2.517 5.044
6 2.633 4.34
7 2.656 4.352
8 2.718 4.385
9 3.638 3.635


# we can also use GridSearch CV to get the best values for pruning

In [18]:
from sklearn.model_selection import GridSearchCV
tp = {"min_samples_split":range(2,20,1)}
dtr = DecisionTreeRegressor(random_state=21)

cv = GridSearchCV(dtr,tp,scoring="neg_mean_absolute_error",cv=4)
cvmodel = cv.fit(Xnew,Y)
cvmodel.best_params_

{'min_samples_split': 17}

In [19]:
dtr = DecisionTreeRegressor(random_state=21,min_samples_split=17)
model = dtr.fit(xtrain,ytrain)

In [22]:
 ytest["predicted"] = model.predict(xtest)

In [23]:
ytest

Unnamed: 0,Price,predicted
23,11.3,11.32
86,22.7,20.7875
91,22.7,15.7875
21,29.5,34.3
17,18.8,34.3
82,8.6,8.371429
34,14.0,11.32
27,25.8,31.575
10,40.1,31.575
40,19.8,16.5125
