# Decision Tree

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree, metrics
from sklearn.model_selection import train_test_split
import time

## Read in the data

In [2]:
# Column labels applied to the CSV; the last one ('class') is the target.
column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# skiprows=[0] drops the file's first line (presumably its own header row --
# confirm against the CSV) so that the labels above are used instead.
df = pd.read_csv(
    'cardata_cleaned.csv',
    names=column_names,
    skiprows=[0],
)
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3,3,2,2,0,0,0
1,3,3,2,2,0,1,0
2,3,3,2,2,0,2,0
3,3,3,2,2,1,0,0
4,3,3,2,2,1,1,0


In [3]:
# Show the dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null int64
maint       1728 non-null int64
doors       1728 non-null int64
persons     1728 non-null int64
lug_boot    1728 non-null int64
safety      1728 non-null int64
class       1728 non-null int64
dtypes: int64(7)
memory usage: 94.6 KB


In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728.0,1728.0,1728.0,1728.0,1728.0,1728.0,1728.0
mean,1.5,1.5,3.5,3.666667,1.0,1.0,0.414931
std,1.118358,1.118358,1.118358,1.24758,0.816733,0.816733,0.7407
min,0.0,0.0,2.0,2.0,0.0,0.0,0.0
25%,0.75,0.75,2.75,2.0,0.0,0.0,0.0
50%,1.5,1.5,3.5,4.0,1.0,1.0,0.0
75%,2.25,2.25,4.25,5.0,2.0,2.0,1.0
max,3.0,3.0,5.0,5.0,2.0,2.0,3.0


## Encode each column as integer category codes

In [5]:
# Re-encode every feature column as integer codes. pd.factorize returns
# (codes, uniques); only the codes are kept, the uniques are discarded into `_`.
# NOTE(review): df.info() reports these columns as int64 already, so this step
# only relabels the values in order of first appearance (e.g. 3 -> 0), which
# discards the original ordinal order -- confirm that this is intended.
feature_columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
for column in feature_columns:
    df[column], _ = pd.factorize(df[column])

In [6]:
# Encode the target column; classNames keeps the unique original labels,
# positioned so that classNames[code] is the label that `code` stands for.
class_codes, classNames = pd.factorize(df['class'])
df['class'] = class_codes

In [7]:
# Preview the first rows after the re-encoding above.
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0
2,0,0,0,0,0,2,0
3,0,0,0,0,1,0,0
4,0,0,0,0,1,1,0


## Split data into features (X) and target (y) for the Decision Tree

In [8]:
# Every column except the last is a feature; the last column is the target.
target_position = df.shape[1] - 1
X = df.iloc[:, :target_position]
y = df.iloc[:, target_position]

## Split data into a 70% training set and a 30% test set

In [9]:
# Hold out 30% of the rows for testing. The seed is pinned so the split is
# identical on every Restart & Run All; without random_state each run draws
# a different split and the reported accuracies are not reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Train the Decision Tree

In [10]:
# Time the fit with perf_counter: it is a monotonic, high-resolution clock,
# whereas time.time() is wall-clock time and can jump if the system clock is
# adjusted -- which matters for a fit that takes only milliseconds.
start_time = time.perf_counter()

# random_state is pinned because scikit-learn permutes the candidate features
# at each split, so ties between equally good splits would otherwise be broken
# differently from run to run.
carEvalTree = tree.DecisionTreeClassifier(
    criterion='gini',
    max_depth=7,
    random_state=42,
)
carEvalTree.fit(X_train, y_train)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 0.003987550735473633 seconds ---


## Find the mean accuracy on the training data and labels

In [11]:
# Accuracy on the rows the tree was fitted on (training accuracy), computed
# explicitly as the fraction of training predictions that match y_train.
metrics.accuracy_score(y_train, carEvalTree.predict(X_train))

0.9478908188585607

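## The greater the max_depth, the higher the mean accuracy on the training data (accuracy on the test data can level off or drop once the tree starts to overfit)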

## Make predictions with data

In [12]:
# Predict an encoded class for every row of the held-out test features.
y_pred = carEvalTree.predict(X=X_test)

## Check accuracy of Decision Tree

In [13]:
# Element-wise mismatch between the true test labels and the predictions;
# summing the boolean mask counts the misclassified rows.
wrong_mask = y_test.ne(y_pred)
misclassified = wrong_mask.sum()

# Fraction of test rows predicted correctly.
acc = metrics.accuracy_score(y_test, y_pred)

print('# of improperly classified samples in Decision Tree: {}'.format(misclassified))
print('Percent accuracy of Decision Tree: {:.2%}'.format(acc))

# of improperly classified samples in Decision Tree: 43
Percent accuracy of Decision Tree: 91.71%
