In [1]:
#Import Required Libraries
import pandas as pd
import scipy as sc
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
#Load the Dataset
data = pd.read_csv('car.csv')
data.head()

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [3]:
#Add the Column names
Column_names = ['Buying','Maint','Doors','Persons','Lug_boot','Safety','Class']
data = pd.read_csv('car.csv',names=Column_names)

In [4]:
data.head(10)

Unnamed: 0,Buying,Maint,Doors,Persons,Lug_boot,Safety,Class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [5]:
#Check the Datatypes for each columns in the dataset.
print(data.dtypes)

Buying      object
Maint       object
Doors       object
Persons     object
Lug_boot    object
Safety      object
Class       object
dtype: object


In [6]:
#Identify the size of the dataset.
data.shape, data.columns

((1728, 7),
 Index(['Buying', 'Maint', 'Doors', 'Persons', 'Lug_boot', 'Safety', 'Class'], dtype='object'))

In [7]:
# Select feature Variable
X = data.iloc[:,:-1]
X

Unnamed: 0,Buying,Maint,Doors,Persons,Lug_boot,Safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med
...,...,...,...,...,...,...
1723,low,low,5more,more,med,med
1724,low,low,5more,more,med,high
1725,low,low,5more,more,big,low
1726,low,low,5more,more,big,med


In [8]:
#select target variable (Class)
y=data.iloc[:,-1:]
y

Unnamed: 0,Class
0,unacc
1,unacc
2,unacc
3,unacc
4,unacc
...,...
1723,good
1724,vgood
1725,unacc
1726,good


In [9]:
#Split the data into training set and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [10]:
#Check the Shape of training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1382, 6), (346, 6), (1382, 1), (346, 1))

In [11]:
X_train

Unnamed: 0,Buying,Maint,Doors,Persons,Lug_boot,Safety
1579,low,med,4,4,med,med
634,high,high,5more,4,med,med
299,vhigh,med,5more,2,small,high
1085,med,med,2,2,med,high
1659,low,low,3,4,med,low
...,...,...,...,...,...,...
715,high,med,4,4,med,med
905,med,vhigh,3,4,med,high
1096,med,med,2,4,big,med
235,vhigh,med,2,more,small,med


In [12]:
#Encode the Feature variables to convert from Object type to Float type
enc = OrdinalEncoder()
X_train = enc.fit_transform(X_train)

In [13]:
X_test = enc.fit_transform(X_test)

In [14]:
X_train, X_test

(array([[1., 2., 2., 1., 1., 2.],
        [0., 0., 3., 1., 1., 2.],
        [3., 2., 3., 0., 2., 0.],
        ...,
        [2., 2., 0., 1., 0., 2.],
        [3., 2., 0., 2., 2., 2.],
        [2., 0., 3., 0., 0., 0.]]), array([[2., 1., 1., 2., 2., 1.],
        [0., 0., 1., 2., 0., 2.],
        [0., 0., 3., 0., 1., 2.],
        ...,
        [3., 1., 2., 0., 0., 0.],
        [3., 3., 1., 2., 1., 2.],
        [3., 3., 3., 1., 1., 2.]]))

In [15]:
y_train, y_test

(      Class
 1579   good
 634     acc
 299   unacc
 1085  unacc
 1659  unacc
 ...     ...
 715     acc
 905     acc
 1096    acc
 235   unacc
 1061  unacc
 
 [1382 rows x 1 columns],       Class
 1233  unacc
 592     acc
 625   unacc
 1546  unacc
 730   unacc
 ...     ...
 1433  unacc
 1238  vgood
 386   unacc
 49    unacc
 94    unacc
 
 [346 rows x 1 columns])

In [16]:
#Test Different algorithms to identify which Classification algorithm best suits this porblem.
# Use Stratified KFold cross validation technique.
#CHeck the Accuracy for each algorithm.
from sklearn.model_selection import StratifiedKFold
models = []
models.append(('CART', DecisionTreeClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('NB', GaussianNB()))

results = []
names = []

for name, model in models:
    kfolds = StratifiedKFold(n_splits = 10, random_state=1, shuffle=True)
    val_result = cross_val_score(model, X_train, y_train, cv=kfolds, scoring='accuracy')
    results.append(val_result)
    names.append(name)
    print('%s: %f (%f)' % (name, val_result.mean(), val_result.std()))

CART: 0.976857 (0.018245)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


KNN: 0.905968 (0.020691)
NB: 0.623083 (0.028079)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [17]:
#It is clear that Decision Tree is the best Algorithm for this problem as it has highest accuracy.
for name,result in zip(names, results):
    print('%s: %f (%f)' % (name, result.mean(), result.std()))

CART: 0.976857 (0.018245)
KNN: 0.905968 (0.020691)
NB: 0.623083 (0.028079)


In [18]:
#Select the Decision Tree Algorithm as it gives best accuracy.
#Evaluate it with different parameter values to ensure that it do not overfits.
CART_model = DecisionTreeClassifier(max_depth=10, random_state=0)
CART_model.fit(X_train,y_train)
predictions = CART_model.predict(X_test)

In [19]:
#Check the Accuracy
print(accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

0.9624277456647399
[[ 57   0   2   0]
 [  1  12   1   0]
 [  7   0 253   0]
 [  1   1   0  11]]
              precision    recall  f1-score   support

         acc       0.86      0.97      0.91        59
        good       0.92      0.86      0.89        14
       unacc       0.99      0.97      0.98       260
       vgood       1.00      0.85      0.92        13

    accuracy                           0.96       346
   macro avg       0.94      0.91      0.92       346
weighted avg       0.96      0.96      0.96       346



In [20]:
#Check the Accuracy score for training set and test set to identify if it Overfits.
#As both the score quite comparable, the Model do not overfits.
#Hence, We can use this model for predictions.
print('Training set score: {:.4f}'.format(CART_model.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(CART_model.score(X_test, y_test)))

Training set score: 0.9877
Test set score: 0.9624
