In [1]:
import pandas as pd

# Reading raw data 

In [2]:
with open("data/car.c45-names") as f:
    print(f.read())

| names file (C4.5 format) for car evaluation domain

| class values

unacc, acc, good, vgood

| attributes

buying:   vhigh, high, med, low.
maint:    vhigh, high, med, low.
doors:    2, 3, 4, 5more.
persons:  2, 4, more.
lug_boot: small, med, big.
safety:   low, med, high.



In [3]:
with open("data/car.names") as f:
    print(f.read())

1. Title: Car Evaluation Database

2. Sources:
   (a) Creator: Marko Bohanec
   (b) Donors: Marko Bohanec   (marko.bohanec@ijs.si)
               Blaz Zupan      (blaz.zupan@ijs.si)
   (c) Date: June, 1997

3. Past Usage:

   The hierarchical decision model, from which this dataset is
   derived, was first presented in 

   M. Bohanec and V. Rajkovic: Knowledge acquisition and explanation for
   multi-attribute decision making. In 8th Intl Workshop on Expert
   Systems and their Applications, Avignon, France. pages 59-78, 1988.

   Within machine-learning, this dataset was used for the evaluation
   of HINT (Hierarchy INduction Tool), which was proved to be able to
   completely reconstruct the original hierarchical model. This,
   together with a comparison with C4.5, is presented in

   B. Zupan, M. Bohanec, I. Bratko, J. Demsar: Machine learning by
   function decomposition. ICML-97, Nashville, TN. 1997 (to appear)

4. Relevant Information Paragraph:

   Car Evaluation Database was 

- buying       v-high, high, med, low
- maint        v-high, high, med, low
- doors        2, 3, 4, 5-more
- persons      2, 4, more
- lug_boot     small, med, big
- safety       low, med, high

# Preparing Data

In [4]:
col_names = ['buying','maint','doors','persons','lug_boot','safety','class_value']

df = pd.read_csv("data/car.data", names=col_names)
df=df.drop('persons', axis=1) # drop since this is not used
df

Unnamed: 0,buying,maint,doors,lug_boot,safety,class_value
0,vhigh,vhigh,2,small,low,unacc
1,vhigh,vhigh,2,small,med,unacc
2,vhigh,vhigh,2,small,high,unacc
3,vhigh,vhigh,2,med,low,unacc
4,vhigh,vhigh,2,med,med,unacc
...,...,...,...,...,...,...
1723,low,low,5more,med,med,good
1724,low,low,5more,med,high,vgood
1725,low,low,5more,big,low,unacc
1726,low,low,5more,big,med,good


In [5]:
# balancing df since class_value label is very imbalanced as highlighted in cars.names

balanced_df = pd.DataFrame()

for value in df.class_value.unique():
    balanced_df = pd.concat([balanced_df, df[df['class_value']==value].sample(65)])
    
df = balanced_df.copy()

In [6]:
# encoding to catagorize variables

df['doors'] = df['doors'].apply(lambda x: 5 if x == "5-more" else x)

def score_to_integers(x):
    if x in ["vhigh", "vgood", "5more"]:
        return 4
    elif x in ["high", "big", "good", "more"]:
        return 3
    elif x in ["med", "acc"]:
        return 2
    elif x in ["low", "small", "unacc"]:
        return 1
    else:
        return x
    
for col in df.columns:
    df[col] = df[col].apply(score_to_integers)

df

Unnamed: 0,buying,maint,doors,lug_boot,safety,class_value
1000,2,3,3,1,2,1
972,2,3,2,1,1,1
108,4,3,2,1,1,1
1412,1,3,2,3,3,1
748,3,2,4,1,2,1
...,...,...,...,...,...,...
1523,1,2,2,1,3,3
1609,1,2,4,3,2,3
1226,2,1,3,1,3,3
1669,1,1,3,2,2,3


# Build Model 

In [7]:
from sklearn.model_selection import train_test_split
X=df[['maint','doors','lug_boot','safety','class_value']]
y=df['buying']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
predict_set = pd.Series(['high',4,'big','high','good']).apply(score_to_integers)
list(predict_set)

[3, 4, 3, 3, 3]

In [9]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_leaf_nodes=5, random_state=0).fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))
print("Prediction", clf.predict([predict_set])[0])

Accuracy of Decision Tree classifier on training set: 0.55
Accuracy of Decision Tree classifier on test set: 0.52
Prediction 1


In [10]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(logreg.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(logreg.score(X_test, y_test)))
print("Prediction", clf.predict([predict_set])[0])

Accuracy of Logistic regression classifier on training set: 0.48
Accuracy of Logistic regression classifier on test set: 0.50
Prediction 1


prediction of 1 here refers to "low" based on the encoding defined earlier