In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the raw file without treating its first row as a header, then attach
# the column labels by hand.
column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df = pd.read_csv("car.data", header=None)
df.columns = column_names
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Load the file in a single call: `names=` supplies the column labels, so the
# first row of the file is read as data rather than as a header.
cols = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety', 'class',
]
df = pd.read_csv("car.data", names=cols)
df.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [4]:
# Print the frame's structure: number of rows, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [5]:
# List the distinct raw values that occur in the 'persons' column.
df['persons'].unique()

array(['2', '4', 'more'], dtype=object)

In [6]:
from sklearn.preprocessing import LabelEncoder

### Note: `LabelEncoder` encodes only one column at a time, so it has to be applied column by column.

In [7]:
# Label-encode every column of a copy of the raw frame.
#
# LabelEncoder handles a single column per fit, so each column gets its own
# encoder instance. The fitted encoders are kept in `label_encoders` (keyed by
# column name): refitting one shared instance would only retain the mapping of
# the last column, making the other columns impossible to inverse-transform.
#
# Note: the integer codes follow the sorted order of the labels (e.g. 'high'
# sorts before 'low'), not the domain order of the categories.
le_df = df.copy()
label_encoders = {}
for col in le_df.columns:
    le = LabelEncoder()
    le_df[col] = le.fit_transform(le_df[col])
    label_encoders[col] = le
# After the loop `le` is still the encoder fitted on the last column, as before.
le_df.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2
5,3,3,0,0,1,0,2
6,3,3,0,0,0,1,2
7,3,3,0,0,0,2,2
8,3,3,0,0,0,0,2
9,3,3,0,1,2,1,2


In [8]:
# List the distinct raw values that occur in the 'lug_boot' column.
df['lug_boot'].unique()

array(['small', 'med', 'big'], dtype=object)

In [9]:
from sklearn.preprocessing import OrdinalEncoder

In [10]:
# Explicit low-to-high ordering of the values of each column. The position of
# a value in its list is the code it receives when this is passed to
# OrdinalEncoder(categories=...).
category_order = {
    'buying':   ['low', 'med', 'high', 'vhigh'],
    'maint':    ['low', 'med', 'high', 'vhigh'],
    'doors':    ['2', '3', '4', '5more'],
    'persons':  ['2', '4', 'more'],
    'lug_boot': ['small', 'med', 'big'],
    'safety':   ['low', 'med', 'high'],
    'class':    ['unacc', 'acc', 'good', 'vgood'],
}

# The encoder expects one list per column, in the same order as the columns
# of the frame it is fitted on; dicts preserve insertion order.
categories = list(category_order.values())

In [11]:
# Ordinal-encode every column of the raw frame (the target 'class' included)
# using the explicit per-column orderings in `categories`, so each code is the
# position of the value in its ordering.
oe = OrdinalEncoder(categories=categories)
encoded_values = oe.fit_transform(df)

# Rebuild a DataFrame around the encoded values. Labels and index are taken
# from `df` itself rather than retyped by hand, so they cannot drift from the
# columns the encoder was actually fitted on.
oe_df = pd.DataFrame(encoded_values, columns=df.columns, index=df.index)

oe_df.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3.0,3.0,0.0,0.0,0.0,0.0,0.0
1,3.0,3.0,0.0,0.0,0.0,1.0,0.0
2,3.0,3.0,0.0,0.0,0.0,2.0,0.0
3,3.0,3.0,0.0,0.0,1.0,0.0,0.0
4,3.0,3.0,0.0,0.0,1.0,1.0,0.0
5,3.0,3.0,0.0,0.0,1.0,2.0,0.0
6,3.0,3.0,0.0,0.0,2.0,0.0,0.0
7,3.0,3.0,0.0,0.0,2.0,1.0,0.0
8,3.0,3.0,0.0,0.0,2.0,2.0,0.0
9,3.0,3.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# Features are every ordinal-encoded column except 'class'; 'class' is the target.
X = oe_df.drop(columns='class').to_numpy()
y = oe_df['class'].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [13]:
# Fit a 5-nearest-neighbours classifier on the current training split and
# print its accuracy on the held-out split.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
acc = model.score(X_test, y_test)
print(acc)

0.9393063583815029


In [14]:
# Same split as for the ordinal encoding, but on the label-encoded frame:
# features are every column except 'class'; 'class' is the target.
# Note that this rebinds X, y and the train/test variables.
X = le_df.drop(columns='class').to_numpy()
y = le_df['class'].to_numpy()

# Identical test fraction and seed, so both encodings are scored on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [15]:
# Fit a fresh 5-nearest-neighbours classifier on the current training split
# and print its accuracy on the held-out split. This rebinds `model` and `acc`.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
acc = model.score(X_test, y_test)
print(acc)

0.8988439306358381
