# Классификационное дерево
Для примера будем использовать датасет, предсказывающий класс автомобиля по его характеристикам

In [1]:
import pandas as pd

df = pd.read_csv('./car_evaluation.csv')

col_names = ['Buying price', 'Maintenance cost', 'Doors', 'Persons', 'lug_boot', 'Safety', 'Class']
df.columns = col_names
df

Unnamed: 0,Buying price,Maintenance cost,Doors,Persons,lug_boot,Safety,Class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Buying price      1727 non-null   object
 1   Maintenance cost  1727 non-null   object
 2   Doors             1727 non-null   object
 3   Persons           1727 non-null   object
 4   lug_boot          1727 non-null   object
 5   Safety            1727 non-null   object
 6   Class             1727 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [3]:
df.describe(include=['O'])

Unnamed: 0,Buying price,Maintenance cost,Doors,Persons,lug_boot,Safety,Class
count,1727,1727,1727,1727,1727,1727,1727
unique,4,4,4,3,3,3,4
top,high,high,3,4,med,med,unacc
freq,432,432,432,576,576,576,1209


#### Нулей нет, кол-во классов каждого признака, кроме Class примерно поровну

##  Закодируем классы

In [4]:
for col in col_names:
    
    print(df[col].value_counts())   

high     432
med      432
low      432
vhigh    431
Name: Buying price, dtype: int64
high     432
med      432
low      432
vhigh    431
Name: Maintenance cost, dtype: int64
3        432
4        432
5more    432
2        431
Name: Doors, dtype: int64
4       576
more    576
2       575
Name: Persons, dtype: int64
med      576
big      576
small    575
Name: lug_boot, dtype: int64
med     576
high    576
low     575
Name: Safety, dtype: int64
unacc    1209
acc       384
good       69
vgood      65
Name: Class, dtype: int64


In [5]:
df = df.replace(['low', 'med', 'high', 'vhigh'], [0, 1, 2, 3])
df = df.replace(['5more', 'more'], [5, 5])
df = df.replace(['small', 'big'], [0, 2])
df

Unnamed: 0,Buying price,Maintenance cost,Doors,Persons,lug_boot,Safety,Class
0,3,3,2,2,0,1,unacc
1,3,3,2,2,0,2,unacc
2,3,3,2,2,1,0,unacc
3,3,3,2,2,1,1,unacc
4,3,3,2,2,1,2,unacc
...,...,...,...,...,...,...,...
1722,0,0,5,5,1,1,good
1723,0,0,5,5,1,2,vgood
1724,0,0,5,5,2,0,unacc
1725,0,0,5,5,2,1,good


In [6]:
from sklearn.model_selection import train_test_split

X = df.drop(['Class'], axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape, X_test.shape

((1295, 6), (432, 6))

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

tree = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=0)


tree.fit(X_train, y_train)
accuracy_score(y_test, tree.predict(X_test))

0.8217592592592593