In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [2]:
# loading the dataset

data = pd.read_csv('Data/drug200.csv')
data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [5]:
data.shape

(200, 6)

In [3]:
for i in ['Sex', 'BP', 'Cholesterol', 'Drug']:
    print(data[i].unique())

['F' 'M']
['HIGH' 'LOW' 'NORMAL']
['HIGH' 'NORMAL']
['DrugY' 'drugC' 'drugX' 'drugA' 'drugB']


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [28]:
# spliting the dataset into independent and dependent features

X, y = data.drop('Drug', axis=1), data['Drug']
print(X.shape, y.shape)

(200, 5) (200,)


In [29]:
# encoding the categorical values

ct = ColumnTransformer([('cat', OneHotEncoder(), ['Sex', 'BP', 'Cholesterol'])], remainder='passthrough')
X = ct.fit_transform(X)

In [31]:
# avoiding the dummy variable trap

for i in [6,4,1]:
    X = np.delete(X, i, 1)

In [32]:
X[:2]

array([[ 1.   ,  1.   ,  0.   ,  1.   , 23.   , 25.355],
       [ 0.   ,  0.   ,  1.   ,  1.   , 47.   , 13.093]])

In [21]:
y = LabelEncoder().fit_transform(y)
y[:10]

array([0, 3, 3, 4, 0, 4, 0, 3, 0, 0])

In [50]:
# splitting the data into training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(y_train.shape, y_test.shape)

(160,) (40,)


In [44]:
# applying the data to the model

clf = DecisionTreeClassifier(max_depth=5).fit(X_train, y_train)
print('Training Score: {}'.format(clf.score(X_train, y_train)))
print('Testing Score: {}'.format(clf.score(X_test, y_test)))

Training Score: 1.0
Testing Score: 1.0


In [47]:
# model evaluation

print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

       DrugY       1.00      1.00      1.00        19
       drugA       1.00      1.00      1.00         3
       drugB       1.00      1.00      1.00         1
       drugC       1.00      1.00      1.00         3
       drugX       1.00      1.00      1.00        14

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

