# Drugs Classifier using Decision Tree

Here the [Drug Classification](https://www.kaggle.com/prathamtripathi/drug-classification) dataset by [Pratham Tripathi](https://www.kaggle.com/prathamtripathi) is used to create a classifier that classifies `drugs` on the basis of their `properties` using a `Decision Tree`.

![](https://media.giphy.com/media/xT8qB2zDVGj7ly4moU/giphy.gif)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score

# Models
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

from joblib import dump

In [None]:
# Load the drug200 CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/drug-classification/drug200.csv')
# Display five randomly sampled rows as a first look at the data.
df.sample(5)

## 🏋️‍♀️ Data preparation

Checking whether the dataset is `balanced` or not.

In [None]:
# Print the number of rows per target class, then show the same counts as a bar chart.
print(df.Drug.value_counts())
sns.countplot(x='Drug', data=df)

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

`Label encoding vs OneHot encoding` 👉 [Source_1](https://towardsdatascience.com/choosing-the-right-encoding-method-label-vs-onehot-encoder-a4434493149b) and [Source_2](https://datascience.stackexchange.com/questions/9443/when-to-use-one-hot-encoding-vs-labelencoder-vs-dictvectorizor)

In [None]:
# Show the first rows with the categorical columns still in their original string form.
df.head()

In [None]:
def data_encoding(df):
    """Label-encode the categorical feature columns of ``df`` in place.

    ``Sex``, ``BP`` and ``Cholesterol`` are each overwritten with the integer
    codes produced by a fresh ``LabelEncoder`` fitted on that column. The
    frame is modified directly and nothing is returned.
    """
    for column in ('Sex', 'BP', 'Cholesterol'):
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])


# data_encoding(df)
# df.head()

The encoding will be applied after `EDA`, so that we can explore the data with its original labels and don't need to map the plots' x & y tick labels back to the original values.

- Sex
    - Female - 0
    - Male - 1
- BP
    - HIGH - 0
    - LOW - 1
    - NORMAL - 2
- Cholesterol
    - HIGH - 0
    - NORMAL - 1
    
Since we are using `DecisionTreeClassifier` algorithm for classification, `LabelEncoding` is ok, otherwise if we are using something else where numbers matter, there we should use `OneHotEncoding`.

## 🍩 Exploratory Data Analysis

In [None]:
''' Helper functions for plotting '''


def plot_histplot(column, ax=None):
    """Draw a seaborn histogram of ``column``, optionally on the axes ``ax``."""
    hist_style = {'color': '#65b87b', 'alpha': .7}
    sns.histplot(x=column, ax=ax, **hist_style)
    
    
def plot_countplot(column, ax=None):
    """Draw a seaborn count plot of ``column`` using the 'rocket' palette.

    The plot is drawn inside the 'ticks' axes style and the spines are
    offset by 6 points via ``sns.despine``. ``ax`` is forwarded to
    ``sns.countplot``.
    """
    rocket = sns.color_palette('rocket')
    with sns.axes_style('ticks'):
        sns.countplot(x=column, palette=rocket, ax=ax)
        sns.despine(offset=6)
        
        
def plot_barplot(x, y, ax=None):
    """Draw a seaborn bar plot of ``y`` against ``x`` using the 'rocket' palette.

    Parameters
    ----------
    x, y : data passed to ``sns.barplot`` as its ``x`` and ``y`` arguments.
    ax : matplotlib axes to draw on; ``None`` lets seaborn use the current axes.
    """
    # Forward `ax` so the plot lands on the requested subplot instead of
    # always being drawn on the current axes.
    sns.barplot(x=x, y=y, palette=sns.color_palette('rocket'), ax=ax)
    
    
def plot_boxplot(x, y, ax=None):
    """Draw a seaborn box plot of ``y`` grouped by ``x``.

    Parameters
    ----------
    x, y : data passed to ``sns.boxplot`` as its ``x`` and ``y`` arguments.
    ax : matplotlib axes to draw on; ``None`` lets seaborn use the current axes.
    """
    # Forward `ax` so the plot lands on the requested subplot instead of
    # always being drawn on the current axes.
    sns.boxplot(x=x, y=y, ax=ax)

In [None]:
# Distribution of the Age column.
plot_histplot(df.Age)

In [None]:
# Distribution of the Na_to_K column.
plot_histplot(df.Na_to_K)

In [None]:
# Side-by-side BP counts: rows with Sex == 'M' on the left axes, Sex == 'F' on the right.
f, ax = plt.subplots(1, 2, figsize=(16, 4))

plot_countplot(df[df.Sex == 'M'].BP, ax=ax[0])
plot_countplot(df[df.Sex == 'F'].BP, ax=ax[1])

ax[0].set_title('Male - BP')
ax[1].set_title('Female - BP')

Fewer `males` have `normal BP` compared to `females`. A large proportion of both genders have `high BP`.

In [None]:
# Side-by-side Cholesterol counts: rows with Sex == 'M' on the left axes, Sex == 'F' on the right.
f, ax = plt.subplots(1, 2, figsize=(16, 4))

plot_countplot(df[df.Sex == 'M'].Cholesterol, ax=ax[0])
plot_countplot(df[df.Sex == 'F'].Cholesterol, ax=ax[1])

ax[0].set_title('Male - Cholesterol')
ax[1].set_title('Female - Cholesterol')

Both `male` & `female` have `high cholesterol`

In [None]:
# Scatter plot of Na_to_K against Age with a fitted regression line.
sns.regplot(x=df.Age, y=df.Na_to_K)

`Age` is not correlated to `Na_to_K`

In [None]:
# Na_to_K distribution for each Cholesterol category.
plot_boxplot(df.Cholesterol, df.Na_to_K)

In [None]:
# Na_to_K distribution for each BP category.
plot_boxplot(df.BP, y=df.Na_to_K)

In [None]:
# Number of rows per Drug class.
plot_countplot(df.Drug)

In [None]:
# Age distribution for each Drug class.
plot_boxplot(df.Drug, df.Age)

> `drugB` is mostly consumed by people older than 60, while the other drugs are mostly consumed by people younger than 60.
>
> `DrugY` is consumed more than the other drugs, while `drugB` and `drugA` are consumed by fewer people.

### Data preparation: Encoding

In [None]:
# Replace the Sex, BP and Cholesterol strings with integer codes (in place),
# then display the first rows to check the result.
data_encoding(df)
df.head()

- Sex
    - Female - 0
    - Male - 1
- BP
    - HIGH - 0
    - LOW - 1
    - NORMAL - 2
- Cholesterol
    - HIGH - 0
    - NORMAL - 1
    
Since we are using `DecisionTreeClassifier` algorithm for classification, `LabelEncoding` is ok, otherwise if we are using something else where numbers matter, there we should use `OneHotEncoding`.

## 🍀 Modelling

Let's create our `AI`.

![](https://media.giphy.com/media/xT0xepagSrUXfM1eNi/giphy.gif)

In [None]:
# Feature matrix (the five predictor columns) and target vector as arrays.
x = df[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
y = df.Drug.values

# Scaling x
# Keep a handle on the fitted scaler: any new sample must be transformed with
# these same statistics before it is passed to a model trained on `x`.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics leak into the training features;
# consider fitting it on the training split only (e.g. inside a Pipeline).
scaler = StandardScaler()
x = scaler.fit_transform(x)

print(f'Dataset size: {len(x)}')

In [None]:
# Hold out 30% of the rows as a test set; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): no `stratify=` argument is passed, so class proportions in
# the two splits are left to chance — confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=3
)

print(f'Training set size: {len(x_train)}')
print(f'Test set size: {len(x_test)}')

In [None]:
# For cross validation
# One 10-fold stratified splitter, shared by the cross_val_score and
# GridSearchCV calls that follow.
skf = StratifiedKFold(n_splits=10)

In [None]:
# Candidate classifiers to compare, each constructed with its default
# hyperparameters.
models = [
    LogisticRegression(), 
    SGDClassifier(), 
    KNeighborsClassifier(), 
    GaussianNB(), 
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
]

In [None]:
# Cross-validate every candidate on the training split and print the mean
# score across the folds, labelled with the estimator's repr.
for model in models:
    scores = cross_val_score(model, x_train, y_train, cv=skf)
    print(f'== {model} ==')
    print(f'Cross-Validation mean-score: {scores.mean()}')
    
    print()

In [None]:
# Parameter tuning

def dt_param_selection(x, y, nfolds):
    """Grid-search decision-tree hyperparameters and return the best set.

    Parameters
    ----------
    x : feature matrix the candidate trees are fitted on.
    y : target labels.
    nfolds : forwarded unchanged as GridSearchCV's ``cv`` argument.

    Returns
    -------
    The ``best_params_`` dict of the fitted search, covering ``criterion``,
    ``splitter`` and ``max_depth``.
    """
    search_space = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [1, 2, 3, 4, 5],
    }
    searcher = GridSearchCV(DecisionTreeClassifier(), search_space, cv=nfolds)
    searcher.fit(x, y)
    return searcher.best_params_


# Run the grid search on the training split, reusing the stratified splitter
# as the cv strategy, and display the winning parameter combination.
best_params_ = dt_param_selection(x_train, y_train, skf)
best_params_

In [None]:
# Cross Validation
# NOTE(review): the hyperparameters are hard-coded rather than read from
# `best_params_` — presumably copied from one grid-search run; verify they
# still match its output.

model = DecisionTreeClassifier(criterion='gini', max_depth=4, splitter='best')
scores = cross_val_score(model, x_train, y_train, cv=skf)
print(scores.mean())

In [None]:
# Final model: a fresh tree with the same hard-coded hyperparameters, fitted
# on the whole training split.
model = DecisionTreeClassifier(criterion='gini', max_depth=4, splitter='best')
model.fit(x_train, y_train)

## 🦋 Evaluation

In [None]:
# Predict labels for the held-out test features.
y_test_pred = model.predict(x_test)

# Print how many times each class was predicted.
print(f"Prediction: \n{pd.DataFrame(y_test_pred)[0].value_counts()}")

In [None]:
# Class distribution of the true test labels. Column 0 is selected so the
# counts are printed in the same single-column Series form used for the
# predicted labels, which makes the two outputs directly comparable.
print(f"Actual: \n{pd.DataFrame(y_test)[0].value_counts()}")

In [None]:
# Test-set metrics. `model.score` is the estimator's built-in score (mean
# accuracy for scikit-learn classifiers); the other three use
# average="weighted", i.e. per-class values averaged by class support.
print(f'Model Score: {model.score(x_test, y_test)}')
print(f'f1-score: {f1_score(y_test, y_test_pred, average="weighted")}')
print(f'precision score: {precision_score(y_test, y_test_pred, average="weighted")}')
print(f'recall score: {recall_score(y_test, y_test_pred, average="weighted")}')

In [None]:
# Per-class precision, recall, F1 and support on the test set.
print(classification_report(y_test, y_test_pred))

In [None]:
# Saving the model
# NOTE(review): only the tree is written to disk. The feature scaler and the
# label encoders fitted earlier in the notebook are not persisted here, so a
# consumer of model.joblib must reproduce that preprocessing separately.
dump(model, 'model.joblib')

### 🐚 Visualization

In [None]:
!pip install pydotplus

In [None]:
from io import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline

In [None]:
# Export the fitted tree to Graphviz DOT, render it to a PNG file and show
# the image inline.
dot_data = StringIO()

filename = "drugtree.png"
# Spelled out (in training-column order) instead of slicing `df.columns`, so
# the node labels cannot drift if the DataFrame's column order changes. A
# plain list is also the form accepted by every scikit-learn version.
featureNames = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']
# Not passed to the exporter: `unique()` returns labels in order of first
# appearance, which need not match the model's class order.
targetNames = df["Drug"].unique().tolist()

# `model.classes_` is the label order the fitted tree itself uses, so every
# node is annotated with the correct drug name.
out = tree.export_graphviz(
    model,
    feature_names=featureNames,
    out_file=dot_data,
    class_names=model.classes_.tolist(),
    filled=True,
    special_characters=True,
    rotate=False,
)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img, interpolation='nearest')

---

I'll wrap things up there. If you want to find some other answers, then go ahead and `edit` this kernel. If you have any `questions`, do let me know.

If this kernel helped you then don't forget to 🔼 `upvote` and share your 🎙 `feedback` on improvements of the kernel.

![](https://media.giphy.com/media/Md9UQRsv94yCAjeA1w/giphy.gif)

---