In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('drug200.csv')

In [3]:
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [4]:
df.columns


Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

In [5]:
df.Drug.value_counts()

DrugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: Drug, dtype: int64

In [6]:
df.Drug.value_counts(normalize=True)*100

DrugY    45.5
drugX    27.0
drugA    11.5
drugC     8.0
drugB     8.0
Name: Drug, dtype: float64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [8]:
df.isnull().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [10]:
df.shape

(200, 6)

In [11]:
# One Hot Encoding / Dummification
df = pd.get_dummies(df, columns=['Sex', 'BP','Cholesterol','Na_to_K' ], 
                    drop_first=True)

In [12]:
df.head()

Unnamed: 0,Age,Drug,Sex_M,BP_LOW,BP_NORMAL,Cholesterol_NORMAL,Na_to_K_6.683,Na_to_K_6.769,Na_to_K_7.261,Na_to_K_7.285,...,Na_to_K_31.686,Na_to_K_31.876,Na_to_K_32.922,Na_to_K_33.486,Na_to_K_33.542,Na_to_K_34.686,Na_to_K_34.997,Na_to_K_35.639,Na_to_K_37.188,Na_to_K_38.247
0,23,DrugY,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,47,drugC,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,47,drugC,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,28,drugX,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,61,DrugY,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
df.iloc[:,1:].head()

Unnamed: 0,Drug,Sex_M,BP_LOW,BP_NORMAL,Cholesterol_NORMAL,Na_to_K_6.683,Na_to_K_6.769,Na_to_K_7.261,Na_to_K_7.285,Na_to_K_7.298,...,Na_to_K_31.686,Na_to_K_31.876,Na_to_K_32.922,Na_to_K_33.486,Na_to_K_33.542,Na_to_K_34.686,Na_to_K_34.997,Na_to_K_35.639,Na_to_K_37.188,Na_to_K_38.247
0,DrugY,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,drugC,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,drugC,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,drugX,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,DrugY,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
X = df.loc[:,df.columns != 'Drug']
X.head()

Unnamed: 0,Age,Sex_M,BP_LOW,BP_NORMAL,Cholesterol_NORMAL,Na_to_K_6.683,Na_to_K_6.769,Na_to_K_7.261,Na_to_K_7.285,Na_to_K_7.298,...,Na_to_K_31.686,Na_to_K_31.876,Na_to_K_32.922,Na_to_K_33.486,Na_to_K_33.542,Na_to_K_34.686,Na_to_K_34.997,Na_to_K_35.639,Na_to_K_37.188,Na_to_K_38.247
0,23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,47,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,47,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,28,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,61,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
y=df.Drug
y

0      DrugY
1      drugC
2      drugC
3      drugX
4      DrugY
       ...  
195    drugC
196    drugC
197    drugX
198    drugX
199    drugX
Name: Drug, Length: 200, dtype: object

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=1)

In [55]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((150, 202), (50, 202), (150,), (50,))

In [56]:
X_train.head()

Unnamed: 0,Age,Sex_M,BP_LOW,BP_NORMAL,Cholesterol_NORMAL,Na_to_K_6.683,Na_to_K_6.769,Na_to_K_7.261,Na_to_K_7.285,Na_to_K_7.298,...,Na_to_K_31.686,Na_to_K_31.876,Na_to_K_32.922,Na_to_K_33.486,Na_to_K_33.542,Na_to_K_34.686,Na_to_K_34.997,Na_to_K_35.639,Na_to_K_37.188,Na_to_K_38.247
98,20,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
123,36,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
119,61,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53,24,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,65,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [57]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[['Age']] = sc.fit_transform(X_train[['Age']])
X_test[['Age']] = sc.transform(X_test[['Age']])

In [58]:
X_train.head()

Unnamed: 0,Age,Sex_M,BP_LOW,BP_NORMAL,Cholesterol_NORMAL,Na_to_K_6.683,Na_to_K_6.769,Na_to_K_7.261,Na_to_K_7.285,Na_to_K_7.298,...,Na_to_K_31.686,Na_to_K_31.876,Na_to_K_32.922,Na_to_K_33.486,Na_to_K_33.542,Na_to_K_34.686,Na_to_K_34.997,Na_to_K_35.639,Na_to_K_37.188,Na_to_K_38.247
98,-1.419164,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
123,-0.46323,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
119,1.030417,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53,-1.18018,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,1.269401,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [59]:
X_test.head()

Unnamed: 0,Age,Sex_M,BP_LOW,BP_NORMAL,Cholesterol_NORMAL,Na_to_K_6.683,Na_to_K_6.769,Na_to_K_7.261,Na_to_K_7.285,Na_to_K_7.298,...,Na_to_K_31.686,Na_to_K_31.876,Na_to_K_32.922,Na_to_K_33.486,Na_to_K_33.542,Na_to_K_34.686,Na_to_K_34.997,Na_to_K_35.639,Na_to_K_37.188,Na_to_K_38.247
58,0.970671,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40,1.747368,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34,0.55245,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
102,-0.941197,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
184,-1.538655,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [60]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [61]:
logreg


In [62]:
y_pred_train = logreg.predict(X_train)
y_pred_test = logreg.predict(X_test)

In [63]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [64]:
pd.DataFrame(zip(y_train, y_pred_train), columns=['Actual','Predicted'])

Unnamed: 0,Actual,Predicted
0,DrugY,DrugY
1,DrugY,DrugY
2,DrugY,DrugY
3,DrugY,DrugY
4,DrugY,DrugY
...,...,...
145,DrugY,DrugY
146,drugA,drugA
147,drugX,drugX
148,drugA,DrugY


In [65]:
pd.DataFrame(zip(y_test, y_pred_test), columns=['Actual','Predicted'])

Unnamed: 0,Actual,Predicted
0,drugX,drugX
1,DrugY,drugX
2,drugX,DrugY
3,drugC,drugC
4,DrugY,drugA
5,drugX,drugX
6,drugX,DrugY
7,DrugY,DrugY
8,DrugY,DrugY
9,DrugY,DrugY


In [66]:
# Accuracy on train data
accuracy_score(y_train, y_pred_train)

0.9533333333333334

In [67]:
# Accuracy on test data
accuracy_score(y_test, y_pred_test)

0.58