In [1]:
# import the necessery libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
#Reading the dataset
df = pd.read_csv('/content/drug200.csv')
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [5]:
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [6]:
df.shape
#from the shape we can see that there are 200 rows and 6 attributes(columns)
# independent variables are Age(Numerical), Sex(Categorical), BP(Categorical), Cholestrol(Categorical), Na_to_K(NUmerical)
# and the dependent variable to be used for classification Drug(Categorical)

(200, 6)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [8]:
df.isnull().any()
#fromt the output, we can see that there aren't any missing values hence no need to worry about them.

Age            False
Sex            False
BP             False
Cholesterol    False
Na_to_K        False
Drug           False
dtype: bool

In [9]:
df.isnull().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [10]:
df.describe()

Unnamed: 0,Age,Na_to_K
count,200.0,200.0
mean,44.315,16.084485
std,16.544315,7.223956
min,15.0,6.269
25%,31.0,10.4455
50%,45.0,13.9365
75%,58.0,19.38
max,74.0,38.247


In [11]:
df.Drug.unique()
# from the output, we can see that we are dealing with multi-class classification 

array(['DrugY', 'drugC', 'drugX', 'drugA', 'drugB'], dtype=object)

In [12]:
df.corr()

  df.corr()


Unnamed: 0,Age,Na_to_K
Age,1.0,-0.063119
Na_to_K,-0.063119,1.0


In [13]:
df.columns

Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

In [18]:
# From the data, we can see that the data has 3 categorical independent variables so we will have to perform 
# one-hot encoding for those using LabelEncoder
# Perform preprocessing
# Encode categorical variables
from sklearn.preprocessing import LabelEncoder, StandardScaler
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])
df['BP'] = label_encoder.fit_transform(df['BP'])
df['Cholesterol'] = label_encoder.fit_transform(df['Cholesterol'])
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,DrugY
1,47,1,1,0,13.093,drugC
2,47,1,1,0,10.114,drugC
3,28,0,2,0,7.798,drugX
4,61,0,1,0,18.043,DrugY


In [20]:
# to improve the accuracy of the model, we also perform standard scaling of the numerical variables.
scaler=StandardScaler()
df[['Age', 'Na_to_K']] = scaler.fit_transform(df[['Age', 'Na_to_K']])

In [21]:
#Now, we can split the dataset into x and y containing only independent and dependent variables respectively
x = df[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']]
x.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,-1.291591,0,0,0,1.286522
1,0.162699,1,1,0,-0.415145
2,0.162699,1,1,0,-0.828558
3,-0.988614,0,2,0,-1.149963
4,1.011034,0,1,0,0.271794


In [22]:
y = df['Drug']
y.head()

0    DrugY
1    drugC
2    drugC
3    drugX
4    DrugY
Name: Drug, dtype: object

In [23]:
# now lets perform train, test and split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=42)

In [24]:
x_train.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
79,-0.746232,0,1,1,-0.727807
197,0.465676,1,2,0,-0.859089
38,-0.322065,0,2,1,-0.884762
24,-0.685637,0,1,0,2.414907
122,-0.625042,1,2,0,0.884211


In [25]:
x_test.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
95,-0.503851,1,1,1,-0.646762
15,-1.715759,0,0,1,-0.078892
30,-1.594568,0,2,1,-1.017848
158,0.889843,0,1,0,-0.782762
128,0.162699,1,1,1,2.422679


In [26]:
x = df.iloc[:,0:5]
y = df.iloc[:,5:]
x.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,-1.291591,0,0,0,1.286522
1,0.162699,1,1,0,-0.415145
2,0.162699,1,1,0,-0.828558
3,-0.988614,0,2,0,-1.149963
4,1.011034,0,1,0,0.271794


In [27]:
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded= label_encoder.transform(y_test)

In [28]:
x_train.shape, x_test.shape, y_train_encoded.shape, y_test_encoded.shape

((160, 5), (40, 5), (160,), (40,))

In [29]:
# Build an ANN model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [31]:
# ANN Model

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(5,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(5, activation='softmax'))

In [32]:
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [33]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 64)                384       
                                                                 
 dense_5 (Dense)             (None, 64)                4160      
                                                                 
 dense_6 (Dense)             (None, 128)               8320      
                                                                 
 dense_7 (Dense)             (None, 16)                2064      
                                                                 
 dense_8 (Dense)             (None, 5)                 85        
                                                                 
Total params: 15,013
Trainable params: 15,013
Non-trainable params: 0
_________________________________________________________________


In [34]:
model.fit(x_train,y_train_encoded,epochs=10,batch_size=5,validation_data=(x_test,y_test_encoded))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7eff9f579090>

In [35]:
from sklearn.metrics import classification_report, confusion_matrix

In [36]:
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)



In [37]:
print(classification_report(y_test_encoded, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.86      1.00      0.92         6
           2       1.00      1.00      1.00         3
           3       0.00      0.00      0.00         5
           4       0.73      1.00      0.85        11

    accuracy                           0.88        40
   macro avg       0.72      0.80      0.75        40
weighted avg       0.78      0.88      0.82        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
#Now, we generate some random data and check with our model.
rando=np.random.rand(1,5) # we generate 5 random values within the range 0-1 for the 5 label encoded and standardized variables
rando

array([[0.86698187, 0.2687675 , 0.61902174, 0.54500265, 0.473557  ]])

In [39]:
#we will now use this data to make predictions using our model
pred = model.predict(rando)
pred



array([[9.9998689e-01, 1.0818757e-06, 5.5975033e-06, 1.4669893e-06,
        5.0274075e-06]], dtype=float32)

In [43]:
# Inverse transform the predicted labels to get the original class labels
pred_class = label_encoder.inverse_transform(pred.argmax(axis=1)) #the class with the highest value will be our class hence we will use argmax to get the class with the highest probability

# Print the predicted class labels
print(pred_class)

['DrugY']
