In [1]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## 1. Data Preprocessing

In [2]:
# Read the drug dataset into a DataFrame and preview its first rows.
# NOTE: the path is an absolute location; adjust it when running elsewhere.
csv_path = "/content/drug200.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


#### The continuous variables Age and Na_to_K must be grouped (binned) and then encoded

In [3]:
# Discretise the two numeric columns into labelled ranges, then drop the raw columns.
#
# pd.cut uses right-closed bins by default, i.e. each bin is (lower, upper].
# Values outside (0, 100] fall in no bin and become NaN.
#
# NOTE: this cell overwrites `data`; re-running it without re-running the load
# cell raises KeyError because 'Age' / 'Na_to_K' have already been dropped.

# Age appears to be recorded in whole years (see the preview above), so an upper
# edge of 19 is equivalent to "< 20", 29 to "< 30", and so on.
age_groups = [0, 19, 29, 39, 49, 59, 69, 100]
age_labels = ['<20', '20-30', '30-40', '40-50', '50-60', '60-70', '>70']
data['Age_encoded'] = pd.cut(data['Age'], bins=age_groups, labels=age_labels)
data = data.drop(['Age'], axis=1)

# Na_to_K is continuous, so the edges must sit on the label boundaries themselves.
# Edges of 9/19/29 would label 9.5 as '10-20' and 29.45 as '>30'.
# With right-closed bins a value exactly on an edge (e.g. 10.0) goes to the lower bin.
NatoK_groups = [0, 10, 20, 30, 100]
NatoK_labels = ['<10', '10-20', '20-30', '>30']
data['NatoK_encoded'] = pd.cut(data['Na_to_K'], bins=NatoK_groups, labels=NatoK_labels)
data = data.drop(['Na_to_K'], axis=1)

data.head()

Unnamed: 0,Sex,BP,Cholesterol,Drug,Age_encoded,NatoK_encoded
0,F,HIGH,HIGH,DrugY,20-30,20-30
1,M,LOW,HIGH,drugC,40-50,10-20
2,M,LOW,HIGH,drugC,40-50,10-20
3,F,NORMAL,HIGH,drugX,20-30,<10
4,F,LOW,HIGH,DrugY,60-70,10-20


#### One-hot encode the X features

In [4]:
# Split the frame into the target column and the predictor columns.
labels = data['Drug']
features = data.drop(columns=['Drug'])
# NOTE: no train/test split is applied here; every row is used as a predictor row.

# Expand each categorical predictor into one indicator column per level.
ft_enc = pd.get_dummies(features)
ft_enc.head()

Unnamed: 0,Sex_F,Sex_M,BP_HIGH,BP_LOW,BP_NORMAL,Cholesterol_HIGH,Cholesterol_NORMAL,Age_encoded_<20,Age_encoded_20-30,Age_encoded_30-40,Age_encoded_40-50,Age_encoded_50-60,Age_encoded_60-70,Age_encoded_>70,NatoK_encoded_<10,NatoK_encoded_10-20,NatoK_encoded_20-30,NatoK_encoded_>30
0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0
1,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0
2,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0
3,1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0
4,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0


In [5]:
# Report how many indicator columns the one-hot encoding produced.
print(f"Num features = {len(ft_enc.columns)}")

Num features = 18


## 2. Model creation and fitting

In [7]:
# Multi-layer perceptron with 3 hidden layers of 30, 50 and 20 neurons respectively.
# random_state pins the weight initialisation and batch shuffling so that
# re-running the notebook reproduces the same fitted model and predictions.
# NOTE(review): max_iter=100 is below scikit-learn's default of 200; if a
# ConvergenceWarning is emitted here, raise it — TODO confirm.
ann_model = MLPClassifier(
    hidden_layer_sizes=(30, 50, 20),
    alpha=1e-5,
    max_iter=100,
    random_state=0,
)
# The model is fitted on the full encoded dataset (no held-out split).
ann_model.fit(ft_enc, labels)



## 3. Testing with random data

In [12]:
# Hand-written sample patients used as a smoke test for the fitted model.
rand_data = {
    'Age': [45, 68, 12, 5, 37],
    'Sex': ['M', 'F', 'F', 'M', 'F'],
    'BP': ['NORMAL', 'HIGH', 'LOW', 'LOW', 'NORMAL'],
    'Cholesterol': ['NORMAL', 'HIGH', 'HIGH', 'NORMAL', 'HIGH'],
    'Na_to_K': [11.3, 7.55, 25.3, 2.78, 29.45]
}
rand_test = pd.DataFrame(rand_data)

# Bin the numeric columns with the same edges and labels used for the training data.
rand_test['Age_encoded']   = pd.cut(rand_test['Age'], bins=age_groups, labels=age_labels)
rand_test['NatoK_encoded'] = pd.cut(rand_test['Na_to_K'], bins=NatoK_groups, labels=NatoK_labels)
rand_test = rand_test.drop(['Age', 'Na_to_K'], axis=1)

# get_dummies only creates columns for the string levels present in THIS frame,
# so a sample lacking e.g. BP == 'HIGH' would yield fewer columns than the model
# was trained on. Reindexing to the training columns restores any missing
# indicator (filled with 0) and guarantees the same column order.
rand_test_enc = pd.get_dummies(rand_test).reindex(columns=ft_enc.columns, fill_value=0)

preds = ann_model.predict(rand_test_enc)
print(f"The predictions for the random test data are: {preds}")

The predictions for the random test data are: ['drugX' 'drugA' 'DrugY' 'drugX' 'DrugY']
