In [43]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


RANDOM_STATE = 55


In [3]:
data = pd.read_csv("drug200.csv")
data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [4]:
factor = pd.factorize(data['Drug'])
data.Drug = factor[0]
definitions = factor[1]
print(data.Drug.head())
print(definitions)

0    0
1    1
2    1
3    2
4    0
Name: Drug, dtype: int64
Index(['drugY', 'drugC', 'drugX', 'drugA', 'drugB'], dtype='object')


In [6]:
cat_vars = [
    'Sex',
    'BP',
    'Cholesterol'
]

In [9]:
data = pd.get_dummies(data=data, prefix=cat_vars, columns=cat_vars)

In [14]:
features = data.drop('Drug', axis=1).columns.tolist()

In [17]:
x_train, x_val, y_train, y_val = train_test_split(data[features], data['Drug'], train_size=0.8, random_state=RANDOM_STATE)

In [20]:
print(len(x_train))
print(len(y_train))
print(len(x_val))
print(len(y_val))

160
160
40
40


In [21]:
model = RandomForestClassifier(random_state=RANDOM_STATE)

In [22]:
cat_parm = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_depth': [2, 5, 8, 10, 12],
    'min_samples_split': [2, 5, 10, 15, 20],
}

In [38]:
grid_search = GridSearchCV(model, cat_parm, cv=3, n_jobs=4)
grid_search.fit(x_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

Best Parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}
Best Score: 0.9874213836477987


In [39]:
new_model = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_split=2, random_state=RANDOM_STATE).fit(x_train, y_train)

In [40]:
acc_train = accuracy_score(new_model.predict(x_train), y_train)
acc_val = accuracy_score(new_model.predict(x_val), y_val)

print(f"Train Accuracy: {acc_train: 0.4f}")
print(f"Train Validation: {acc_val: 0.4f}")

Train Accuracy:  0.9938
Train Validation:  0.9750


In [35]:
data.head()

Unnamed: 0,Age,Na_to_K,Drug,Sex_F,Sex_M,BP_HIGH,BP_LOW,BP_NORMAL,Cholesterol_HIGH,Cholesterol_NORMAL
0,23,25.355,0,True,False,True,False,False,True,False
1,47,13.093,1,False,True,False,True,False,True,False
2,47,10.114,1,False,True,False,True,False,True,False
3,28,7.798,2,True,False,False,False,True,True,False
4,61,18.043,0,True,False,False,True,False,True,False


In [36]:
x_train.head()

Unnamed: 0,Age,Na_to_K,Sex_F,Sex_M,BP_HIGH,BP_LOW,BP_NORMAL,Cholesterol_HIGH,Cholesterol_NORMAL
102,28,13.127,True,False,False,True,False,True,False
180,22,22.818,True,False,True,False,False,False,True
115,51,18.295,False,True,True,False,False,True,False
100,31,11.871,False,True,True,False,False,False,True
11,34,19.199,True,False,True,False,False,False,True


In [37]:
new_data = pd.DataFrame([[61, 18.043, 1, 0, 0, 1, 0, 1 ,0]], columns=features)
print(new_model.predict(new_data))

[0]


# Using XGBoost 

In [44]:
param = {
    'n_estimators': [50, 100, 200, 300, 400],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [3, 4, 5, 6, 7]
}

In [46]:
xg_model = XGBClassifier(random_state = RANDOM_STATE, verbosity=1)
grid = GridSearchCV(xg_model, param, cv = 3, n_jobs = 4)
grid.fit(x_train, y_train, eval_set = [(x_val, y_val)])

print("Best parameters found: ", grid.best_params_)
print("Best score found: ", grid.best_score_)

[0]	validation_0-mlogloss:1.59145
[0]	validation_0-mlogloss:1.59098
[0]	validation_0-mlogloss:1.59040
[1]	validation_0-mlogloss:1.57292
[1]	validation_0-mlogloss:1.57322
[1]	validation_0-mlogloss:1.57156
[2]	validation_0-mlogloss:1.55523
[2]	validation_0-mlogloss:1.55537
[2]	validation_0-mlogloss:1.55316
[3]	validation_0-mlogloss:1.53847[3]	validation_0-mlogloss:1.53808

[3]	validation_0-mlogloss:1.53508
[4]	validation_0-mlogloss:1.52111
[4]	validation_0-mlogloss:1.52131
[4]	validation_0-mlogloss:1.51740
[5]	validation_0-mlogloss:1.50509
[5]	validation_0-mlogloss:1.50464
[5]	validation_0-mlogloss:1.50001
[6]	validation_0-mlogloss:1.48799
[6]	validation_0-mlogloss:1.48847
[6]	validation_0-mlogloss:1.48301
[7]	validation_0-mlogloss:1.47290
[7]	validation_0-mlogloss:1.47149
[7]	validation_0-mlogloss:1.46628
[0]	validation_0-mlogloss:1.59145
[8]	validation_0-mlogloss:1.45703
[8]	validation_0-mlogloss:1.45547
[8]	validation_0-mlogloss:1.44973
[9]	validation_0-mlogloss:1.44191
[1]	validation

In [50]:
new_xg_model = XGBClassifier(learning_rate=0.05, n_estimators=100, max_depth=3, random_state = RANDOM_STATE, early_stopping_rounds=10)
new_xg_model.fit(x_train, y_train, eval_set=[(x_val, y_val)])
accu_train = accuracy_score(new_xg_model.predict(x_train), y_train)
accu_val = accuracy_score(new_xg_model.predict(x_val), y_val)

print(f"Accuracy on training set: {accu_train: 0.4f}")
print(f"Accuracy on validation set: {accu_val: 0.4f}")

[0]	validation_0-mlogloss:1.51149


[1]	validation_0-mlogloss:1.42294
[2]	validation_0-mlogloss:1.34199
[3]	validation_0-mlogloss:1.26877
[4]	validation_0-mlogloss:1.20156
[5]	validation_0-mlogloss:1.13999
[6]	validation_0-mlogloss:1.08186
[7]	validation_0-mlogloss:1.02942
[8]	validation_0-mlogloss:0.97911
[9]	validation_0-mlogloss:0.93308
[10]	validation_0-mlogloss:0.88906
[11]	validation_0-mlogloss:0.84913
[12]	validation_0-mlogloss:0.81067
[13]	validation_0-mlogloss:0.77495
[14]	validation_0-mlogloss:0.74049
[15]	validation_0-mlogloss:0.70918
[16]	validation_0-mlogloss:0.67873
[17]	validation_0-mlogloss:0.64996
[18]	validation_0-mlogloss:0.62313
[19]	validation_0-mlogloss:0.59765
[20]	validation_0-mlogloss:0.57283
[21]	validation_0-mlogloss:0.54991
[22]	validation_0-mlogloss:0.52869
[23]	validation_0-mlogloss:0.50749
[24]	validation_0-mlogloss:0.48795
[25]	validation_0-mlogloss:0.46737
[26]	validation_0-mlogloss:0.44741
[27]	validation_0-mlogloss:0.42877
[28]	validation_0-mlogloss:0.41068
[29]	validation_0-mlogloss:0.