In [43]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


# Single seed shared by the train/validation split and both random forests,
# so re-running the notebook reproduces the same results.
RANDOM_STATE = 55


In [3]:
# Load the dataset from a CSV in the working directory and preview the first rows.
data = pd.read_csv("drug200.csv")
data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [4]:
# Encode the target column: integer codes replace the drug names in `data`,
# while `definitions` keeps the lookup (position i holds the name for code i).
factor = pd.factorize(data['Drug'])
data['Drug'], definitions = factor
print(data['Drug'].head())
print(definitions)

0    0
1    1
2    1
3    2
4    0
Name: Drug, dtype: int64
Index(['drugY', 'drugC', 'drugX', 'drugA', 'drugB'], dtype='object')


In [6]:
# Categorical columns that are one-hot encoded before modelling.
cat_vars = ['Sex', 'BP', 'Cholesterol']

In [9]:
# One-hot encode each categorical column; every dummy column is prefixed with
# the name of the column it came from (e.g. BP -> BP_HIGH, BP_LOW, BP_NORMAL).
data = pd.get_dummies(
    data,
    columns=cat_vars,
    prefix=cat_vars,
)

In [14]:
# Every column except the encoded target is a model input.
features = data.columns.drop('Drug').tolist()

In [17]:
# Keep 80% of the rows for training and hold out the rest for validation;
# the fixed seed makes the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    data[features],
    data['Drug'],
    train_size=0.8,
    random_state=RANDOM_STATE,
)

In [20]:
# Sanity check: features and labels must have the same length within each split.
for split in (x_train, y_train, x_val, y_val):
    print(len(split))

160
160
40
40


In [21]:
# Untuned base estimator handed to the grid search; only the seed is fixed here.
model = RandomForestClassifier(random_state=RANDOM_STATE)

In [22]:
# Hyperparameter grid explored by the grid search: 5 x 5 x 5 = 125 combinations.
cat_parm = dict(
    n_estimators=[50, 100, 200, 300, 400],
    max_depth=[2, 5, 8, 10, 12],
    min_samples_split=[2, 5, 10, 15, 20],
)

In [38]:
# Exhaustive search over the grid with 3-fold cross-validation on the training
# split only, using 4 parallel jobs. `fit` returns the fitted search object.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=cat_parm,
    cv=3,
    n_jobs=4,
).fit(x_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

Best Parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}
Best Score: 0.9874213836477987


In [39]:
# Refit a forest with the hyperparameters the grid search actually selected,
# rather than values copied by hand from its printed output, so the model stays
# in sync if the grid, the data, or the seed changes.
new_model = RandomForestClassifier(
    **grid_search.best_params_,
    random_state=RANDOM_STATE,
).fit(x_train, y_train)

In [40]:
# Compare accuracy on the training split with accuracy on the held-out split.
# accuracy_score is documented as (y_true, y_pred), so the labels go first.
acc_train = accuracy_score(y_train, new_model.predict(x_train))
acc_val = accuracy_score(y_val, new_model.predict(x_val))

print(f"Train Accuracy: {acc_train: 0.4f}")
# This line reports the validation split, so it is labelled as such.
print(f"Validation Accuracy: {acc_val: 0.4f}")

Train Accuracy:  0.9938
Train Validation:  0.9750


In [35]:
# Preview the frame after target encoding and one-hot encoding.
data.head()

Unnamed: 0,Age,Na_to_K,Drug,Sex_F,Sex_M,BP_HIGH,BP_LOW,BP_NORMAL,Cholesterol_HIGH,Cholesterol_NORMAL
0,23,25.355,0,True,False,True,False,False,True,False
1,47,13.093,1,False,True,False,True,False,True,False
2,47,10.114,1,False,True,False,True,False,True,False
3,28,7.798,2,True,False,False,False,True,True,False
4,61,18.043,0,True,False,False,True,False,True,False


In [36]:
# Preview the training features (the target column is not part of this frame).
x_train.head()

Unnamed: 0,Age,Na_to_K,Sex_F,Sex_M,BP_HIGH,BP_LOW,BP_NORMAL,Cholesterol_HIGH,Cholesterol_NORMAL
102,28,13.127,True,False,False,True,False,True,False
180,22,22.818,True,False,True,False,False,False,True
115,51,18.295,False,True,True,False,False,True,False
100,31,11.871,False,True,True,False,False,False,True
11,34,19.199,True,False,True,False,False,False,True


In [37]:
# Score one hand-built patient. Values are keyed by feature name so they cannot
# silently land in the wrong column, and `columns=features` restores the column
# order the model was trained with.
new_patient = {
    'Age': 61,
    'Na_to_K': 18.043,
    'Sex_F': 1, 'Sex_M': 0,
    'BP_HIGH': 0, 'BP_LOW': 1, 'BP_NORMAL': 0,
    'Cholesterol_HIGH': 1, 'Cholesterol_NORMAL': 0,
}
# Fail loudly if the hand-written keys drift from the training columns.
assert set(new_patient) == set(features), "new_patient keys must match the training features"

new_data = pd.DataFrame([new_patient], columns=features)
prediction = new_model.predict(new_data)
print(prediction)
# Translate the integer code back to the drug name via the factorize lookup.
print(definitions[prediction].tolist())

[0]
