In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold, cross_val_score 
from scipy.stats import randint
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler  
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK


In [2]:
ds = pd.read_csv('training.csv.txt')
ds = ds[~ds['Diagnosis'].isin(['DPLD', 'OSA', 'Sarcodiosis','Chest Pain'])]
ds = ds.drop(columns = ['Label'])
ds.dropna(inplace=True)
X_without_diagnosis = ds.drop('Diagnosis', axis = 1)


X = X_without_diagnosis
Y = ds['Diagnosis']


In [5]:
ds["Diagnosis"].value_counts()

Diagnosis
Asthma    580
COPD      415
Name: count, dtype: int64

In [None]:
#X_without_diagnosis.head()
#ds.info()
#print(ds.isnull().sum())
#print(ds)
#ds.head()
ds.shape

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 0)

In [4]:
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20),
              'min_samples_split': randint(2, 20),
              'min_samples_leaf': randint(1, 20)}

rf = RandomForestClassifier()

rand_search_rf = RandomizedSearchCV(rf, param_dist, n_iter=5, cv=5)
rand_search_rf.fit(X_train_imputed, y_train)

best_rf = rand_search_rf.best_estimator_

In [5]:
imputer = SimpleImputer(strategy='mean')

space = {
    "n_estimators": hp.choice("n_estimators", range(50,500)),
    "max_depth": hp.choice("max_depth", range(1,20)),
    "min_samples_split": hp.choice("min_samples_split", range(2, 20)),
    "min_samples_leaf": hp.choice("min_samples_leaf", range(1, 20))
    }

def hyperparameter_tuning(params):
    rf = RandomForestClassifier(n_estimators=params['n_estimators'],
                                max_depth=params['max_depth'],
                                min_samples_split=params['min_samples_split'],
                                min_samples_leaf=params['min_samples_leaf'],
                                n_jobs=-1)
    acc = cross_val_score(rf, X_train, y_train).mean()
    return {"loss": -acc, "status": STATUS_OK}

trials = Trials()

best = fmin (
    fn=hyperparameter_tuning,
    space = space, 
    algo=tpe.suggest, 
    max_evals=100, 
    trials=trials
)

best_hyperparams = {
    "n_estimators": range(50, 501)[best["n_estimators"]],
    "max_depth": range(1, 21)[best["max_depth"]],
    "min_samples_split": range(2, 21)[best["min_samples_split"]],
    "min_samples_leaf": range(1, 21)[best["min_samples_leaf"]]
}

best_rf = RandomForestClassifier(**best_hyperparams, n_jobs=-1)
best_rf.fit(X_train, y_train)

print("Best: {}".format(best))
trials.results
trials.losses()

100%|██████████| 100/100 [18:05<00:00, 10.86s/trial, best loss: -0.7948545861297539]
Best: {'max_depth': 13, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 316}


[-0.7573333333333333,
 -0.7881431767337806,
 -0.776089485458613,
 -0.7733959731543625,
 -0.7827829977628635,
 -0.7827919463087248,
 -0.7720715883668904,
 -0.7694138702460851,
 -0.7828008948545861,
 -0.7854675615212529,
 -0.7787651006711409,
 -0.7787740492170022,
 -0.7720715883668904,
 -0.7827919463087248,
 -0.7747472035794184,
 -0.7693870246085011,
 -0.7801073825503355,
 -0.7747382550335571,
 -0.7653959731543624,
 -0.7801163310961969,
 -0.7747382550335571,
 -0.7841163310961969,
 -0.7640447427293064,
 -0.7908277404921701,
 -0.7868008948545862,
 -0.7747382550335571,
 -0.7894854586129755,
 -0.7707293064876957,
 -0.7694317673378077,
 -0.7908366890380314,
 -0.7868098434004475,
 -0.7787740492170021,
 -0.7492975391498882,
 -0.7600268456375838,
 -0.7881521252796421,
 -0.7720626398210291,
 -0.7693870246085012,
 -0.7693870246085011,
 -0.7787651006711409,
 -0.7734049217002237,
 -0.7841342281879194,
 -0.7854675615212529,
 -0.7774228187919464,
 -0.7787829977628634,
 -0.7680536912751679,
 -0.7653691

In [6]:
scaler = StandardScaler()
X_scaler = imputer.fit_transform(X_train_imputed)

k = 5
k_fold = KFold(n_splits=k)
scores = cross_val_score(rand_search_rf, X_train_imputed, y_train, cv=k_fold) 

In [7]:
scaler = StandardScaler()
X_scaler = imputer.fit_transform(X_train)

k = 5
k_fold = KFold(n_splits=k)
scores = cross_val_score(best_rf, X_train, y_train, cv=k_fold) 

In [8]:
y_pred = rand_search_rf.predict(X_test_imputed)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print('Best hyperparameters:', best_rf)
print("Precision promedio: ", scores)

Accuracy: 0.751004016064257
Best hyperparameters: RandomForestClassifier(max_depth=14, min_samples_leaf=2, min_samples_split=5,
                       n_estimators=366, n_jobs=-1)
Precision promedio:  [0.82666667 0.81208054 0.7852349  0.74496644 0.76510067]


In [9]:
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
#print('Best hyperparameters:', best_rf)
print("Precision promedio: ", scores)

Accuracy: 0.7791164658634538
Precision promedio:  [0.82666667 0.81208054 0.7852349  0.74496644 0.76510067]
