In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold, cross_val_score 
from scipy.stats import randint
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler  
from sklearn.pipeline import Pipeline
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK


In [10]:
# Load the training table, discard the diagnoses excluded from this study,
# remove the 'Label' column and every row that still has a missing value.
url = "http://dibresources.jcbose.ac.in/ssaha4/pulmopred/public/training.csv.txt"
excluded_diagnoses = ['DPLD', 'OSA', 'Sarcodiosis', 'Chest Pain']
ds = (
    pd.read_csv(url)
    .loc[lambda frame: ~frame['Diagnosis'].isin(excluded_diagnoses)]
    .drop(columns=['Label'])
    .dropna()
)

# Feature matrix = every remaining column except the target.
X_without_diagnosis = ds.drop(columns='Diagnosis')

X = X_without_diagnosis
Y = ds['Diagnosis']


In [None]:
# Number of rows per remaining diagnosis class.
class_counts = ds["Diagnosis"].value_counts()
class_counts

In [None]:
# Sanity check of the cleaned frame: (rows, columns).
# Other inspections used while exploring: X_without_diagnosis.head(),
# ds.info(), ds.isnull().sum(), ds.head().
ds.shape

In [None]:
# Hold out 25% of the rows for the final evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [11]:
# Mean-impute using statistics learned from the training split only, then
# apply the same fitted imputer to the test split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Distributions sampled by the randomized search (scipy `randint` excludes
# the upper bound).
param_dist = {'n_estimators': randint(50, 500),
              'max_depth': randint(1, 20),
              'min_samples_split': randint(2, 20),
              'min_samples_leaf': randint(1, 20)}

# Seed both the forest and the search so the sampled candidates and the fitted
# trees are the same on every run, matching the seeded train/test split.
rf = RandomForestClassifier(random_state=0)

rand_search_rf = RandomizedSearchCV(rf, param_dist, n_iter=5, cv=5, random_state=0)
rand_search_rf.fit(X_train_imputed, y_train)

# Forest refit on the full training split with the best sampled candidate.
best_rf = rand_search_rf.best_estimator_

In [None]:
# Fresh, unfitted imputer; this cell does not use it, but later cells
# reference the name `imputer`.
imputer = SimpleImputer(strategy='mean')

# Single source of truth for the candidate values of every hyperparameter.
# `fmin` reports the *index* chosen by each `hp.choice`, so the exact same
# sequences must be used to build the space and to decode the result.
search_ranges = {
    "n_estimators": range(50, 500),
    "max_depth": range(1, 20),
    "min_samples_split": range(2, 20),
    "min_samples_leaf": range(1, 20),
}

space = {
    name: hp.choice(name, candidates)
    for name, candidates in search_ranges.items()
}

def hyperparameter_tuning(params):
    """Objective minimised by hyperopt.

    Parameters
    ----------
    params : dict
        Sampled values for 'n_estimators', 'max_depth', 'min_samples_split'
        and 'min_samples_leaf'.

    Returns
    -------
    dict
        'loss' is the negated mean cross-validation score of a random forest
        built with `params` on (X_train, y_train); 'status' is STATUS_OK.
    """
    rf = RandomForestClassifier(n_estimators=params['n_estimators'],
                                max_depth=params['max_depth'],
                                min_samples_split=params['min_samples_split'],
                                min_samples_leaf=params['min_samples_leaf'],
                                n_jobs=-1)
    acc = cross_val_score(rf, X_train, y_train).mean()
    # hyperopt minimises, so a higher score must map to a lower loss.
    return {"loss": -acc, "status": STATUS_OK}

trials = Trials()

best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

# Translate the hp.choice indices returned by fmin into actual values.
best_hyperparams = {
    name: search_ranges[name][best[name]]
    for name in search_ranges
}

best_rf = RandomForestClassifier(**best_hyperparams, n_jobs=-1)
best_rf.fit(X_train, y_train)

# Report the decoded hyperparameter values rather than the raw indices.
print("Best: {}".format(best_hyperparams))
trials.losses()

In [12]:
# Standardise the imputed training features with the scaler itself; the
# previous code called the imputer here, leaving `scaler` unused and
# `X_scaler` unscaled.
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X_train_imputed)

# 5-fold cross-validation on the training split. Because the estimator passed
# in is the RandomizedSearchCV object, the search is repeated inside each fold.
k = 5
k_fold = KFold(n_splits=k)
scores = cross_val_score(rand_search_rf, X_train_imputed, y_train, cv=k_fold)

In [None]:
# Standardise the training features with the scaler itself; the previous code
# called the imputer here, leaving `scaler` unused and `X_scaler` unscaled.
scaler = StandardScaler()
X_scaler = scaler.fit_transform(X_train)

# 5-fold cross-validation of the tuned forest on the training split.
k = 5
k_fold = KFold(n_splits=k)
scores = cross_val_score(best_rf, X_train, y_train, cv=k_fold)

In [None]:
# Evaluate the random-search model on the held-out (imputed) test split.
y_pred = rand_search_rf.predict(X_test_imputed)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Report the winning parameter set of this search, not whichever estimator
# `best_rf` currently points to.
print('Best hyperparameters:', rand_search_rf.best_params_)
# The label announces an average, so print the mean of the per-fold scores.
# NOTE(review): `scores` comes from whichever cross-validation cell ran last.
print("Precision promedio: ", scores.mean())

In [None]:
# Evaluate the tuned forest on the held-out test split.
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# The label announces an average, so print the mean of the per-fold scores.
# NOTE(review): `scores` comes from whichever cross-validation cell ran last.
print("Precision promedio: ", scores.mean())