In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold, cross_val_score 
from scipy.stats import randint
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler  
from sklearn.pipeline import Pipeline
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK


In [5]:
url = "http://dibresources.jcbose.ac.in/ssaha4/pulmopred/public/training.csv.txt"
ds = pd.read_csv(url)
ds = ds[~ds['Diagnosis'].isin(['DPLD', 'OSA', 'Sarcodiosis','Chest Pain'])]
ds = ds.drop(columns = ['Label'])
ds.dropna(inplace=True)
X_without_diagnosis = ds.drop('Diagnosis', axis = 1)


X = X_without_diagnosis
Y = ds['Diagnosis']


In [6]:
#X_without_diagnosis.head()
#ds.info()
#print(ds.isnull().sum())
#print(ds)
#ds.head()
ds.shape

(995, 13)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 0)

In [None]:
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20),
              'min_samples_split': randint(2, 20),
              'min_samples_leaf': randint(1, 20)}

rf = RandomForestClassifier()

rand_search_rf = RandomizedSearchCV(rf, param_dist, n_iter=5, cv=5)
rand_search_rf.fit(X_train_imputed, y_train)

best_rf = rand_search_rf.best_estimator_

In [12]:
rf = RandomForestClassifier()

space = {
    "n_estimators": hp.randint("n_estimators", [50,500]),
    "max_depth": hp.randint("max_depth", 1,20),
    'min_samples_split': hp.randint('min_samples_split', 2, 20),
    # 'min_samples_split': hp.randint('min_samples_split', 2, 20),
    'min_samples_leaf': hp.randint('min_samples_leaf', 1, 20)
    }

def hyperparameter_tuning(params):
    rf = RandomForestClassifier(**params, n_jobs=-1)
    acc = cross_val_score(rf, X_train, y_train).mean()
    return {"loss": -acc, "status": STATUS_OK}

trials = Trials()

best = fmin (
    fn=hyperparameter_tuning,
    space = space, 
    algo=tpe.suggest, 
    max_evals=100, 
    trials=trials
)

print("Best: {}".format(best))
trials.results
trials.losses()

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

 20%|██        | 20/100 [00:23<01:35,  1.19s/trial, best loss: -0.7828008948545863]


TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
scaler = StandardScaler()
X_scaler = imputer.fit_transform(X_train_imputed)

k = 5
k_fold = KFold(n_splits=k)
scores = cross_val_score(rand_search_rf, X_train_imputed, y_train, cv=k_fold) 

In [None]:
# model = Pipeline([
#         ('imputer', SimpleImputer(strategy='mean')),             
#         ('scaler', StandardScaler()),                          
#         ('rand_search_rf', RandomizedSearchCV(                  
#         RandomForestClassifier(),
#         param_dist,
#         n_iter=5,
#         cv=5))
# ])
model = Pipeline([('imputer', SimpleImputer(strategy='mean')),             
        ('scaler', StandardScaler()),                          
        ('rand_search_rf', RandomizedSearchCV(                  
        RandomForestClassifier(),
        param_dist,
        n_iter=5,
        cv=5))])
# model.train(X_test_imputed, y_train)
print(model.score())
# def objective_fn_for_ann_hyperopt(params, nfolds=k_fold):
#         try:
#                model = Pipeline([
#         ('imputer', SimpleImputer(strategy='mean')),             
#         ('scaler', StandardScaler()),                          
#         ('rand_search_rf', RandomizedSearchCV(                  
#         RandomForestClassifier(),
#         param_dist,
#         n_iter=5,
#         cv=5))])
#                model.train(X_test_imputed, y_test)
#                loss = model.mse()
#               #  loss = model._model_json['output']['cross_validation_metrics_summary'].as_data_frame(
#        #      ).iloc[5]['mean']
#                success = STATUS_OK #'ok'

#         except:
#                success = STATUS_FAIL  #'fail'              
#                loss = 0 #arbitrary number


#         return {'loss': loss, 'params': params,'status': success}
# trials = Trials()
# best = fmin(fn=objective_fn_for_ann_hyperopt, space=hp.uniform("x", -10, 10), algo=tpe.suggest, trials=trials, max_evals=100)
# print(best)
# best = fmin(fn=lambda x: -(x**2), space=hp.uniform('x', -10, 10), algo=tpe.suggest, trials=trials, max_evals=100)

In [None]:
y_pred = rand_search_rf.predict(X_test_imputed)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print('Best hyperparameters:', best_rf)
print("Precision promedio: ", scores)