In [28]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold, cross_val_score 
from scipy.stats import randint
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler  
from sklearn.pipeline import Pipeline


In [29]:
url = "http://dibresources.jcbose.ac.in/ssaha4/pulmopred/public/training.csv.txt"
ds = pd.read_csv(url)
ds = ds[~ds['Diagnosis'].isin(['DPLD', 'OSA', 'Sarcodiosis','Chest Pain'])]
ds = ds.drop(columns = ['Label'])
ds.dropna(inplace=True)
X_without_diagnosis = ds.drop('Diagnosis', axis = 1)


X = X_without_diagnosis
Y = ds['Diagnosis']


In [49]:
#X_without_diagnosis.head()
#ds.info()
#print(ds.isnull().sum())
#print(ds)
#ds.head()
ds.shape

(995, 13)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state = 0)

In [42]:
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20),
              'min_samples_split': randint(2, 20),
              'min_samples_leaf': randint(1, 20)}

rf = RandomForestClassifier()

rand_search_rf = RandomizedSearchCV(rf, param_dist, n_iter=5, cv=5)
rand_search_rf.fit(X_train_imputed, y_train)

best_rf = rand_search_rf.best_estimator_

In [45]:
scaler = StandardScaler()
X_scaler = imputer.fit_transform(X_train_imputed)

k = 5
k_fold = KFold(n_splits=k)
scores = cross_val_score(rand_search_rf, X_train_imputed, y_train, cv=k_fold) 

In [46]:
model = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),             
        ('scaler', StandardScaler()),                          
        ('rand_search_rf', RandomizedSearchCV(                  
        RandomForestClassifier(),
        param_dist,
        n_iter=5,
        cv=5))
])

In [47]:
y_pred = rand_search_rf.predict(X_test_imputed)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print('Best hyperparameters:', best_rf)
print("Precision promedio: ", scores)

Accuracy: 0.7630522088353414
Best hyperparameters: RandomForestClassifier(max_depth=7, min_samples_split=13, n_estimators=111)
Precision promedio:  [0.82666667 0.77852349 0.80536913 0.73825503 0.75167785]
