In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score #confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold, cross_val_score 
from scipy.stats import randint
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler  


In [2]:
url = "http://dibresources.jcbose.ac.in/ssaha4/pulmopred/public/training.csv.txt"
ds = pd.read_csv(url)
ds = ds[~ds['Diagnosis'].isin(['DPLD', 'OSA', 'Sarcodiosis','Chest Pain'])]
ds = ds.drop(columns = ['Label'])
X_without_diagnosis = ds.drop('Diagnosis', axis = 1)

X = X_without_diagnosis
Y = ds['Diagnosis']


In [3]:
#X_without_diagnosis.head()
#ds.info()
print(ds.isnull().sum())

FEV1 Pre-BD Value     0
FEV1 Pre-BD Pred      1
FEV1 Post-BD Value    3
FEV1 Post-BD Pred     2
FVC Pre-BD Value      1
FVC Pre-BD Pred       1
FVC Post-BD Value     1
FVC Post-BD Pred      3
FEF Pre-BD Value      3
FEF Pre-BD Pred       5
FEF Post-BD Value     4
FEF Post-BD Pred      5
Diagnosis             0
dtype: int64


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state = 0)

In [5]:
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20),
              'min_samples_split': randint(2, 20),
              'min_samples_leaf': randint(1, 20)}

rf = RandomForestClassifier()
hgbc = HistGradientBoostingClassifier()

rand_search_rf = RandomizedSearchCV(rf, param_dist, n_iter=5, cv=5)
rand_search_rf.fit(X_train_imputed, y_train)

best_rf = rand_search_rf.best_estimator_

In [6]:
scaler = StandardScaler()
X_scaler = imputer.fit_transform(X_train_imputed)

k = 5
k_fold = KFold(n_splits=k)
scores = cross_val_score(rand_search_rf, X_train_imputed, y_train, cv=k_fold) 

In [8]:
y_pred = rand_search_rf.predict(X_test_imputed)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print('Best hyperparameters:', best_rf)
print("Precision promedio: ", scores)

Accuracy: 0.7847682119205298
Best hyperparameters: RandomForestClassifier(max_depth=11, min_samples_leaf=9, min_samples_split=15,
                       n_estimators=83)
Precision promedio:  [0.80141844 0.73758865 0.68085106 0.80851064 0.77857143]
