In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import get_scorer_names
import time
import random

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [26]:
df = pd.read_csv('dataset/heart_v2.csv')
df.head()

Unnamed: 0,age,sex,BP,cholestrol,heart disease
0,70,1,130,322,1
1,67,0,115,564,0
2,57,1,124,261,1
3,64,1,128,263,0
4,74,0,120,269,0


In [27]:
X = df.drop('heart disease',axis=1)
y = df['heart disease']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)
X_train.shape, X_test.shape

((189, 4), (81, 4))

## ```Normal Classification```

In [29]:
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()#max_depth=3, criterion='log_loss', max_features='sqrt') # criterion = 'gini', max_depth=11, min_samples_split=2)
model3 = KNeighborsClassifier()
model4 = GaussianNB()
model5 = SVC()

In [30]:
model1.fit(X_train,y_train)
model2.fit(X_train,y_train)
model3.fit(X_train,y_train)
model4.fit(X_train,y_train)
model5.fit(X_train,y_train)

In [33]:
print('LogisticRegression :\t\t',model1.score(X_test, y_test)) # LogisticRegression
print('DecisionTreeClassifier :\t',model2.score(X_test, y_test)) # DecisionTreeClassifier
print('KNeighborsClassifier :\t\t',model3.score(X_test, y_test)) # KNeighborsClassifier
print('GaussianNB :\t\t\t',model4.score(X_test, y_test)) # GaussianNB
print('SVM :\t\t\t\t',model5.score(X_test, y_test)) # SVM 

LogisticRegression :		 0.654320987654321
DecisionTreeClassifier :	 0.6790123456790124
KNeighborsClassifier :		 0.654320987654321
GaussianNB :			 0.6666666666666666
SVM :				 0.6790123456790124


In [9]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1, oob_score=True)
classifier_rf.fit(X_train, y_train)

In [10]:
print("oob score:\t\t", classifier_rf.oob_score_) # checking the oob score
print("classifier score:\t", classifier_rf.score(X_test, y_test)) # checking the model score

oob score:		 0.6613756613756614
classifier score:	 0.6296296296296297


## ```Grid Search for Parameter Finetuning```

In [11]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [12]:
# Creating a dictionary of parameteres with their values being in lists
params = {
    'max_depth': [2,3,5,10,20],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200]
}

combos = len(params['max_depth'])*len(params['min_samples_leaf'])*len(params['n_estimators'])
combos

180

In [13]:
get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_

In [14]:
cv_cnt = random.randint(0,10)
begin = time.time()
grid_search = GridSearchCV(estimator= rf , param_grid=params, cv = 10, n_jobs=-1, verbose=1, scoring="f1")
grid_search.fit(X_train, y_train)
end = time.time()
t = end - begin

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


In [15]:
grid_search.best_params_

{'max_depth': 3, 'min_samples_leaf': 5, 'n_estimators': 30}

In [16]:
grid_search.best_score_

0.6595098039215685

In [17]:
# grid_search.cv_results_

##### getting oob score

In [18]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=3, min_samples_leaf=10, n_estimators=10, oob_score=True)
classifier_rf.fit(X_train, y_train)

In [19]:
print("oob score:\t\t", classifier_rf.oob_score_) # checking the oob score
print("classifier score:\t", classifier_rf.score(X_test, y_test)) # checking the model score
print("time taken:\t", t) # checking the model score

oob score:		 0.671957671957672
classifier score:	 0.6296296296296297
time taken:	 80.24937343597412


In [20]:
row_contents = [combos, cv_cnt ,classifier_rf.oob_score_, classifier_rf.score(X_test, y_test), t]
row_contents

[180, 9, 0.671957671957672, 0.6296296296296297, 80.24937343597412]

### Get data

In [21]:
from csv import writer

def append_list_as_row(file_name, list_of_elem):
    # Open file in append mode
    with open(file_name, 'a+', newline='') as write_obj:
        # Create a writer object from csv module
        csv_writer = writer(write_obj)
        # Add contents of list as last row in the csv file
        csv_writer.writerow(list_of_elem)

In [22]:
# Append a list as new line to an old csv file
append_list_as_row('Report/initialReport_2022-12-28.csv', row_contents)

In [23]:
report = pd.read_csv('Report/initialReport_2022-12-28.csv')
report

Unnamed: 0,combos,cv_cnt,oob_score,classifier_score,time-taken
0,180,10,0.671958,0.62963,63.822724
1,180,5,0.671958,0.62963,69.873795
2,180,6,0.671958,0.62963,62.151759
3,180,1,0.671958,0.62963,80.85548
4,180,10,0.671958,0.62963,79.011138
5,180,0,0.671958,0.62963,63.78821
6,180,1,0.671958,0.62963,58.28404
7,180,6,0.671958,0.62963,63.967118
8,180,9,0.671958,0.62963,75.165589
9,180,9,0.671958,0.62963,80.249373
