In [1]:
from main import file_locate
from ml_models import logistic_regression, naive_bayes, k_nearest_neighbors
from ml_models import support_vector, gradient_boost, random_forest
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import recall_score, f1_score
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [2]:
def statistics(labels_test, labels_pred):
    """Show the confusion matrix and print headline classification metrics.

    The confusion-matrix figure is rendered first, followed by accuracy,
    recall, F1 and the full scikit-learn classification report on stdout.

    Args:
        labels_test: Ground-truth labels; passed as the first argument to
            every scikit-learn metric used here.
        labels_pred: Predicted labels; passed as the second argument.

    Returns:
        None. The function only plots and prints.
    """
    # The display labels are fixed to [False, True], i.e. a two-class problem
    # is assumed -- NOTE(review): confirm the target is always binary.
    matrix_plot = metrics.ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(labels_test, labels_pred),
        display_labels=[False, True],
    )
    matrix_plot.plot()
    plt.show()

    # Scalar metrics, each computed immediately before it is printed.
    print("Accuracy: ", accuracy_score(labels_test, labels_pred))
    print("Recall: ", recall_score(labels_test, labels_pred))
    print("F1: ", f1_score(labels_test, labels_pred))

    # Per-class precision / recall / F1 table.
    print("Classification Report:")
    print(classification_report(labels_test, labels_pred))

In [3]:
def auc(labels_test, labels_pred):
    """Plot the ROC curve and report the area under it in the legend.

    Args:
        labels_test: Ground-truth labels, forwarded to ``roc_curve`` and
            ``roc_auc_score`` as the first argument.
        labels_pred: Predictions forwarded as the second argument.
            NOTE(review): callers in this notebook pass ``labels_pred``; if
            those are hard class labels rather than scores or probabilities,
            the curve has a single operating point -- confirm.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(labels_test, labels_pred)
    roc_area = roc_auc_score(labels_test, labels_pred)
    curve_label = 'ROC Curve (AUC = {:.2f})'.format(roc_area)

    # Model curve plus the diagonal that a random classifier would produce.
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
    plt.plot([0, 1], [0, 1], 'k--', label='Random')

    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc='lower right')
    plt.show()

In [4]:
def precision_recall(labels_test, labels_pred):
    """Plot the precision-recall curve for a set of predictions.

    Args:
        labels_test: Ground-truth labels, forwarded to
            ``precision_recall_curve`` as the first argument.
        labels_pred: Predictions forwarded as the second argument.
            NOTE(review): if these are hard class labels rather than scores,
            the curve has very few points -- confirm what callers pass.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # The thresholds returned as the third value are not used for plotting.
    precision, recall, _ = precision_recall_curve(labels_test, labels_pred)
    plt.plot(recall, precision, label='Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.grid(True)
    # Without an explicit legend call the `label` given to plt.plot above is
    # never rendered; the ROC plot in this notebook already draws its legend.
    plt.legend()
    plt.show()

In [5]:
def analysis(labels_test, labels_pred):
    """Run every evaluation report defined in this notebook, in order.

    Calls ``statistics``, then ``auc``, then ``precision_recall`` with the
    same pair of arguments.

    Args:
        labels_test: Ground-truth labels.
        labels_pred: Predicted labels.

    Returns:
        None. Each report plots and/or prints its own output.
    """
    for report in (statistics, auc, precision_recall):
        report(labels_test, labels_pred)

In [6]:
# Invoke the project helper from `main` with this notebook's file name.
# NOTE(review): presumably this adjusts the working directory so the relative
# CSV name below resolves -- confirm against main.file_locate.
file_locate('ml-analysis.ipynb')
# Dataset file name handed to every ml_models helper in the cells below.
filename = 'cleaned-framingham.csv'

In [None]:
# Logistic regression experiments.
#
# Layout of this cell:
#   1. hyper-parameter searches for the two resampled + scaled pipelines,
#   2. evaluation of progressively richer pipelines (plain, scaled,
#      random over-sampling + scaling, SMOTE + scaling),
#   3. evaluation of the two pipelines with explicit hyper-parameters.
# Each evaluation helper call returns four values, unpacked as
# (labels_train, labels_train_pred, labels_test, labels_pred); only the test
# pair is passed on to `analysis`.

# --- 1. Hyper-parameter searches -------------------------------------------
# NOTE(review): `hyperparameter_search` lives in ml_models; presumably it
# reports the best parameters found -- confirm.
smote_scaled = Pipeline([
        ('sampling', SMOTE(random_state=42)),
        ('scaling', StandardScaler()),
        ('classification', LogisticRegression(random_state=42))
    ])
logistic_regression.hyperparameter_search(filename, smote_scaled)

sampled_scaled = Pipeline([
        ('sampling', RandomOverSampler(random_state=42)),
        ('scaling', StandardScaler()),
        ('classification', LogisticRegression(random_state=42))
    ])
logistic_regression.hyperparameter_search(filename, sampled_scaled)

# --- 2. Untuned pipelines ---------------------------------------------------
# Classifier only; max_iter is raised to 1000 for this unscaled variant.
base = Pipeline([
        ('classification', LogisticRegression(max_iter=1000))
    ])
labels_train, labels_train_pred, labels_test, labels_pred = logistic_regression.logistic_regression(filename, base)
analysis(labels_test, labels_pred)

# Standardised features, no resampling.
scaled = Pipeline([
        ('scaling', StandardScaler()),
        ('classification', LogisticRegression())
    ])
labels_train, labels_train_pred, labels_test, labels_pred = logistic_regression.logistic_regression(filename, scaled)
analysis(labels_test, labels_pred)

# Random over-sampling followed by scaling.
sampled_scaled = Pipeline([
        ('sampling', RandomOverSampler(random_state=42)),
        ('scaling', StandardScaler()),
        ('classification', LogisticRegression())
    ])
labels_train, labels_train_pred, labels_test, labels_pred = logistic_regression.logistic_regression(filename, sampled_scaled)
analysis(labels_test, labels_pred)

# SMOTE followed by scaling.
smote_scaled = Pipeline([
        ('sampling', SMOTE(random_state=42)),
        ('scaling', StandardScaler()),
        ('classification', LogisticRegression())
    ])
labels_train, labels_train_pred, labels_test, labels_pred = logistic_regression.logistic_regression(filename, smote_scaled)
analysis(labels_test, labels_pred)

# --- 3. Pipelines with explicit hyper-parameters ---------------------------
# NOTE(review): these values were presumably copied from the searches above --
# confirm. Both models use the liblinear solver, which shuffles the data using
# `random_state`; it is pinned to 42 here (matching the search pipelines) so
# that the reported metrics are reproducible across runs.
hyperparameter_sampled_scaled = Pipeline([
        ('sampling', RandomOverSampler(random_state=42)),
        ('scaling', StandardScaler()),
        ('classification', LogisticRegression(C=0.1, max_iter=50, penalty='l1', solver='liblinear', random_state=42))
    ])
labels_train, labels_train_pred, labels_test, labels_pred = logistic_regression.logistic_regression(filename, hyperparameter_sampled_scaled)
analysis(labels_test, labels_pred)

hyperparameter_smote_scaled = Pipeline([
        ('sampling', SMOTE(random_state=42)),
        ('scaling', StandardScaler()),
        ('classification', LogisticRegression(C=0.01, max_iter=50, penalty='l2', solver='liblinear', random_state=42))
    ])
labels_train, labels_train_pred, labels_test, labels_pred = logistic_regression.logistic_regression(filename, hyperparameter_smote_scaled)
analysis(labels_test, labels_pred)

In [None]:
# Complement naive Bayes experiments: two hyper-parameter searches, then
# evaluation of the plain, over-sampled and SMOTE pipelines, and finally the
# two pipelines with explicit hyper-parameters. No scaling step is used in
# any pipeline of this cell.

# --- Hyper-parameter searches ----------------------------------------------
smote = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('classification', ComplementNB()),
])
naive_bayes.hyperparameter_search(filename, smote)

sampled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('classification', ComplementNB()),
])
naive_bayes.hyperparameter_search(filename, sampled)

# --- Untuned pipelines ------------------------------------------------------
# Classifier only.
base = Pipeline(steps=[
    ('classification', ComplementNB()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = naive_bayes.naive_bayes(filename, base)
analysis(labels_test, labels_pred)

# Random over-sampling.
sampled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('classification', ComplementNB()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = naive_bayes.naive_bayes(filename, sampled)
analysis(labels_test, labels_pred)

# SMOTE over-sampling.
smote = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('classification', ComplementNB()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = naive_bayes.naive_bayes(filename, smote)
analysis(labels_test, labels_pred)

# --- Pipelines with explicit hyper-parameters ------------------------------
# NOTE(review): values presumably taken from the searches above -- confirm.
hyperparameter_sampled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('classification', ComplementNB(
        alpha=0.5,
        fit_prior=True,
        norm=False,
    )),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = naive_bayes.naive_bayes(filename, hyperparameter_sampled)
analysis(labels_test, labels_pred)

hyperparameter_smote = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('classification', ComplementNB(
        alpha=0.1,
        fit_prior=True,
        norm=False,
    )),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = naive_bayes.naive_bayes(filename, hyperparameter_smote)
analysis(labels_test, labels_pred)

In [None]:
# k-nearest-neighbours experiments: the search pipelines are built but the
# searches themselves are currently disabled; the rest of the cell evaluates
# the plain, scaled, over-sampled + scaled and SMOTE + scaled pipelines, then
# the two pipelines with explicit hyper-parameters.

# --- Hyper-parameter searches (disabled) -----------------------------------
# NOTE(review): the search calls are commented out, presumably because of
# their runtime -- confirm. The pipelines are still constructed here and are
# re-bound further down before being evaluated.
smote_scaled = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', KNeighborsClassifier()),
])
# k_nearest_neighbors.hyperparameter_search(filename, smote_scaled)

sampled_scaled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', KNeighborsClassifier()),
])
# k_nearest_neighbors.hyperparameter_search(filename, sampled_scaled)

# --- Untuned pipelines ------------------------------------------------------
# Classifier only.
base = Pipeline(steps=[
    ('classification', KNeighborsClassifier()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = k_nearest_neighbors.k_nearest(filename, base)
analysis(labels_test, labels_pred)

# Standardised features, no resampling.
scaled = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('classification', KNeighborsClassifier()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = k_nearest_neighbors.k_nearest(filename, scaled)
analysis(labels_test, labels_pred)

# Random over-sampling followed by scaling.
sampled_scaled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', KNeighborsClassifier()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = k_nearest_neighbors.k_nearest(filename, sampled_scaled)
analysis(labels_test, labels_pred)

# SMOTE followed by scaling.
smote_scaled = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', KNeighborsClassifier()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = k_nearest_neighbors.k_nearest(filename, smote_scaled)
analysis(labels_test, labels_pred)

# --- Pipelines with explicit hyper-parameters ------------------------------
# NOTE(review): values presumably come from an earlier search run -- confirm.
hyperparameter_sampled_scaled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', KNeighborsClassifier(
        algorithm='brute',
        metric='manhattan',
        n_neighbors=13,
        weights='uniform',
    )),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = k_nearest_neighbors.k_nearest(filename, hyperparameter_sampled_scaled)
analysis(labels_test, labels_pred)

hyperparameter_smote_scaled = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', KNeighborsClassifier(
        algorithm='brute',
        metric='manhattan',
        n_neighbors=11,
        weights='distance',
    )),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = k_nearest_neighbors.k_nearest(filename, hyperparameter_smote_scaled)
analysis(labels_test, labels_pred)

In [None]:
# Support-vector classifier experiments: the search pipelines are built but
# the searches themselves are currently disabled; the rest of the cell
# evaluates the plain, scaled, over-sampled + scaled and SMOTE + scaled
# pipelines, then the two pipelines with explicit hyper-parameters.

# --- Hyper-parameter searches (disabled) -----------------------------------
# NOTE(review): the search calls are commented out, presumably because of
# their runtime -- confirm. The pipelines are still constructed here and are
# re-bound further down before being evaluated.
smote_scaled = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', SVC()),
])
# support_vector.hyperparameter_search(filename, smote_scaled)

sampled_scaled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', SVC()),
])
# support_vector.hyperparameter_search(filename, sampled_scaled)

# --- Untuned pipelines ------------------------------------------------------
# Classifier only.
base = Pipeline(steps=[
    ('classification', SVC()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = support_vector.support_vector(filename, base)
analysis(labels_test, labels_pred)

# Standardised features, no resampling.
scaled = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('classification', SVC()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = support_vector.support_vector(filename, scaled)
analysis(labels_test, labels_pred)

# Random over-sampling followed by scaling.
sampled_scaled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', SVC()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = support_vector.support_vector(filename, sampled_scaled)
analysis(labels_test, labels_pred)

# SMOTE followed by scaling.
smote_scaled = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', SVC()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = support_vector.support_vector(filename, smote_scaled)
analysis(labels_test, labels_pred)

# --- Pipelines with explicit hyper-parameters ------------------------------
# NOTE(review): values presumably come from an earlier search run -- confirm.
hyperparameter_sampled_scaled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', SVC(
        C=1000,
        class_weight='balanced',
        gamma='scale',
        kernel='linear',
        shrinking=False,
    )),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = support_vector.support_vector(filename, hyperparameter_sampled_scaled)
analysis(labels_test, labels_pred)

hyperparameter_smote_scaled = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('scaling', StandardScaler()),
    ('classification', SVC(
        C=0.001,
        class_weight=None,
        gamma='scale',
        kernel='linear',
        shrinking=False,
    )),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = support_vector.support_vector(filename, hyperparameter_smote_scaled)
analysis(labels_test, labels_pred)

In [None]:
# XGBoost experiments: two hyper-parameter searches, then evaluation of the
# plain, over-sampled and SMOTE pipelines, and finally the two pipelines with
# explicit hyper-parameters. No scaling step is used in this cell.

# --- Hyper-parameter searches ----------------------------------------------
smote = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('classification', xgb.XGBClassifier()),
])
gradient_boost.hyperparameter_search(filename, smote)

sampled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('classification', xgb.XGBClassifier()),
])
gradient_boost.hyperparameter_search(filename, sampled)

# --- Untuned pipelines ------------------------------------------------------
# Classifier only.
base = Pipeline(steps=[
    ('classification', xgb.XGBClassifier()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = gradient_boost.gradient_boost(filename, base)
analysis(labels_test, labels_pred)

# Random over-sampling.
sampled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('classification', xgb.XGBClassifier()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = gradient_boost.gradient_boost(filename, sampled)
analysis(labels_test, labels_pred)

# SMOTE over-sampling.
smote = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('classification', xgb.XGBClassifier()),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = gradient_boost.gradient_boost(filename, smote)
analysis(labels_test, labels_pred)

# --- Pipelines with explicit hyper-parameters ------------------------------
# Both tuned variants use the same classifier settings and differ only in the
# sampler. NOTE(review): values presumably taken from the searches above --
# confirm. Also worth checking whether the default updater of the linear
# booster is deterministic in the installed XGBoost version.
hyperparameter_sampled = Pipeline(steps=[
    ('sampling', RandomOverSampler(random_state=42)),
    ('classification', xgb.XGBClassifier(
        booster='gblinear',
        learning_rate=0.1,
        n_estimators=1000,
    )),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = gradient_boost.gradient_boost(filename, hyperparameter_sampled)
analysis(labels_test, labels_pred)

hyperparameter_smote = Pipeline(steps=[
    ('sampling', SMOTE(random_state=42)),
    ('classification', xgb.XGBClassifier(
        booster='gblinear',
        learning_rate=0.1,
        n_estimators=1000,
    )),
])
(labels_train, labels_train_pred,
 labels_test, labels_pred) = gradient_boost.gradient_boost(filename, hyperparameter_smote)
analysis(labels_test, labels_pred)

In [None]:
# Random forest experiments.
#
# Layout of this cell:
#   1. hyper-parameter searches for the SMOTE and random over-sampling
#      pipelines,
#   2. evaluation of the plain, over-sampled and SMOTE pipelines, plus the
#      two resampled pipelines with an added scaling step,
#   3. evaluation of the two pipelines with explicit hyper-parameters.
# Each evaluation helper call returns four values, unpacked as
# (labels_train, labels_train_pred, labels_test, labels_pred); only the test
# pair is passed on to `analysis`.
#
# Random forests draw bootstrap samples and feature subsets at random, so
# every classifier below is given random_state=42 (the same seed the samplers
# already use). Without it the printed metrics change on every run.

# --- 1. Hyper-parameter searches -------------------------------------------
# NOTE(review): `hyperparameter_search` lives in ml_models; presumably it
# reports the best parameters found -- confirm.
smote = Pipeline([
        ('sampling', SMOTE(random_state=42)),
        ('classification', RandomForestClassifier(random_state=42))
    ])
random_forest.hyperparameter_search(filename, smote)

sampled = Pipeline([
        ('sampling', RandomOverSampler(random_state=42)),
        ('classification', RandomForestClassifier(random_state=42))
    ])
random_forest.hyperparameter_search(filename, sampled)

# --- 2. Untuned pipelines ---------------------------------------------------
# Classifier only.
base = Pipeline([
        ('classification', RandomForestClassifier(random_state=42))
    ])
labels_train, labels_train_pred, labels_test, labels_pred = random_forest.random_forest(filename, base)
analysis(labels_test, labels_pred)

# Random over-sampling.
sampled = Pipeline([
        ('sampling', RandomOverSampler(random_state=42)),
        ('classification', RandomForestClassifier(random_state=42))
    ])
labels_train, labels_train_pred, labels_test, labels_pred = random_forest.random_forest(filename, sampled)
analysis(labels_test, labels_pred)

# SMOTE over-sampling.
smote = Pipeline([
        ('sampling', SMOTE(random_state=42)),
        ('classification', RandomForestClassifier(random_state=42))
    ])
labels_train, labels_train_pred, labels_test, labels_pred = random_forest.random_forest(filename, smote)
analysis(labels_test, labels_pred)

# Random over-sampling followed by scaling.
sampled_scaled = Pipeline([
        ('sampling', RandomOverSampler(random_state=42)),
        ('scaling', StandardScaler()),
        ('classification', RandomForestClassifier(random_state=42))
    ])
labels_train, labels_train_pred, labels_test, labels_pred = random_forest.random_forest(filename, sampled_scaled)
analysis(labels_test, labels_pred)

# SMOTE followed by scaling.
smote_scaled = Pipeline([
        ('sampling', SMOTE(random_state=42)),
        ('scaling', StandardScaler()),
        ('classification', RandomForestClassifier(random_state=42))
    ])
labels_train, labels_train_pred, labels_test, labels_pred = random_forest.random_forest(filename, smote_scaled)
analysis(labels_test, labels_pred)

# --- 3. Pipelines with explicit hyper-parameters ---------------------------
# NOTE(review): values presumably copied from the searches above -- confirm.
# The two variants differ in the sampler and in n_estimators (200 vs 300).
hyperparameter_sampled = Pipeline([
        ('sampling', RandomOverSampler(random_state=42)),
        ('classification', RandomForestClassifier(criterion='entropy', max_depth=5, max_features='sqrt', n_estimators=200, random_state=42))
    ])
labels_train, labels_train_pred, labels_test, labels_pred = random_forest.random_forest(filename, hyperparameter_sampled)
analysis(labels_test, labels_pred)

hyperparameter_smote = Pipeline([
        ('sampling', SMOTE(random_state=42)),
        ('classification', RandomForestClassifier(criterion='entropy', max_depth=5, max_features='sqrt', n_estimators=300, random_state=42))
    ])
labels_train, labels_train_pred, labels_test, labels_pred = random_forest.random_forest(filename, hyperparameter_smote)
analysis(labels_test, labels_pred)