In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

from keras.models import Sequential
from keras.layers import *
from keras.optimizers.legacy import Adam
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler

from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression

# Apply seaborn's theme and make every figure 16x8 by default.
sns.set_theme(
    rc={
        "figure.figsize": (16, 8),
    }
)

def plot_confusion_matrix(Y_test, predicted):
    """Plot the confusion matrix of thresholded binary predictions.

    Args:
        Y_test: True class labels, one per sample.
        predicted: Predicted scores; any array-like that flattens to one value
            per sample (e.g. shape ``(n,)`` or ``(n, 1)``). Scores >= 0.5 are
            counted as class 1, everything else as class 0.

    Returns:
        None. The matrix is drawn via ``ConfusionMatrixDisplay.plot``.
    """
    # Flatten with NumPy rather than squeezing and looping element by element:
    # squeezing a single prediction gives a 0-d value that cannot be iterated,
    # and the vectorised comparison avoids a Python-level loop.
    scores = np.asarray(predicted).reshape(-1)
    predicted_labels = (scores >= 0.5).astype(int)

    actual = np.asarray(Y_test)
    conf_mat = confusion_matrix(actual, predicted_labels)
    ConfusionMatrixDisplay(confusion_matrix=conf_mat).plot()


def my_read_data(scale=False):
    """Load the cleaned adult dataset and split it into features and target.

    Reads ``adults_data/adult_cleaned_data.csv`` relative to the current
    working directory. Every column except the last is treated as a feature;
    the last column is the target.

    Args:
        scale: When True, every feature column is rescaled with
            ``MinMaxScaler`` (default settings). When False, the features are
            returned exactly as read.

    Returns:
        Tuple ``(X, Y)`` where ``X`` is a DataFrame of feature columns and
        ``Y`` is the Series holding the last column.
    """
    file_path = Path('adults_data', 'adult_cleaned_data.csv')
    df = pd.read_csv(file_path)

    X = df.iloc[:, :-1]
    Y = df.iloc[:, -1]

    if scale:
        # NOTE: the scaler is fit on the full dataset, i.e. before any
        # train/test split performed by the caller, so held-out rows influence
        # the per-column minima/maxima used for the training rows.
        scaled_values = MinMaxScaler().fit_transform(X)
        # Re-wrap the raw array so callers still receive a DataFrame with the
        # original column names and row index.
        X = pd.DataFrame(scaled_values, columns=X.columns, index=X.index)

    return X, Y

Preprocessing steps:

* scaling
* train/test split
* oversampling

In [20]:
def main(epochs=200, batch_size=30, scale=True):
    """Train a small dense binary classifier and plot how it performed.

    Loads the data through ``my_read_data``, holds out 20% of the rows as a
    test set, trains a 64-16-1 fully connected network with a sigmoid output,
    then plots per-epoch training/validation accuracy and the confusion matrix
    of the test-set predictions.

    Args:
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        scale: Forwarded to ``my_read_data``.

    Returns:
        None. Results are shown as figures and as the model summary / training
        log printed by keras.
    """
    X, Y = my_read_data(scale)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, shuffle=True, random_state=45
    )

    # Build: size the input layer from the data so the model keeps working if
    # the number of feature columns changes.
    n_features = X_train.shape[1]
    model = Sequential()
    model.add(Dense(64, "relu", input_shape=(n_features,)))
    model.add(Dense(16, "relu"))
    model.add(Dense(1, 'sigmoid'))

    # Compile
    model.compile(loss="binary_crossentropy", metrics=["accuracy"], optimizer=Adam())
    model.summary()

    # Fit & Predict: 15% of the training rows are held out for validation.
    model_fit = model.fit(
        X_train,
        Y_train,
        validation_split=0.15,
        batch_size=batch_size,
        epochs=epochs,
        verbose=True,
    )
    predicted = model.predict(X_test)

    # Get accuracy
    train_accuracy = model_fit.history['accuracy']
    val_accuracy = model_fit.history['val_accuracy']

    # Plot accuracy on its own figure so that later plots cannot draw onto it.
    # A separate name keeps the `epochs` argument intact.
    epoch_numbers = range(1, len(train_accuracy) + 1)
    fig, ax = plt.subplots()
    ax.set_ylim(0, 1)
    ax.plot(epoch_numbers, train_accuracy, label="Training set")
    ax.plot(epoch_numbers, val_accuracy, label="Validation set")
    ax.set_title('Training and Validation Accuracy')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Accuracy')
    ax.legend()

    # Evaluate the held-out predictions.
    plot_confusion_matrix(Y_test, predicted)

In [29]:
def classification_lr(scale=False, balance=False):
    """Fit a logistic-regression baseline and report its test-set performance.

    Loads the data through ``my_read_data``, holds out 20% of the rows as a
    test set, fits ``LogisticRegression`` on the training split, then prints
    the accuracy and classification report and shows the confusion matrix for
    the test split.

    Args:
        scale: Forwarded to ``my_read_data``.
        balance: When True, the training split is oversampled with
            ``RandomOverSampler`` before fitting. The test split is never
            resampled.

    Returns:
        None. Results are printed and shown as a figure.
    """
    X, Y = my_read_data(scale)

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, shuffle=True, random_state=45
    )

    if balance:
        # Resample only the training rows so the evaluation below still
        # reflects the class distribution of the held-out data.
        X_train, Y_train = RandomOverSampler(random_state=45).fit_resample(X_train, Y_train)

    # Used in every label below so the output says which variant was run.
    data_label = "balanced" if balance else "imbalanced"

    # Fit on the training labels (features and labels must come from the same split).
    clf = LogisticRegression().fit(X_train, Y_train)
    y_pred = clf.predict(X_test)

    classif_rep = classification_report(Y_test, y_pred)
    print(f'Accuracy {data_label} {accuracy_score(Y_test, y_pred):.2f}')
    print(f"\nClassification report for {data_label} data\n", classif_rep)

    cm_display = ConfusionMatrixDisplay.from_estimator(clf, X_test, Y_test)
    cm_display.ax_.set_title(f"Confusion matrix for {data_label} data")
    plt.show()