# Data processing
This notebook contains Python code to train ML models and make predictions based on the project data.

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import pandas as pd

all_data = pd.read_csv("data/project_train.csv")

# Target vector: the "Label" column as a flat 1-D array.
y = all_data["Label"].to_numpy()
# Feature matrix: every remaining column.
X = all_data.drop(columns="Label").to_numpy()

# Shared scaler instance. It is deliberately left unfitted here: it is used as
# the first step of the pipelines in the cells below, so the data is not
# standardised (or split) up front in this cell.
scaler = StandardScaler()



# Logistic regression
The basis of logistic regression in a binary classification problem such as ours is the logistic function $\sigma(\bold{x}) = \frac{1}{1 + e^{-\bold{\beta}\bold{x}}}$. The goal is then to determine $\bold{\beta}\in \mathbb{R}^{d+1}$ such that

$$
    P(y = 1 \mid \bold{x}) = \frac{1}{1 + e^{-\bold{\beta}  \bold{x}}}
$$

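To evaluate the predictions $y_{\text{pred}}$ against the true labels $y_{\text{test}}$, we use the classification accuracy, i.e. the fraction of samples whose predicted label $\hat{y}_i$ equals the true label $y_i$: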

$$
    \text{accuracy\_test}(y_{\text{test}}, y_{\text{pred}}) = \frac{1}{n_{\text{samples}}} \sum_{i=1}^{n_{\text{samples}}} 1_{y_i = \hat{y}_i}
$$

In [2]:
# Logistic-regression baseline. Standardisation is a pipeline step rather than
# a one-off transform of X, so the scaler is fitted as part of each
# cross-validation fit instead of on the full data set.
logReg = LogisticRegression()
pl = Pipeline(steps=[
    ('scaler', scaler),
    ('classifier', logReg),
])

# No `scoring` argument: cross_val_score falls back to the pipeline's own
# `score`, which for a classifier is mean accuracy.
logReg_scores = cross_val_score(pl, X, y, cv=5)

# Fixed-point ".1f" instead of the bare ".3": without a presentation type,
# ".3" means three significant digits and would render 100.0 as "1e+02".
print(f"Logistic regression mean accuracy {logReg_scores.mean() * 100:.1f}% over 5 fold cross-validation")

Logistic regression mean accuracy 71.3% over 5 fold cross-validation


# Discriminant analysis
In this section we use linear and quadratic discriminant analysis to classify the data.

In [3]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Linear and quadratic discriminant analysis, each behind the shared scaler so
# that standardisation is fitted as part of every cross-validation fit.
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()

lda_pl = Pipeline([
    ('scaler', scaler),
    ('classifier', lda),
])
qda_pl = Pipeline([
    ('scaler', scaler),
    ('classifier', qda),
])

# 5-fold cross-validated accuracy (the default score of a classifier pipeline).
lda_scores = cross_val_score(lda_pl, X, y, cv=5)
qda_scores = cross_val_score(qda_pl, X, y, cv=5)

# Fixed-point ".1f" instead of the bare ".3": without a presentation type,
# ".3" means three significant digits and would render 100.0 as "1e+02".
print(f"LDA mean accuracy {lda_scores.mean() * 100:.1f}% over 5 fold cross-validation")
print(f"QDA mean accuracy {qda_scores.mean() * 100:.1f}% over 5 fold cross-validation")

LDA mean accuracy 71.7% over 5 fold cross-validation
QDA mean accuracy 67.9% over 5 fold cross-validation


# Support vector classifier



In [4]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# RBF-kernel support vector classifier, placed behind the shared scaler.
clf = SVC(kernel="rbf")
svm_pl = Pipeline(steps=[('scaler', scaler), ('classifier', clf)])

# Candidate values for the RBF kernel coefficient: the two built-in heuristics
# followed by an explicit numeric sweep. The "classifier__" prefix routes the
# parameter to the pipeline step named 'classifier'.
param_grid = {
    'classifier__gamma': ['scale', 'auto'] + [
        0.001, 0.005, 0.01, 0.02, 0.03, 0.05,
        0.1, 0.2, 0.5, 1, 2, 5, 10,
    ],
}

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy.
grid_search = GridSearchCV(
    estimator=svm_pl,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X, y)

print(f"Best gamma: {grid_search.best_params_['classifier__gamma']}")
print(f"Best cross-validated accuracy: {grid_search.best_score_ * 100:.3f}%")

Best gamma: 0.03
Best cross-validated accuracy: 73.663%


# Neural network


In [6]:
from itertools import product
import sys
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import layers

def evaluate(epochs, batch_sizes, learning_rates, output_path="output.txt", random_state=None):
    """Grid-search a small dense network with 5-fold cross-validation.

    For every combination of ``epochs`` x ``batch_sizes`` x ``learning_rates``
    a fresh network is trained on each fold of the module-level ``X`` / ``y``
    arrays. Per-fold accuracy, mean accuracy and the variance of the fold
    accuracies are written as text to ``output_path``.

    Parameters
    ----------
    epochs : iterable of int
        Candidate values for the maximum number of training epochs (training
        may stop earlier because of early stopping on the training loss).
    batch_sizes : iterable of int
        Candidate mini-batch sizes.
    learning_rates : iterable of float
        Candidate SGD learning rates.
    output_path : str, optional
        File the report is written to; it is overwritten on every call.
    random_state : int or None, optional
        Seed for the fold shuffling only. With an integer, every
        hyper-parameter combination is evaluated on the same splits. Network
        weight initialisation is not seeded by this argument.

    Returns
    -------
    None
        The results are only written to ``output_path``.
    """
    # Local import so the function does not depend on an import elsewhere in
    # the notebook.
    from contextlib import redirect_stdout

    # redirect_stdout puts back whatever stream was active before the call.
    # Assigning sys.__stdout__ instead would, inside a notebook, send all
    # later prints to the process's original stdout rather than to the cell
    # output. The `with` block also closes the file if training raises.
    with open(output_path, "w") as log_file, redirect_stdout(log_file):
        for n_epochs, batch_size, learning_rate in product(epochs, batch_sizes, learning_rates):
            print("\n--------------------------")
            print(f"epochs: {n_epochs}, batch_size: {batch_size}, learning_rate: {learning_rate}")

            kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

            accuracies = []
            for fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
                print(f"Fold {fold}")

                # Fit the scaler on the training fold only and reuse those
                # statistics for the held-out fold; refitting on the test fold
                # would standardise it with its own mean and variance.
                fold_scaler = StandardScaler()
                x_train = fold_scaler.fit_transform(X[train_index])
                x_test = fold_scaler.transform(X[test_index])
                y_train = y[train_index]
                y_test = y[test_index]

                # Fresh model per fold so no weights carry over between folds.
                model = Sequential(
                    [
                        layers.Dense(16, activation='relu'),
                        layers.Dense(8, activation='relu'),
                        layers.Dense(8, activation='relu'),
                        layers.Dense(8, activation='relu'),
                        layers.Dense(1, activation='sigmoid'),
                    ]
                )
                early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

                # Use the hyper-parameters of the current grid point. They
                # were previously hard-coded, so every grid point trained the
                # same configuration.
                sgd = SGD(learning_rate=learning_rate)
                model.compile(optimizer=sgd, loss='binary_crossentropy', metrics=['accuracy'])
                model.fit(
                    x_train,
                    y_train,
                    epochs=n_epochs,
                    batch_size=batch_size,
                    callbacks=[early_stopping],
                    verbose=0,
                )

                # Threshold the sigmoid output at 0.5 and flatten the (n, 1)
                # predictions to match the 1-D label vector.
                y_pred = (model.predict(x_test, verbose=0) > 0.5).astype(int).ravel()
                fold_accuracy = accuracy_score(y_test, y_pred)
                accuracies.append(fold_accuracy)

                # ".1f" rather than the bare ".3", which would render a
                # perfect fold (100.0) as "1e+02".
                print(f"Accuracy: {fold_accuracy * 100:.1f} %")

            print(f"mean accuracy over kfold: {np.mean(accuracies) * 100:.3f} %")
            print(f"Variance  over kfold: {np.var(accuracies)} ")


# Run the sweep: 3 epoch settings x 3 batch sizes x 3 learning rates.
evaluate(
    epochs=[300, 200, 100],
    batch_sizes=[16, 32, 64],
    learning_rates=[0.005, 0.01, 0.05],
)