In [41]:
import pickle
import numpy as np
import pandas as pd
import itertools
from enum import Enum
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier

from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import (
    f_classif,
    mutual_info_classif,
    SelectKBest,
    RFE,
    RFECV,
    SelectFpr,
    SelectFdr,
    SelectFwe,
)
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score
import time
RESULTS_COLUMNS = [
    "score",
    "numberOfTruePositives",
    "accuracy",
    "precision",
    "numberOfFeatures",
    "model",
    "model_parameters",
    "feature_selector",
    "selector_parameters",
]

In [62]:
def getScore(y_true, y_pred, featuresUsed):
    """Get score based on y_true, y_pred and number of feature used

    Scoring function, based on which, the best model is selected.
    Score is calculated according to task description: +10 points for each
    correctly classified positive class, -200 points for each feature used
    """

    score = 0

    correct = 0

    for i, y in enumerate(y_true):
        if y == 1 and y_pred[i] == 1:
            correct += 1

    score = 10 * correct - 200 * featuresUsed
    score = 10 * correct
    return correct, score

In [63]:

with open("./Dataset/x_train.txt") as file:
    X_orig_train = [[float(digit) for digit in line.split()] for line in file]

with open("./Dataset/y_train.txt") as file:
    y_orig_train = [[float(digit) for digit in line.split()] for line in file]

with open("./Dataset/x_test.txt") as file:
    X_orig_test = [[float(digit) for digit in line.split()] for line in file]

In [64]:
print("X_test datapoints:", len(X_orig_test), " features:", len(X_orig_test[0]))
print("X_train datapoints:", len(X_orig_train), " features:", len(X_orig_train[0]))
print("y_train datapoints:", len(y_orig_train))

X_test datapoints: 5000  features: 500
X_train datapoints: 5000  features: 500
y_train datapoints: 5000


In [65]:
X_train, X_test, y_train, y_test = train_test_split(
    X_orig_train, y_orig_train, test_size=0.3, random_state=42
)
y_train_ravel = np.ravel(y_train, order="C")
y_train_ravel = y_train_ravel.astype(int)
y_test_ravel = np.ravel(y_test, order="C")
y_test_ravel = y_test_ravel.astype(int)

In [71]:
# score_func = [f_classif, mutual_info_classif]

kbest = SelectKBest(f_classif, k=3)

kbest.fit(X_train, y_train_ravel)   

X_train_kbest = kbest.transform(X_train)
X_test_kbest = kbest.transform(X_test)

In [78]:
model = MLPClassifier(hidden_layer_sizes=(200), solver='sgd')

model.fit(X_train_kbest, y_train_ravel)

y_pred = model.predict(X_test_kbest)

numberOfFeatures = len(X_train[0])

accuracy = accuracy_score(y_test_ravel, y_pred)

correct, score = getScore(y_test_ravel, y_pred, numberOfFeatures)

In [79]:
correct, score, accuracy

(391, 3910, 0.48733333333333334)

In [80]:
print(y_test_ravel[y_test_ravel == 1].shape, y_pred[y_pred == 1].shape)
print(y_test_ravel[y_test_ravel == 0].shape, y_pred[y_pred == 0].shape)

(746,) (805,)
(754,) (695,)
