# Imports And Consts

In [None]:
# Column labels for a results table, grouped by what they describe.
# NOTE(review): the results DataFrame built further down takes its columns from
# `functions.RESULTS_COLUMNS`, not from this list - confirm this copy is still needed.
RESULTS_COLUMNS = [
    "score", "numberOfTruePositives", "accuracy", "precision", "numberOfFeatures",
    "model", "model_parameters",
    "feature_selector", "selector_parameters",
]

In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.svm import SVC

import functions

# Load Data

Load the training and test data in the format specified by the task description.

In [None]:
# Each dataset file is whitespace-separated numeric text: one row per line,
# with every token converted to float.
with open("./Dataset/x_test.txt") as file:
    X_test = [list(map(float, line.split())) for line in file]

with open("./Dataset/x_train.txt") as file:
    X_train = [list(map(float, line.split())) for line in file]

# The labels are parsed the same way, so y_train is a list of rows (a nested
# list), not a flat list.
with open("./Dataset/y_train.txt") as file:
    y_train = [list(map(float, line.split())) for line in file]

X_test and X_train each contain 5000 observations with 500 features; y_train contains 5000 values.

In [None]:
# Report row and column counts for both feature matrices, then the label count.
for caption, matrix in (
    ("X_test datapoints:", X_test),
    ("X_train datapoints:", X_train),
):
    print(caption, len(matrix), " features:", len(matrix[0]))
print("y_train datapoints:", len(y_train))

X_test datapoints: 5000  features: 500
X_train datapoints: 5000  features: 500
y_train datapoints: 5000


In [None]:
# Batch 16
## Consts
# Path the experiment results are pickled to after the run and read back from
# in the Results section.
RESULTS_FILENAME = "./Results/Limited/results-16"
## Experiment
### Parameters
# HistGradientBoostingClassifier
# Each list holds the candidate values for the hyperparameter named by the variable.
# The lists are handed to functions.generateParameters together with globals();
# presumably the helper resolves each list's variable name through that mapping -
# verify in functions.generateParameters before renaming or aliasing any of them.
# Several names here (learning_rate, max_depth, random_state, max_features) are
# rebound by later sections, so statement order matters.
# This grid is only referenced from a commented-out entry in `models`.
learning_rate = [0.1, 0.01]
max_depth = [2, 3, 5, None]
random_state = [42]
# NOTE(review): scikit-learn documents HistGradientBoostingClassifier.max_features
# as a proportion in (0, 1]; the integer candidates may be rejected - confirm
# before enabling this grid.
max_features = [3, 5, 8, 1.0]
l2_regularization = [0, 0.5, 0.8]
histGradientBoostingParameters = functions.generateParameters(
    [learning_rate, max_depth, random_state, max_features, l2_regularization], globals()
)

# LDA
# Candidate values for the LDA grid; only referenced from a commented-out entry
# in `models`. `solver` is rebound later by the MLPClassifier section.
# NOTE(review): in scikit-learn, shrinkage is only supported with the 'lsqr' and
# 'eigen' solvers, so the svd + "auto" combination may fail unless it is
# filtered out downstream - confirm.
solver = ["svd", "lsqr"]
shrinkage = [None, "auto"]
n_components = [3, 5, 8, None]
ldaParameters = functions.generateParameters(
    [solver, shrinkage, n_components], globals()
)


# QDA
# Single-hyperparameter grid; only referenced from a commented-out entry in `models`.
reg_param = [0, 0.3, 0.5, 0.8]
qdaParameters = functions.generateParameters([reg_param], globals())

# KNN
# Candidate values for the KNN grid; only referenced from a commented-out entry
# in `models`.
n_neighbors = [3, 5, 8]
weights = ["uniform", "distance"]
p = [1, 2]
leaf_size = [15, 30, 50]
knnParameters = functions.generateParameters(
    [n_neighbors, weights, p, leaf_size], globals()
)

# SVM
# Candidate values for the SVC grid; only referenced from a commented-out entry
# in `models`.
C = [0.5, 1.0, 1.5]
kernel = ["linear", "rbf", "sigmoid", "poly"]
svcParameters = functions.generateParameters([C, kernel], globals())

# GradientBoostingClassifier
# Candidate values for the gradient-boosting grid; only referenced from a
# commented-out entry in `models`.
# learning_rate, max_depth, random_state and max_features rebind names first
# assigned in the HistGradientBoostingClassifier section; that grid was already
# generated above, so the earlier result is unaffected by these reassignments.
loss = ["exponential"]
learning_rate = [ 0.001]
n_estimators = [800]
min_samples_split = [2]
min_samples_leaf = [5, 7]
subsample = [1.0]
max_depth = [5, 8 ]
min_impurity_decrease = [1]
random_state = [42]
max_features = [None]
ccp_alpha = [0]
# Only min_samples_leaf and max_depth have more than one candidate value.
gradientBoostingParameters = functions.generateParameters(
    [
        loss,
        learning_rate,
        n_estimators,
        subsample,
        max_depth,
        random_state,
        max_features,
        ccp_alpha,
        min_samples_split,
        min_samples_leaf,
        min_impurity_decrease,
    ],
    globals(),
)


# MLPClassifier
# Candidate values for the MLP grid, which is enabled in `models` for this batch.
# solver, learning_rate and random_state rebind names used by earlier sections;
# alpha is rebound again later by the FPR selector section.

activation = ["relu", "tanh"]
solver = [ "adam"]
alpha = [ 0.01]
# NOTE(review): scikit-learn documents learning_rate as only used when
# solver='sgd'; with "adam" the "adaptive" setting presumably has no effect - confirm.
learning_rate = [ "adaptive"]
learning_rate_init = [0.01, 0.001]
hidden_layer_sizes = [(100,), (50,), (20,)]
max_iter = [1600]
random_state = [42]
mlpClassifierParameters = functions.generateParameters(
    [
        activation,
        solver,
        alpha,
        learning_rate,
        learning_rate_init,
        hidden_layer_sizes,
        max_iter,
        random_state,
    ],
    globals()
)
# Voting?
# `estimators` is a list with ONE candidate value: a list of three
# (name, estimator) tuples. The outer list is the grid axis, the inner list is
# the ensemble itself.
# votingParameters is only referenced from a commented-out entry in `models`.
estimators = [
    [
        (
            "1",
            functions.GradientBoostingClassifier(
                loss="exponential",
                learning_rate=0.01,
                n_estimators=200,
                random_state=42,
            ),
        ),
        ("2", functions.MLPClassifier(max_iter=800, random_state=42)),
        ("3", functions.QuadraticDiscriminantAnalysis()),
    ]
]
voting = ["soft"]

votingParameters = functions.generateParameters([estimators, voting], globals())

# AdaBoost
# Candidate values for the AdaBoost grid, which is enabled in `models` for this batch.
# `estimator` is rebound later by the RFE section; n_estimators, learning_rate
# and random_state rebind names used by earlier sections.
# NOTE(review): scikit-learn's AdaBoostClassifier requires the base estimator's
# fit() to accept sample_weight. The stock KNeighborsClassifier,
# LinearDiscriminantAnalysis and QuadraticDiscriminantAnalysis do not - confirm
# that the `functions.*` classes are wrappers that do, or that the experiment
# runner tolerates the resulting error.


estimator = [functions.KNeighborsClassifier(n_neighbors=5), 
            functions.LinearDiscriminantAnalysis(),
            functions.QuadraticDiscriminantAnalysis(),
            ]
n_estimators = [50]
learning_rate = [0.1,0.01]
random_state = [42]

adaBoostParameters = functions.generateParameters(
    [estimator, n_estimators, learning_rate, random_state], globals()
)

# Model families included in this batch. Each entry pairs a ModelType member
# with its generated parameter grid; commented-out entries are grids that are
# defined above but disabled for this run.
models = [
    # dict(model=functions.ModelType.LDA, parameters=ldaParameters),
    # dict(model=functions.ModelType.QDA, parameters=qdaParameters),
    # dict(model=functions.ModelType.KNN, parameters=knnParameters),
    # dict(model=functions.ModelType.SVC, parameters=svcParameters),
    # dict(
    #     model=functions.ModelType.GradientBoosting,
    #     parameters=gradientBoostingParameters,
    # ),
    # dict(
    #     model=functions.ModelType.HistGradientBoosting,
    #     parameters=histGradientBoostingParameters,
    # ),
    dict(model=functions.ModelType.MLPClassifier, parameters=mlpClassifierParameters),
    dict(model=functions.ModelType.ADABoost, parameters=adaBoostParameters),
    # dict(model=functions.ModelType.Voting, parameters=votingParameters),
]
# KBest
# Candidate values for the KBest selector grid, the only selector enabled in
# `featureSelectors` for this batch. `score_func` is rebound by the FPR section below.
k = [2, 3]
score_func = [mutual_info_classif]
kBestParameters = functions.generateParameters([score_func, k], globals())

# FPR
# Candidate values for the FPR selector grid; only referenced from a
# commented-out entry in `featureSelectors`.
# score_func rebinds the KBest name and alpha rebinds the MLPClassifier name;
# both of those grids were already generated above.

# mutual_info_classif seems to break for FPR but maybe try to run these without StandardScaler?
# NOTE(review): a likely cause is that FPR selection needs p-values, which
# mutual_info_classif does not return - in that case scaling would not help; confirm.
score_func = [f_classif]
alpha = [0.01]
fprParameters = functions.generateParameters([score_func, alpha], globals())


# RFE
# Candidate values for the RFE selector grid; only referenced from a
# commented-out entry in `featureSelectors`.
# `estimator` rebinds the name used by the AdaBoost section, whose grid was
# already generated above.
estimator = [SVC(kernel="linear")]
n_features_to_select = [2, 3]
# NOTE(review): scikit-learn treats a step in (0, 1) as the fraction of features
# removed per iteration - presumably 0.9 is meant as "drop 90% each round"; confirm.
step = [0.9]
rfeParameters = functions.generateParameters(
    [estimator, n_features_to_select, step], globals()
)


# Feature selectors included in this batch, each paired with its parameter grid.
# FPR and RFE are defined above but disabled for this run.
featureSelectors = [
    dict(model=functions.FeatureSelectorType.KBest, parameters=kBestParameters),
    # dict(model=functions.FeatureSelectorType.FPR, parameters=fprParameters),
    # dict(model=functions.FeatureSelectorType.RFE, parameters=rfeParameters),
]
# Scalers included in this batch. Each has a single empty parameter set; only
# the Robust scaler is enabled for this run.
scalers = [
    # dict(model=functions.Scaler.NoScaling, parameters=[{}]),
    # dict(model=functions.Scaler.Standard, parameters=[{}]),
    dict(model=functions.Scaler.Robust, parameters=[{}]),
]

# Polynomial feature generation: a single candidate degree.
degree = [2]

polynomialParameters = functions.generateParameters([degree], globals())

# Feature generators included in this batch: a pass-through option with one
# empty parameter set, and polynomial generation with its parameter grid.
featureGenerators = [
    dict(model=functions.FeatureGenerator.NoFeatureGeneration, parameters=[{}]),
    dict(model=functions.FeatureGenerator.Polynomial, parameters=polynomialParameters),
]
### Conducting the experiment
# Flatten the nested label rows into a 1-D array (row-major order) and cast the
# float labels to int.
y_train_ravel = np.asarray(y_train).ravel(order="C").astype(int)
%%time
results = functions.conductExperimentsWithScalersAndGenerators(
    models=models,
    featureSelectors=featureSelectors,
    X_orig=X_train,
    y_orig=y_train_ravel,
    scalers=scalers,
    featureGenerators=featureGenerators,
    getLimitedScore=True,
    limit=0.45,

)
with open(RESULTS_FILENAME, "wb") as f:
    pickle.dump(results, f)
## Results
# Read the pickled experiment results back from disk and tabulate them.
# Unpickling can execute arbitrary code, so only load files written by this notebook.
with open(RESULTS_FILENAME, "rb") as input_file:
    results = pickle.loads(input_file.read())
resultsDf = pd.DataFrame(data=results, columns=functions.RESULTS_COLUMNS)
### Detailed Results
%%time
processedResultsDf, parameters = functions.extractParameterResultsArr(
    resultsDf,
    [models, featureSelectors, scalers, featureGenerators],
    [
        "model_parameters",
        "selector_parameters",
        "scaler_parameters",
        "feature_generator_parameters",
    ],
)
functions.drawParameterResultsBarplot(processedResultsDf, parameters)
### Score/Accuracy by number of features
functions.drawResultsPerNumberOfFeatures(processedResultsDf)
### Final Results for Feature Selectors
# Each plot gets its own figure and axes, so consecutive boxplots are never
# drawn on top of each other when these statements run together.

# Score distribution per feature selector.
fig, ax = plt.subplots()
sns.boxplot(data=processedResultsDf, x="feature_selector", y="score", ax=ax)
ax.set_title("Final comparison of feature selectors")
fig.tight_layout()
plt.show()

# Score distribution per model, split by feature generator.
fig, ax = plt.subplots()
sns.boxplot(
    data=processedResultsDf, x="model", y="score", hue="feature_generator", ax=ax
)
ax.set_title("Model scores by feature generator")
fig.tight_layout()
plt.show()

# Score distribution per model, split by scaler.
fig, ax = plt.subplots()
sns.boxplot(data=processedResultsDf, x="model", y="score", hue="scaler", ax=ax)
ax.set_title("Model scores by scaler")
fig.tight_layout()
plt.show()
### Final Results for Models
# Score distribution per model, drawn on the current axes and titled through
# the Axes object that seaborn returns.
modelsAx = sns.boxplot(data=processedResultsDf, x="model", y="score")
modelsAx.set_title("Final comparison of models")
plt.tight_layout()
plt.show()
### Final Results
%%time
filteredDf = functions.filterDataframeByBestResults(processedResultsDf)
processedResultsDf[processedResultsDf["scaler"]==functions.Scaler.NoScaling.name].sort_values(by="score",ascending=False).head(5)
filteredDf['scaler']
filteredDf['model_parameters'].to_numpy()
filteredDf['feature_generator']
filteredDf


%%time
filteredDf = functions.filterDataframeByBestResults(processedResultsDf)
processedResultsDf[processedResultsDf["scaler"]==functions.Scaler.NoScaling.name].sort_values(by="score",ascending=False).head(5)
filteredDf['scaler']
filteredDf['model_parameters'].to_numpy()
filteredDf['feature_generator']
filteredDf

