In [7]:
import pandas as pd

# loading the dataset
x_train = pd.read_csv('CompleteDataset/x_train_all.csv')
y_train = pd.read_csv('CompleteDataset/y_train_all.csv')
x_test = pd.read_csv('CompleteDataset/x_test_all.csv')
y_test = pd.read_csv('CompleteDataset/y_test_all.csv')

In [8]:
# performs z-normalization on our dataset

# normalize x_train
mean=x_train.mean()
std=x_train.std()
x_train_norm=(x_train-mean)/std

# normalize x_test
mean=x_test.mean()
std=x_test.std()
x_test_norm=(x_test-mean)/std

In [9]:
from sklearn.ensemble import IsolationForest

# uses isolation forest algorithm to find outliers

feature_names = [f'{str(col)}' for col in x_train.columns]
x_train_norm.columns = feature_names

x_train_norm_copy = x_train_norm.copy()
contamination = 0.41
iso_forest = IsolationForest(contamination=contamination, random_state=42)

outliers = iso_forest.fit_predict(x_train_norm_copy)

outlier_indices = x_train_norm_copy.index[outliers == -1]

x_train_no_outliers_norm = x_train_norm_copy.drop(outlier_indices)
x_train_no_outliers_norm.to_csv('x_train_no_outliers_norm.csv')

outlier_indices_df = pd.DataFrame({'outlier_index_norm': outlier_indices})
outlier_indices_df.to_csv('outlier_indices_norm.csv', index=False)

In [10]:
# applies outlier mining to y train

# loads the indices to be removed from the dataset
indices_to_remove_norm = pd.read_csv('outlier_indices_norm.csv')

# removes outliers found after normalizing from y train
y_train_filtered_outlier_norm = y_train[~y_train.index.isin(indices_to_remove_norm['outlier_index_norm'])]

# saves to csv for future use
y_train_filtered_outlier_norm.to_csv('y_train_filtered_outlier_norm.csv')

In [11]:
# load y train without outliers
y_train_filtered_outlier_norm=pd.read_csv('y_train_filtered_outlier_norm.csv')

# drop previous indices
y_train_filtered_outlier_norm.drop('Unnamed: 0',axis=1, inplace=True)

In [12]:
from sklearn.neural_network import MLPClassifier

# experiment by changing:
# act func,
# no of layers,
# size of layers,
# learning rate,
# epochs,
# momentum,
# validation threshold TODO

# mlp_clf = MLPClassifier(solver="sgd", random_state=42)

In [26]:
from sklearn.metrics import accuracy_score

def vary_activation(x_train, y_train):
    for activation in ["identity", "logistic", "tanh", "relu"]:
        mlp_clf = MLPClassifier(
            random_state=42,
            solver="sgd",
            activation="relu",
            hidden_layer_sizes=(100,),
            learning_rate_init=0.001,
            max_iter=200,
            momentum=0.9
        )
        mlp_clf.fit(x_train, y_train)
        y_pred = mlp_clf.predict(x_test)
        print(activation + ": " + str(accuracy_score(y_test, y_pred)))


In [13]:
# baseline with deafult values for the hyperparameters (excl. solver, using regression as done in the lectures)
mlp_clf = MLPClassifier(
    random_state=42,
    solver="sgd",
    activation="relu",
    hidden_layer_sizes=(100,),
    learning_rate_init=0.001,
    max_iter=200,
    momentum=0.9
)

In [14]:
from sklearn.metrics import accuracy_score

mlp_clf.fit(x_train, y_train)
y_pred = mlp_clf.predict(x_test)
print(accuracy_score(y_test, y_pred))

  y = column_or_1d(y, warn=True)


0.24271844660194175


In [15]:
mlp_clf.fit(x_train_norm, y_train)
y_pred = mlp_clf.predict(x_test)
print(accuracy_score(y_test, y_pred))

  y = column_or_1d(y, warn=True)


0.5805825242718446




In [16]:
mlp_clf.fit(x_train_no_outliers_norm, y_train_filtered_outlier_norm)
y_pred = mlp_clf.predict(x_test)
print(accuracy_score(y_test, y_pred))

  y = column_or_1d(y, warn=True)


0.3482200647249191




In [27]:
vary_activation(x_train_norm, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


identity: 0.5805825242718446


  y = column_or_1d(y, warn=True)


logistic: 0.5805825242718446


  y = column_or_1d(y, warn=True)


tanh: 0.5805825242718446
relu: 0.5805825242718446




In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

def grid_search(clf, x_train, y_train):
    # transform pandas dataset to a 1d numpy array
    y_train = y_train.to_numpy().ravel()
    # y_test = y_test.to_numpy().ravel()

    # dictionary of parameters to be varied (keys), and arrays of possible values (values)
    param_grid = {
        # number of layers and their sizes
        "hidden_layer_sizes": [[50], [100], [150], [50, 50], [100, 100], [150, 150]],
        # activation functions
        "activation": ["identity", "logistic", "tanh", "relu"],
        # solver, "sgd" is the only solver to use momentum, "adam" is the only other to use epochs
        # "solver": ["sgd"],
        # not required
        # "alpha": [0.0001, 0.05],
        # learning rate
        # "learning_rate": ["constant","adaptive"],
        # exponent use for invscaling
        "learning_rate_init": [0.001],
        # max epochs with no improvement before stopping
        # "n_iter_no_change": [10],
        # max number of epochs
        "max_iter": [200],
        # momentum
        "momentum": [0.9]
    }