# Introduction to Machine Learning - Task 3

Group name: Cbbayes

Team members: mcolomer (mcolomer@student.ethz.ch), pratsink (pratsink@student.ethz.ch) and scastro (scastro@student.ethz.ch)

Spring 2021

## Libraries

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.neural_network import MLPClassifier

## Preprocessing: one-hot encoding of the sequences

In [12]:
# Indicate the path to the data file
path = "../data/"

train_data = pd.read_csv(path+"train.csv")
test_data = pd.read_csv(path+"test.csv")

aminoacids = ["R", "H", "K","D","E", "S","T","N","Q","C","U",
             "G","P","A","I","L","M","F","W","Y","V"]

charge = [1,1,1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
polarity=[1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]
special=[0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]


#Preprocessing training dataset

X_train = train_data["Sequence"].str.split("", n = 4, expand = True)

def map_charge(aa, charge, aminoacids):
    i = aminoacids.index(aa)
    return charge[i]

def map_polarity(aa, polarity, aminoacids):
    i = aminoacids.index(aa)
    return polarity[i]

def map_special(aa, special, aminoacids):
    i = aminoacids.index(aa)
    return special[i]


for i in [1,2,3,4]:
    train_data["pos"+str(i)] = X_train[i]
    train_data = pd.concat([train_data,pd.get_dummies(train_data["pos"+str(i)], prefix="pos"+str(i))],axis=1)
    train_data["charge"+str(i)] = train_data["pos"+str(i)].apply(lambda aa: map_charge(aa, charge, aminoacids))
    train_data["polarity"+str(i)] = train_data["pos"+str(i)].apply(lambda aa: map_polarity(aa, polarity, aminoacids))
    train_data["special"+str(i)] = train_data["pos"+str(i)].apply(lambda aa: map_special(aa, special, aminoacids))
    train_data = train_data.drop(columns=["pos"+str(i)])
    
X_train = train_data.iloc[:, 2:-1].values 
y_train = train_data.iloc[:,1].values 

#Preprocessing test dataset

X_test = test_data["Sequence"].str.split("", n = 4, expand = True)


for i in [1,2,3,4]:
    test_data["pos"+str(i)] = X_test[i]
    test_data = pd.concat([test_data,pd.get_dummies(test_data["pos"+str(i)], prefix="pos"+str(i))],axis=1)
    test_data["charge"+str(i)] = test_data["pos"+str(i)].apply(lambda aa: map_charge(aa, charge, aminoacids))
    test_data["polarity"+str(i)] = test_data["pos"+str(i)].apply(lambda aa: map_polarity(aa, polarity, aminoacids))
    test_data["special"+str(i)] = test_data["pos"+str(i)].apply(lambda aa: map_special(aa, special, aminoacids))    
    test_data = test_data.drop(columns=["pos"+str(i)])
    
X_test = test_data.iloc[:, 1:-1].values 


In [13]:
print(len(charge))
print(len(polarity))
print(len(special))

21
21
21


## Classification

Class are quite unbalanced:

In [14]:
print(len(y_train))
print(sum(y_train))

112000
4213


In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.neural_network import MLPClassifier

# Create the parameter grid based on the results of random search 
param_grid = {
    'hidden_layer_sizes': [50, 100, 200],
    "activation": ["relu","tanh"]
}
# Create a based model
cf = MLPClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = cf, param_grid = param_grid, 
                          cv = 2, n_jobs = -1, verbose = 2, scoring="f1")

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

y_test = grid_search.predict(X_test)

print("Ratio positive train:", sum(y_test)/len(y_test))

print(grid_search.cv_results_)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
{'activation': 'relu', 'hidden_layer_sizes': 200}
0.873475580285044
Ratio positive train: 0.0375625
{'mean_fit_time': array([ 90.23113453,  81.59921944, 124.18004191, 120.43785846,
       118.76266253, 151.55371642]), 'std_fit_time': array([1.95176852, 5.75963843, 3.49546707, 0.84179842, 1.45810759,
       4.20676041]), 'mean_score_time': array([0.25277555, 0.50424051, 0.45561647, 0.32310402, 0.35368836,
       0.36651599]), 'std_score_time': array([0.01036942, 0.21191525, 0.02559948, 0.00031197, 0.00053847,
       0.00954092]), 'param_activation': masked_array(data=['relu', 'relu', 'relu', 'tanh', 'tanh', 'tanh'],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_hidden_layer_sizes': masked_array(data=[50, 100, 200, 50, 100, 200],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'acti

In [16]:
pd.DataFrame(y_test).to_csv('output_MLP_relu_200hidden.csv', index = False, header = False)             ## Save the results dataframe to a csv