# Introduction to Machine Learning - Task 3

Group name: Cbbayes

Team members: mcolomer (mcolomer@student.ethz.ch), pratsink (pratsink@student.ethz.ch) and scastro (scastro@student.ethz.ch)

Spring 2021

## Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV

## Preprocessing: one-hot encoding of the sequences

For the preprocessing, we one-hot encode the sequences taking into account the position of each amino acid in the sequence. We also describe each amino acid by the following physicochemical properties:

- Charge
- Polar
- Non-polar
- Aromatic
- Special

In [2]:
# Directory that holds the input CSV files
path = "../data/"

# One row per amino acid:
# (one-letter code, charge, polar, non_polar, aromatic, special)
_AA_PROPERTY_TABLE = (
    ("R", 1, 0, 0, 0, 0),
    ("H", 1, 0, 0, 0, 0),
    ("K", 1, 0, 0, 0, 0),
    ("D", -1, 0, 0, 0, 0),
    ("E", -1, 0, 0, 0, 0),
    ("S", 0, 1, 0, 0, 0),
    ("T", 0, 1, 0, 0, 0),
    ("N", 0, 1, 0, 0, 0),
    ("Q", 0, 1, 0, 0, 0),
    ("C", 0, 1, 0, 0, 1),
    ("U", 0, 0, 0, 0, 1),
    ("G", 0, 0, 1, 0, 1),
    ("P", 0, 1, 0, 0, 1),
    ("A", 0, 0, 1, 0, 0),
    ("I", 0, 0, 1, 0, 0),
    ("L", 0, 0, 1, 0, 0),
    ("M", 0, 0, 1, 0, 0),
    ("F", 0, 0, 1, 1, 0),
    ("W", 0, 0, 1, 1, 0),
    ("Y", 0, 0, 1, 1, 0),
    ("V", 0, 0, 1, 0, 0),
)

# Transpose the table into the parallel lists used by the mapping functions:
# entry k of every property list describes aminoacids[k].
aminoacids, charge, polar, non_polar, aromatic, special = (
    list(column) for column in zip(*_AA_PROPERTY_TABLE)
)


# Functions to map the characteristics of the aminoacid in a particular position
def map_charge(aa, charge, aminoacids):
    """Return the charge value of amino acid ``aa``.

    Args:
        aa: Amino-acid code to look up.
        charge: Sequence of charge values aligned index-by-index with
            ``aminoacids``.
        aminoacids: List of amino-acid codes that defines the index of each
            entry in ``charge``.

    Returns:
        The entry of ``charge`` at the position of ``aa`` in ``aminoacids``.

    Raises:
        ValueError: If ``aa`` is not contained in ``aminoacids``.
    """
    return charge[aminoacids.index(aa)]

def map_polar(aa, polarity, aminoacids):
    """Return the polar flag of amino acid ``aa``.

    Args:
        aa: Amino-acid code to look up.
        polarity: Sequence of polar flags aligned index-by-index with
            ``aminoacids``.
        aminoacids: List of amino-acid codes that defines the index of each
            entry in ``polarity``.

    Returns:
        The entry of ``polarity`` at the position of ``aa`` in ``aminoacids``.

    Raises:
        ValueError: If ``aa`` is not contained in ``aminoacids``.
    """
    # Read from the table that was passed in instead of the module-level
    # ``polar`` list, so the argument is actually honoured.
    return polarity[aminoacids.index(aa)]

def map_non_polar(aa, charge, aminoacids):
    """Return the non-polar flag of amino acid ``aa``.

    Args:
        aa: Amino-acid code to look up.
        charge: Sequence of non-polar flags aligned index-by-index with
            ``aminoacids``. The parameter name is kept unchanged for
            backward compatibility with existing callers.
        aminoacids: List of amino-acid codes that defines the index of each
            entry in the lookup table.

    Returns:
        The entry of the lookup table at the position of ``aa`` in
        ``aminoacids``.

    Raises:
        ValueError: If ``aa`` is not contained in ``aminoacids``.
    """
    # Read from the table that was passed in instead of the module-level
    # ``non_polar`` list, so the argument is actually honoured.
    return charge[aminoacids.index(aa)]

def map_aromatic(aa, polarity, aminoacids):
    """Return the aromatic flag of amino acid ``aa``.

    Args:
        aa: Amino-acid code to look up.
        polarity: Sequence of aromatic flags aligned index-by-index with
            ``aminoacids``. The parameter name is kept unchanged for
            backward compatibility with existing callers.
        aminoacids: List of amino-acid codes that defines the index of each
            entry in the lookup table.

    Returns:
        The entry of the lookup table at the position of ``aa`` in
        ``aminoacids``.

    Raises:
        ValueError: If ``aa`` is not contained in ``aminoacids``.
    """
    # Read from the table that was passed in instead of the module-level
    # ``aromatic`` list, so the argument is actually honoured.
    return polarity[aminoacids.index(aa)]

def map_special(aa, polarity, aminoacids):
    """Return the "special" flag of amino acid ``aa``.

    Args:
        aa: Amino-acid code to look up.
        polarity: Sequence of special flags aligned index-by-index with
            ``aminoacids``. The parameter name is kept unchanged for
            backward compatibility with existing callers.
        aminoacids: List of amino-acid codes that defines the index of each
            entry in the lookup table.

    Returns:
        The entry of the lookup table at the position of ``aa`` in
        ``aminoacids``.

    Raises:
        ValueError: If ``aa`` is not contained in ``aminoacids``.
    """
    # Read from the table that was passed in instead of the module-level
    # ``special`` list, so the argument is actually honoured.
    return polarity[aminoacids.index(aa)]

### Pre-processing training dataset

In [3]:
# Load the training set from the data directory
train_data = pd.read_csv(f"{path}train.csv")

# Split every sequence on the empty pattern; only columns 1-4 of the result
# are consumed below, one per sequence position.
train_residues = train_data["Sequence"].str.split("", n=4, expand=True)

# (column prefix, mapping function, lookup table) for each physicochemical
# property. The order here fixes the column order of the feature matrix.
train_property_maps = (
    ("charge", map_charge, charge),
    ("polar", map_polar, polar),
    ("non_polar", map_non_polar, non_polar),
    ("aromatic", map_aromatic, aromatic),
    ("special", map_special, special),
)

for pos in range(1, 5):
    pos_col = f"pos{pos}"
    train_data[pos_col] = train_residues[pos]

    # One-hot encode the residue found at this position
    residue_dummies = pd.get_dummies(train_data[pos_col], prefix=pos_col)
    train_data = pd.concat([train_data, residue_dummies], axis=1)

    # Append the property values of that residue, one column per property
    for feature, mapper, table in train_property_maps:
        train_data[f"{feature}{pos}"] = train_data[pos_col].apply(
            mapper, args=(table, aminoacids)
        )

    # The raw residue column is only a helper; remove it once it is encoded
    train_data = train_data.drop(columns=[pos_col])

# Separate features and class labels of the train dataset.
# NOTE(review): the `2:-1` slice stops before the last column, so the final
# engineered column is not part of X_train -- confirm this is intended.
X_train = train_data.iloc[:, 2:-1].values
y_train = train_data.iloc[:, 1].values

train_data.head()

Unnamed: 0,Sequence,Active,pos1_A,pos1_C,pos1_D,pos1_E,pos1_F,pos1_G,pos1_H,pos1_I,...,pos4_S,pos4_T,pos4_V,pos4_W,pos4_Y,charge4,polar4,non_polar4,aromatic4,special4
0,DKWL,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,FCHN,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,KDQP,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,FNWI,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,NKRM,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Pre-processing test dataset

In [4]:
# Load the test set from the data directory
test_data = pd.read_csv(f"{path}test.csv")

# Split every sequence on the empty pattern; only columns 1-4 of the result
# are consumed below, one per sequence position.
test_residues = test_data["Sequence"].str.split("", n=4, expand=True)

# (column prefix, mapping function, lookup table) for each physicochemical
# property. The order here fixes the column order of the feature matrix.
test_property_maps = (
    ("charge", map_charge, charge),
    ("polar", map_polar, polar),
    ("non_polar", map_non_polar, non_polar),
    ("aromatic", map_aromatic, aromatic),
    ("special", map_special, special),
)

for pos in range(1, 5):
    pos_col = f"pos{pos}"
    test_data[pos_col] = test_residues[pos]

    # One-hot encode the residue found at this position
    residue_dummies = pd.get_dummies(test_data[pos_col], prefix=pos_col)
    test_data = pd.concat([test_data, residue_dummies], axis=1)

    # Append the property values of that residue, one column per property
    for feature, mapper, table in test_property_maps:
        test_data[f"{feature}{pos}"] = test_data[pos_col].apply(
            mapper, args=(table, aminoacids)
        )

    # The raw residue column is only a helper; remove it once it is encoded
    test_data = test_data.drop(columns=[pos_col])

# Select features of the test dataset.
# NOTE(review): the hard-coded `1:100` slice selects columns 1 through 99;
# presumably it has to pick the same feature columns that the training
# feature matrix uses -- confirm, and consider deriving the bound from the data.
X_test = test_data.iloc[:, 1:100].values

test_data.head()

Unnamed: 0,Sequence,pos1_A,pos1_C,pos1_D,pos1_E,pos1_F,pos1_G,pos1_H,pos1_I,pos1_K,...,pos4_S,pos4_T,pos4_V,pos4_W,pos4_Y,charge4,polar4,non_polar4,aromatic4,special4
0,HWFK,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,MWPW,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
2,ALDV,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,NTLG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,LHYY,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0


## Classification

The classification is done using a multi-layer perceptron classifier. We also tried a Random Forest classifier, but our results for the F1-score were worse. We use GridSearchCV to determine the best parameters for our training dataset. 

In [5]:
# Hyper-parameter candidates explored by the grid search; `random_state`
# has a single candidate so the seed is the same for every configuration.
param_grid = dict(
    hidden_layer_sizes=[100, 200, 300],
    activation=["relu", "logistic"],
    random_state=[123],
)

# Base estimator; all settings not listed in the grid keep their defaults
cf = MLPClassifier()

# 5-fold cross-validated grid search scored with "f1"; n_jobs=-1 requests
# all available processors.
grid_search = GridSearchCV(cf, param_grid, scoring="f1", cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score
for summary in (grid_search.best_params_, grid_search.best_score_):
    print(summary)

{'activation': 'logistic', 'hidden_layer_sizes': 200, 'random_state': 123}
0.9025551691409321


## Predict and write to output

In [7]:
# Predict the test labels with the fitted grid search and store them,
# one prediction per row, without index or header.
y_test = grid_search.predict(X_test)
submission = pd.DataFrame(y_test)
submission.to_csv('../output/MLPC_200ly_sig_feat.csv', header=False, index=False)

## Results log

Random Forest classifier:
* 250 estimators, balanced_subsample, only chemical properties of aa: oob = 0.8661517857142857, 7739 predicted active, lb score = 0.33 
* 250 estimators, balanced_subsample, chemical properties and aa id's: oob = 0.6160089285714285, 19184 predicted active, lb score = 0.12


Multi-Layer Perceptron classifier:
* 200 hidden units (single hidden layer), relu, 0.89 in final score. Features as described here performed better.  
* 200 hidden units (single hidden layer), logistic, 0.95 in final score. 300 hidden units did not perform better than 200. To be explored whether the logistic activation performs better with a smaller number of hidden units. 