## Import libs

In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV

## Process data

|Name|Arginine|Histidine|Lysine|Aspartic acid|Glutamic acid|Serine|Threonine|Asparagine|Glutamine|Cysteine|Selenocysteine|Glycine|Proline|Alanine|Isoleucine|Leucine|Methionine|Phenylalanine|Tryptophan|Tyrosine|Valine|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|Col num|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|
|Symbol       |R|H|K|D|E|S|T|N|Q|C|U|G|P|A|I|L|M|F|W|Y|V|
|Charge       |1|1|1|-1|-1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|
|Polar        |0|0|0|0|0|1|1|1|1|1|0|0|1|0|0|0|0|0|0|0|0|
|Non-Polar    |0|0|0|0|0|0|0|0|0|0|0|1|0|1|1|1|1|1|1|1|1|
|Aromatic     |0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|1|1|0|
|Special cases|0|0|0|0|0|0|0|0|0|1|1|1|1|0|0|0|0|0|0|0|0|

In [2]:
## Read data
# Directory that holds train.csv and test.csv (relative to the notebook)
path = "../data/"

train_data = pd.read_csv(path + "train.csv")
test_data = pd.read_csv(path + "test.csv")

# One-letter residue codes, in the column order of the property table above.
aminoacids = list("RHKDESTNQCUGPAILMFWYV")

# Alternative property grouping ("Marionas"), kept for reference; not used below.
#charge =   [1,1,1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
#polar=     [1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]
#special=   [0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]

# Property grouping actually used ("Santis").
# Every list is aligned index-by-index with `aminoacids` (21 entries each).
charge = [1 if aa in "RHK" else -1 if aa in "DE" else 0 for aa in aminoacids]
polar = [int(aa in "STNQCP") for aa in aminoacids]
non_polar = [int(aa in "GAILMFWYV") for aa in aminoacids]
aromatic = [int(aa in "FWY") for aa in aminoacids]
special = [int(aa in "CUGP") for aa in aminoacids]

In [3]:
# Preprocessing training dataset
# Break each sequence into per-residue columns; the feature loop below reads
# columns 1-4 of this frame as the four positions.
# NOTE(review): this relies on pandas treating the empty pattern as a regex that
# splits between characters (leaving an empty column 0) — confirm on the pandas
# version in use.
X_train = train_data["Sequence"].str.split(pat="", n=4, expand=True)

def map_charge(aa, charge, aminoacids):
    """Return the charge value of amino acid ``aa``.

    Parameters
    ----------
    aa : str
        Residue code to look up; must be an element of ``aminoacids``.
    charge : sequence
        Charge values aligned index-by-index with ``aminoacids``.
    aminoacids : list
        Residue codes defining the order of ``charge``.

    Raises
    ------
    ValueError
        If ``aa`` is not present in ``aminoacids``.
    """
    return charge[aminoacids.index(aa)]

def map_polar(aa, polarity, aminoacids):
    """Return the polarity flag of amino acid ``aa``.

    Parameters
    ----------
    aa : str
        Residue code to look up; must be an element of ``aminoacids``.
    polarity : sequence
        Polarity flags aligned index-by-index with ``aminoacids``.
    aminoacids : list
        Residue codes defining the order of ``polarity``.

    Raises
    ------
    ValueError
        If ``aa`` is not present in ``aminoacids``.
    """
    # Read from the table that was passed in. The previous version ignored
    # `polarity` and silently used the notebook-level `polar` list instead.
    return polarity[aminoacids.index(aa)]

def map_non_polar(aa, charge, aminoacids):
    """Return the non-polar flag of amino acid ``aa``.

    Parameters
    ----------
    aa : str
        Residue code to look up; must be an element of ``aminoacids``.
    charge : sequence
        The non-polar flag table, aligned index-by-index with ``aminoacids``.
        (The parameter keeps its historical name ``charge`` so existing
        callers are unaffected.)
    aminoacids : list
        Residue codes defining the order of the table.

    Raises
    ------
    ValueError
        If ``aa`` is not present in ``aminoacids``.
    """
    # Read from the table that was passed in. The previous version ignored the
    # argument and silently used the notebook-level `non_polar` list instead.
    return charge[aminoacids.index(aa)]

def map_aromatic(aa, polarity, aminoacids):
    """Return the aromatic flag of amino acid ``aa``.

    Parameters
    ----------
    aa : str
        Residue code to look up; must be an element of ``aminoacids``.
    polarity : sequence
        The aromatic flag table, aligned index-by-index with ``aminoacids``.
        (The parameter keeps its historical name ``polarity`` so existing
        callers are unaffected.)
    aminoacids : list
        Residue codes defining the order of the table.

    Raises
    ------
    ValueError
        If ``aa`` is not present in ``aminoacids``.
    """
    # Read from the table that was passed in. The previous version ignored the
    # argument and silently used the notebook-level `aromatic` list instead.
    return polarity[aminoacids.index(aa)]

def map_special(aa, polarity, aminoacids):
    """Return the special-case flag of amino acid ``aa``.

    Parameters
    ----------
    aa : str
        Residue code to look up; must be an element of ``aminoacids``.
    polarity : sequence
        The special-case flag table, aligned index-by-index with
        ``aminoacids``. (The parameter keeps its historical name ``polarity``
        so existing callers are unaffected.)
    aminoacids : list
        Residue codes defining the order of the table.

    Raises
    ------
    ValueError
        If ``aa`` is not present in ``aminoacids``.
    """
    # Read from the table that was passed in. The previous version ignored the
    # argument and silently used the notebook-level `special` list instead.
    return polarity[aminoacids.index(aa)]

# (column label, lookup function, property table) for each chemical property,
# listed in the order the columns are appended for every position.
train_property_specs = [
    ("charge", map_charge, charge),
    ("polar", map_polar, polar),
    ("non_polar", map_non_polar, non_polar),
    ("aromatic", map_aromatic, aromatic),
    ("special", map_special, special),
]

for position in (1, 2, 3, 4):
    suffix = str(position)
    residues = X_train[position]

    # One-hot residue identity first, so these columns precede the property
    # columns of the same position.
    one_hot = pd.get_dummies(residues, prefix="pos" + suffix)
    train_data = pd.concat([train_data, one_hot], axis=1)

    # Then one column per chemical property of the residue at this position.
    for label, lookup, table in train_property_specs:
        train_data[label + suffix] = residues.apply(
            lambda aa, lookup=lookup, table=table: lookup(aa, table, aminoacids)
        )

In [4]:
# Display the training frame after feature engineering (one-hot position
# columns followed by the chemical-property columns for each position).
train_data

Unnamed: 0,Sequence,Active,pos1_A,pos1_C,pos1_D,pos1_E,pos1_F,pos1_G,pos1_H,pos1_I,...,pos4_S,pos4_T,pos4_V,pos4_W,pos4_Y,charge4,polar4,non_polar4,aromatic4,special4
0,DKWL,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,FCHN,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,KDQP,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,FNWI,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,NKRM,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111995,GSME,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,-1,0,0,0,0
111996,DLPT,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
111997,SGHC,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
111998,KIGT,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [5]:
# Column 0 is "Sequence" and column 1 is "Active" (see the frame preview above);
# the engineered feature columns start at position 2.
# NOTE(review): if the frame holds 100 engineered columns
# (4 positions x (20 one-hot + 5 properties)), the stop index 101 leaves the
# last one (`special4`) out. The test-side slice 1:100 has the same width, so
# the two matrices stay aligned — confirm whether dropping it is intended.
train_feature_columns = slice(2, 101)
train_label_column = 1

X_train = train_data.iloc[:, train_feature_columns].to_numpy()
y_train = train_data.iloc[:, train_label_column].to_numpy()

In [6]:
# Preprocessing test dataset
# Same per-position feature construction as for the training frame.
# NOTE(review): the split relies on pandas treating the empty pattern as a regex
# that splits between characters (leaving an empty column 0) — confirm on the
# pandas version in use.
X_test = test_data["Sequence"].str.split(pat="", n=4, expand=True)

# (column label, lookup function, property table) for each chemical property,
# listed in the order the columns are appended for every position.
test_property_specs = [
    ("charge", map_charge, charge),
    ("polar", map_polar, polar),
    ("non_polar", map_non_polar, non_polar),
    ("aromatic", map_aromatic, aromatic),
    ("special", map_special, special),
]

for position in (1, 2, 3, 4):
    suffix = str(position)
    residues = X_test[position]

    # One-hot residue identity first, then the chemical-property columns.
    one_hot = pd.get_dummies(residues, prefix="pos" + suffix)
    test_data = pd.concat([test_data, one_hot], axis=1)

    for label, lookup, table in test_property_specs:
        test_data[label + suffix] = residues.apply(
            lambda aa, lookup=lookup, table=table: lookup(aa, table, aminoacids)
        )

In [7]:
# Feature columns are taken by position, starting right after column 0.
# NOTE(review): presumably column 0 is "Sequence" and the test file has no
# label column, which is why this slice is shifted by one relative to the
# training slice 2:101 — confirm. Like the training slice, the stop index may
# leave out the last engineered column.
test_feature_columns = slice(1, 100)
X_test = test_data.iloc[:, test_feature_columns].to_numpy()

## Fit model 

In [29]:
# Hyper-parameter grid for the exhaustive search below.
# A bare int for hidden_layer_sizes means a single hidden layer with that many
# units. random_state is part of the grid (one value) so every candidate is
# seeded identically and it is reported in best_params_.
param_grid = dict(
    hidden_layer_sizes=[200, 300],
    activation=["relu", "logistic"],
    random_state=[123],
)

# Base estimator; the grid supplies the parameters being tuned.
cf = MLPClassifier()

# 5-fold cross-validated grid search scored on F1, using all available cores.
grid_search = GridSearchCV(cf, param_grid, scoring="f1", cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

{'activation': 'logistic', 'hidden_layer_sizes': 200, 'random_state': 123}


In [30]:
# Report the winning parameter combination, then its cross-validated score.
for attribute in ("best_params_", "best_score_"):
    print(getattr(grid_search, attribute))

{'activation': 'logistic', 'hidden_layer_sizes': 200, 'random_state': 123}
0.9025551691409321


In [31]:
# Predict labels for the test feature matrix with the fitted grid search.
# Note: despite its name, `y_test` holds model predictions, not true labels.
y_test = grid_search.predict(X_test)

## Predict and write to output

In [33]:
# Write the predictions as a single-column CSV with no index and no header row.
pd.DataFrame(y_test).to_csv('../output/MLPC_200ly_sig_san_feat.csv', index = False, header = False)

## Results log

* 250 estimators, balanced_subsample, only chemical properties of aa: oob = 0.8661517857142857, 7739 predicted active, lb score = 0.33 
* 250 estimators, balanced_subsample, chemical properties and aa id's: oob = 0.6160089285714285, 19184 predicted active, lb score = 0.12
* MLP, 200 hidden units (`hidden_layer_sizes=200`), relu: 0.89 final score. The features as described here performed better.  
* MLP, 200 hidden units (`hidden_layer_sizes=200`), logistic: 0.95 final score. 300 hidden units did not perform better than 200. To be explored: whether the logistic activation performs better with a smaller hidden layer.