# Introduction to Machine Learning - Task 3

Group name: Cbbayes

Team members: mcolomer (mcolomer@student.ethz.ch), pratsink (pratsink@student.ethz.ch) and scastro (scastro@student.ethz.ch)

Spring 2021

## Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics

## Preprocessing: one-hot encoding of the sequences

In [6]:
# Indicate the path to the data file
path = "../data/"

train_data = pd.read_csv(path+"train.csv")
test_data = pd.read_csv(path+"test.csv")

aminoacids = ["R", "H", "K","D","E", "S","T","N","Q","C","U",
             "G","P","A","I","L","M","F","W","Y","V"]

charge = [1,1,1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
polarity=[1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]
special=[0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]


#Preprocessing training dataset

X_train = train_data["Sequence"].str.split("", n = 4, expand = True)

def map_charge(aa, charge, aminoacids):
    i = aminoacids.index(aa)
    return charge[i]

def map_polarity(aa, polarity, aminoacids):
    i = aminoacids.index(aa)
    return polarity[i]

def map_special(aa, special, aminoacids):
    i = aminoacids.index(aa)
    return special[i]

for i in [1,2,3,4]:
    train_data["pos"+str(i)] = X_train[i]
    train_data = pd.concat([train_data,pd.get_dummies(train_data["pos"+str(i)], prefix="pos"+str(i))],axis=1)
    train_data["charge"+str(i)] = train_data["pos"+str(i)].apply(lambda aa: map_charge(aa, charge, aminoacids))
    train_data["polarity"+str(i)] = train_data["pos"+str(i)].apply(lambda aa: map_polarity(aa, polarity, aminoacids))
    train_data["special"+str(i)] = train_data["pos"+str(i)].apply(lambda aa: map_special(aa, special, aminoacids))
    train_data = train_data.drop(columns=["pos"+str(i)])
    
X_train = train_data.iloc[:, 2:-1].values 
y_train = train_data.iloc[:,1].values 

#Preprocessing test dataset

X_test = test_data["Sequence"].str.split("", n = 4, expand = True)


for i in [1,2,3,4]:
    test_data["pos"+str(i)] = X_test[i]
    test_data = pd.concat([test_data,pd.get_dummies(test_data["pos"+str(i)], prefix="pos"+str(i))],axis=1)
    test_data["charge"+str(i)] = test_data["pos"+str(i)].apply(lambda aa: map_charge(aa, charge, aminoacids))
    test_data["polarity"+str(i)] = test_data["pos"+str(i)].apply(lambda aa: map_polarity(aa, polarity, aminoacids))
    test_data = test_data.drop(columns=["pos"+str(i)])
    
X_test = test_data.iloc[:, 1:-1].values 


In [4]:
train_data

Unnamed: 0,Sequence,Active,pos1_A,pos1_C,pos1_D,pos1_E,pos1_F,pos1_G,pos1_H,pos1_I,...,pos4_Q,pos4_R,pos4_S,pos4_T,pos4_V,pos4_W,pos4_Y,charge4,polarity4,special4
0,DKWL,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,FCHN,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,KDQP,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,FNWI,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NKRM,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111995,GSME,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,-1,1,0
111996,DLPT,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1
111997,SGHC,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
111998,KIGT,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1


In [7]:
X_test

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
print(len(charge))
print(len(polarity))
print(len(special))

21
21
21


## Classification

Class are quite unbalanced:

In [9]:
print("Ratio positive train:", sum(y_train)/len(y_train))

Ratio positive train: 0.03761607142857143


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

# Create the parameter grid based on the results of random search 
param_grid = {
    'n_estimators': [100, 250]
}
# Create a based model
rf = RandomForestClassifier(class_weight='balanced')
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring="f1")

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

y_test = grid_search.predict(X_test)

print("Ratio positive train:", sum(y_test)/len(y_test))

Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [93]:
pd.DataFrame(y_test).to_csv('output_RFC_250est_3features.csv', index = False, header = False)             ## Save the results dataframe to a csv