In [1]:
import numpy as np 
import pandas as pd 

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.metrics import accuracy_score, f1_score

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm

In [2]:
# Load the pre-processed train / test splits and report their sizes.
train = pd.read_csv('../data/processed/train.csv')
test = pd.read_csv('../data/processed/test.csv')
for label, frame in (('Train: ', train), ('Test: ', test)):
    print(label, frame.shape)

Train:  (67447, 43)
Test:  (18816, 42)


In [3]:
# Pre-computed n-gram feature tables, one row per sequence_id.
train_ngram_features = pd.read_csv(
    '../data/features/ngram/5_ngram_train.csv'
)
test_ngram_features = pd.read_csv(
    '../data/features/ngram/5_ngram_test.csv'
)

print('Train: ', train_ngram_features.shape)
print('Test: ', test_ngram_features.shape)

Train:  (67447, 3906)
Test:  (18816, 3906)


In [4]:
# Attach the n-gram features to each split (inner join on sequence_id).
train = train.merge(train_ngram_features, on='sequence_id')
test = test.merge(test_ngram_features, on='sequence_id')

print('Train: ', train.shape)
print('Test: ', test.shape)

Train:  (67447, 3948)
Test:  (18816, 3947)


In [5]:
FOLD = 10

# One BLAST result file exists per fold (dev_result_1.csv ... dev_result_<FOLD>.csv).
blast_paths = [
    '../data/features/blast/'+str(FOLD)+'/dev_result_'+str(f_idx)+'.csv'
    for f_idx in range(1, FOLD+1)
]

# Read every file first and concatenate once: growing a DataFrame with
# pd.concat inside the loop re-copies all previously loaded rows on each
# iteration.
df = pd.concat(
    [pd.read_csv(path) for path in tqdm(blast_paths)],
    ignore_index=True,
    axis=0,
)
print(df.shape)

100%|██████████| 10/10 [06:22<00:00, 38.26s/it](66739, 14455)



In [6]:
# True if any sequence_id occurs in more than one BLAST row.
df['sequence_id'].duplicated().any()

False

In [7]:
%%time
# Disabled step: would collapse the BLAST rows to one row per sequence_id by
# averaging. The duplicate check in the previous cell returned False, so this
# aggregation is presumably unnecessary -- TODO confirm before deleting.
#df = df.groupby('sequence_id', as_index=False).mean()
#print(df.shape)

Wall time: 0 ns


In [8]:
# BLAST columns are named '<id><statistic>'; for one id the statistics are:
# 00Q4V31Thits, 00Q4V31Tidentity, 00Q4V31Talignment length, 00Q4V31Tmismatches,
# 00Q4V31Tgap opens, 00Q4V31Tq. start, 00Q4V31Tq. end, 00Q4V31Ts. start,
# 00Q4V31Ts. end, 00Q4V31Tevalue, 00Q4V31Tbit score
print(df.shape)


def _keep_blast_column(name):
    """Keep the join key plus the 'bit score' and 'hits' statistics."""
    # A wider variant would also test for 'identity', 'alignment length',
    # 'mismatches' and 'gap opens'.
    return ('bit score' in name) or ('sequence_id' in name) or ('hits' in name)


columnVals = df.columns.map(_keep_blast_column)
df = df.loc[:, columnVals]
df.shape

(66739, 14455)


(66739, 2629)

In [9]:
%%time
# Left join keeps every training row, including sequences without a BLAST row.
train = train.merge(df, how='left', on='sequence_id')
del df  # the BLAST table is no longer needed once merged
train.shape

Wall time: 6.25 s


(67447, 6576)

In [10]:
# The left join above leaves NaN for training rows that have no BLAST row;
# replace every missing value with 0 (in place).
train.fillna(value=0, inplace=True)

In [26]:
def top10_accuracy_scorer(classes, y_d, preds, k=10):
    """Return the top-k accuracy of a matrix of per-class scores (k defaults to 10).

    A row counts as correct when its true label is among the ``k`` classes
    with the highest score in that row.

    Parameters
    ----------
    classes : array-like of shape (n_classes,)
        Label of each column of ``preds`` (e.g. ``model.classes_``).
    y_d : array-like of shape (n_samples,) or (n_samples, 1)
        True labels; a pandas Series/DataFrame, a list or an ndarray.
    preds : array-like of shape (n_samples, n_classes)
        Score of every class for every sample.
    k : int, default 10
        Number of best-scoring classes considered per row. Clamped to
        ``n_classes`` so the function also works with fewer than ``k`` classes.

    Returns
    -------
    float
        Fraction of rows whose true label is among the row's top-k classes.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or the shapes of the inputs do not agree.
    """
    classes = np.asarray(classes)
    preds = np.asarray(preds)
    y_true = np.asarray(y_d).ravel()

    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    if preds.ndim != 2 or preds.shape[1] != classes.shape[0]:
        raise ValueError("preds must have shape (n_samples, len(classes))")
    if preds.shape[0] != y_true.shape[0]:
        raise ValueError("preds and y_d must describe the same number of samples")

    # argpartition rejects a kth outside the axis, so never ask for more
    # columns than there are classes.
    top_k = min(k, classes.shape[0])

    # Unordered column indices of the top_k largest scores in every row.
    top_idx = np.argpartition(preds, -top_k, axis=1)[:, -top_k:]

    # Compare each row's candidate labels with that row's true label.
    hits = classes[top_idx] == y_true[:, None]

    return hits.any(axis=1).mean()

In [12]:
# Remove the raw sequence string and the identifier: neither is a model feature.
train.drop(columns=['sequence', 'sequence_id'], inplace=True)
test.drop(columns=['sequence', 'sequence_id'], inplace=True)

In [13]:
# Target labels and the remaining feature columns.
y = train['target']
X = train.drop(columns='target')

## FEATURE SELECTION

In [None]:
# Univariate filter: keep the columns with the highest chi-squared score
# against the target. chi2 only accepts non-negative feature values.
# k is capped at the number of available columns; depending on the
# scikit-learn version an oversized k raises or warns.
fs = SelectKBest(score_func=chi2, k=min(400, X.shape[1]))

# NOTE: fit_transform / transform return NumPy arrays, so X and test are no
# longer DataFrames after this cell. `test` must have the same columns as X.
X = fs.fit_transform(X,y)
test = fs.transform(test)
print(X.shape)
print(test.shape)

In [None]:
# Fit an extra-trees ensemble and let SelectFromModel keep the features it
# selects from the already fitted estimator (prefit=True).
clf = ExtraTreesClassifier(
    n_estimators=250,
    max_depth=18,
    verbose=1,
    n_jobs=11,
    random_state=420,
).fit(X, y)

model = SelectFromModel(clf, prefit=True)

# Apply the same column selection to both splits.
X = model.transform(X)
test = model.transform(test)
print(X.shape)
print(test.shape)

In [None]:
# Release the selector and the fitted ensemble behind it.
del model, clf

## Search hyperparameters

In [None]:
# Candidate values for a wider search. Only the hand-picked subset placed in
# `random_grid` below is searched; the commented-out entries show how to
# plug the wider lists in.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
# ('auto' was removed in scikit-learn 1.3; for classifiers it meant 'sqrt')
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 100, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {
                'n_estimators': [100,500,1000],
               #'max_features': max_features,
               'max_depth': [5,10,20],
               #'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf,
               #'bootstrap': bootstrap
               }
print(random_grid)

# GridSearchCV has no random_state parameter, so the seed goes on the
# estimator itself to keep the search reproducible.
rf = RandomForestClassifier(random_state=420)

# Exhaustive search over the 3 x 3 grid. GridSearchCV takes `param_grid`;
# `param_distributions` belongs to RandomizedSearchCV and raises a TypeError here.
rf_random = GridSearchCV(estimator = rf, param_grid = random_grid, cv = 5, verbose=1, n_jobs = 10)

rf_random.fit(X, y)

In [None]:
# Best parameter combination and its mean cross-validated score.
print(rf_random.best_params_)
print(rf_random.best_score_)

# Free the search object and its base estimator.
del rf, rf_random

### TRAINING

In [15]:
# Number of cross-validation folds and the seeded, shuffled stratified splitter.
K = 5
skf = StratifiedKFold(
    n_splits=K,
    random_state=420,
    shuffle=True,
)

In [27]:
def _take_rows(data, positions):
    """Select rows by position from a pandas object or a NumPy array."""
    return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]


# Sorted set of all labels. A fold whose training split lacks a class yields a
# narrower predict_proba matrix; every fold's test probabilities are expanded
# to this common column layout so they can be averaged afterwards.
all_classes = np.unique(y)

test_preds = []
for i, (train_index, dev_index) in tqdm(enumerate(skf.split(X, y)),total=K):
    print('\n--------FOLD ',i+1)
    # skf.split yields positions, so index positionally. X may be a DataFrame
    # or, after the feature-selection cells, a NumPy array.
    X_t, X_d = _take_rows(X, train_index), _take_rows(X, dev_index)
    y_t, y_d = _take_rows(y, train_index), _take_rows(y, dev_index)

    # NOTE(review): 10 trees of depth 3 looks like a smoke-test setting --
    # confirm the intended hyper-parameters before building a real submission.
    model = RandomForestClassifier(n_estimators=10,max_depth=3,verbose=0,n_jobs=11,random_state=420)

    model.fit(X_t, y_t)

    preds = model.predict(X_d)
    probas = model.predict_proba(X_d)

    acc = accuracy_score(y_d,preds)
    f1 = f1_score(y_d,preds,average='macro')
    top = top10_accuracy_scorer(model.classes_,y_d,probas)

    print('ACC: ',acc)
    print('F1: ', f1)
    print('TOP-10: ',top)

    # Place this fold's class probabilities into the full class layout;
    # classes the fold never saw keep probability 0.
    fold_test_probas = np.zeros((test.shape[0], all_classes.shape[0]))
    fold_test_probas[:, np.searchsorted(all_classes, model.classes_)] = model.predict_proba(test)
    test_preds.append(fold_test_probas)

0%|          | 0/5 [00:00<?, ?it/s]
--------FOLD  1
 20%|██        | 1/5 [00:13<00:52, 13.14s/it][[1261 1176  951 ...  666 1081    8]
 [1261 1176  951 ...  666 1081    8]
 [   8  148  615 ...  408  951  599]
 ...
 [   8  615  666 ...  951  880   31]
 [   8  880   59 ...  615  666  951]
 [1176    8 1081 ...   79  666  615]]
ACC:  0.21297257227575983
F1:  0.0038057047209148127
TOP-10:  0.37524091919940694

--------FOLD  2
 40%|████      | 2/5 [00:21<00:35, 11.85s/it][[1081  408   12 ... 1074  666   31]
 [   8  148   31 ...  408  666  615]
 [1081  408   12 ... 1074  666   31]
 ...
 [   8  148   31 ...  408  666  615]
 [ 909 1176    8 ...  666  951   31]
 [ 909 1176    8 ...  666  951   31]]
ACC:  0.21489992587101556
F1:  0.0044005426317803124
TOP-10:  0.38346923647146036

--------FOLD  3
 40%|████      | 2/5 [00:22<00:33, 11.28s/it]


KeyboardInterrupt: 

### SUBMISSION

In [None]:
# Stack the per-fold probability matrices and average them over the fold axis.
test_preds = np.array(test_preds)
probas = test_preds.mean(axis=0)
probas.shape

In [None]:
# Template that fixes the expected row index (sequence_id) and column layout.
submission_format = pd.read_csv(
    '../data/submission_format.csv',
    index_col='sequence_id',
)

In [None]:
# The averaged probabilities must match the template's shape ...
expected_shape = submission_format.shape
assert probas.shape == expected_shape

# ... and the model's class order must equal the template's column order.
classes_match = model.classes_ == submission_format.columns
assert classes_match.all()

In [None]:
# Label the probability matrix with the template's row index and the model's classes.
my_submission = pd.DataFrame(
    probas,
    index=submission_format.index,
    columns=model.classes_,
)

In [None]:
# Preview the first five rows of the submission.
my_submission.head(n=5)

In [None]:
# Write the submission (the sequence_id index is written as the first column).
my_submission.to_csv(path_or_buf='../submissions/submissionX.csv')