In [1]:
import pandas as pd 
import numpy as np 

import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from tqdm import tqdm

from sklearn.metrics import accuracy_score, f1_score

In [2]:
train = pd.read_csv('../data/processed/train.csv')
test = pd.read_csv('../data/processed/test.csv')
print('Train: ',train.shape)
print('Test: ',test.shape)

Train:  (67447, 43)
Test:  (18816, 42)


In [3]:
train_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_train.csv')
test_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_test.csv')

print('Train: ',train_ngram_features.shape)
print('Test: ',test_ngram_features.shape)

Train:  (67447, 3906)
Test:  (18816, 3906)


In [4]:
train = pd.merge(train,train_ngram_features,on='sequence_id')
test = pd.merge(test,test_ngram_features,on='sequence_id')

print('Train: ',train.shape)
print('Test: ',test.shape)

Train:  (67447, 3948)
Test:  (18816, 3947)


In [5]:
FOLD = 10
df = pd.DataFrame()
for f_idx in tqdm(range(1,FOLD+1)):
    path = '../data/features/blast/'+str(FOLD)+'/dev_result_'+str(f_idx)+'.csv'
    new_df = pd.read_csv(path)
    df = pd.concat([df, new_df], ignore_index=True,axis=0)
del new_df
print(df.shape)

100%|██████████| 10/10 [06:21<00:00, 38.17s/it](66739, 14455)



In [6]:
df.sequence_id.duplicated().any()

False

In [7]:
%%time
#df = df.groupby('sequence_id', as_index=False).mean()
#print(df.shape)

Wall time: 0 ns


In [8]:
#00Q4V31Thits,00Q4V31Tidentity,00Q4V31Talignment length,00Q4V31Tmismatches,00Q4V31Tgap opens,00Q4V31Tq. start,00Q4V31Tq. end,00Q4V31Ts. start,00Q4V31Ts. end,00Q4V31Tevalue,00Q4V31Tbit score,
print(df.shape)
columnVals = df.columns.map(lambda x: ('hits' in x) | ('identity' in x) | ('alignment length' in x) | ('mismatches' in x) | ('gap opens' in x) | ('bit score in x' in x) | ('sequence_id' in x))
df = df.loc[:,columnVals]
df.shape

(66739, 14455)


(66739, 6571)

In [9]:
%%time
train = pd.merge(train,df,how='left',on='sequence_id')
del df
train.shape

Wall time: 17.1 s


(67447, 10518)

In [10]:
train.fillna(0,inplace=True)

In [11]:
def top10_accuracy_scorer(estimator, X, y):

    probas = estimator.predict_proba(X)
    
    top10_idx = np.argpartition(probas, -10, axis=1)[:, -10:]
    
    top10_preds = estimator.classes_[top10_idx]

    mask = top10_preds == np.reshape(np.array(y.values.ravel()),(y.shape[0],1))
    
    top_10_accuracy = mask.any(axis=1).mean()
 
    return top_10_accuracy

In [12]:
# Rename our feature array
train.drop(['sequence','sequence_id'],inplace=True,axis=1)
test.drop(['sequence','sequence_id'],inplace=True,axis=1)

In [13]:
y = train['target']
X = train.drop('target',inplace=False,axis=1)

In [14]:
K = 5
skf = StratifiedKFold(n_splits=K,shuffle=True,random_state=420)

In [17]:
for i, (train_index, dev_index) in tqdm(enumerate(skf.split(X, y)),total=K):
    print('\n--------FOLD ',i+1)
    X_t, X_d = X.iloc[train_index], X.iloc[dev_index]
    y_t, y_d = y[train_index], y[dev_index]

    #model = xgb.XGBClassifier(n_estimators=2,objective='multi:softprob',eval_metric="mlogloss",max_depth=3,tree_method='gpu_hist',gpu_id=0,verbosity=1,n_jobs=10,random_state=420)
    model = RandomForestClassifier(n_estimators=600,max_depth=25,verbose=0,n_jobs=10,random_state=420)

    model.fit(X_t, y_t)

    preds = model.predict(X_d)

    acc = accuracy_score(y_d,preds)
    f1 = f1_score(y_d,preds,average='macro')
    top = top10_accuracy_scorer(model, X_d, y_d)

    print('ACC: ',acc)
    print('F1: ', f1)
    print('TOP-10: ',top)

0%|          | 0/5 [00:00<?, ?it/s]
--------FOLD  1
  0%|          | 0/5 [05:27<?, ?it/s]


MemoryError: could not allocate 86114304 bytes