In [1]:
import pandas as pd 
import numpy as np 

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from tqdm import tqdm

from sklearn.metrics import accuracy_score, f1_score

In [2]:
train = pd.read_csv('../data/processed/train.csv')
test = pd.read_csv('../data/processed/test.csv')
print('Train: ',train.shape)
print('Test: ',test.shape)

Train:  (67447, 43)
Test:  (18816, 42)


## N-GRAMS

In [3]:
train_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_train.csv')
test_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_test.csv')

print('Train: ',train_ngram_features.shape)
print('Test: ',test_ngram_features.shape)

Train:  (67447, 3906)
Test:  (18816, 3906)


In [4]:
train = pd.merge(train,train_ngram_features,on='sequence_id')
test = pd.merge(test,test_ngram_features,on='sequence_id')

print('Train: ',train.shape)
print('Test: ',test.shape)

Train:  (67447, 3948)
Test:  (18816, 3947)


## TOKENS

In [5]:
from transformers import RobertaTokenizer
import multiprocessing as mp

import aux_w

In [14]:
rows = []
tar = []

pool = mp.Pool(10)

with tqdm(total=train.shape[0]) as pbar:
    for i, row in enumerate(pool.imap_unordered(aux_w.tokenize_worker, zip(train.sequence.values,train.sequence_id.values))):
        rows.append(row[0])
        tar.append(row[1])
        pbar.update(1)

pool.close()
pool.join()

df = pd.DataFrame(rows)
df['sequence_id'] = tar
df.shape

100%|██████████| 67447/67447 [1:55:17<00:00,  9.75it/s]


(67447, 1001)

In [None]:
train = pd.merge(train,df,axis=1,on='sequence_id')
train.shape

In [None]:
rows = []
tar = []

pool = mp.Pool(10)

with tqdm(total=test.shape[0]) as pbar:
    for i, row in enumerate(pool.imap_unordered(aux_w.tokenize_worker, zip(test.sequence.values,test.sequence_id.values))):
        rows.append(row[0])
        tar.append(row[1])
        pbar.update(1)

pool.close()
pool.join()

df = pd.DataFrame(rows)
df['sequence_id'] = tar
df.shape

In [None]:
test = pd.merge(test,df,axis=1,on='sequence_id')
test.shape

## BLAST

In [5]:
FOLD = 10
df = pd.DataFrame()
for f_idx in tqdm(range(1,FOLD+1)):
    path = '../data/features/blast/'+str(FOLD)+'/dev_result_'+str(f_idx)+'.csv'
    new_df = pd.read_csv(path)
    df = pd.concat([df, new_df], ignore_index=True,axis=0)
del new_df
print(df.shape)

100%|██████████| 10/10 [11:23<00:00, 68.33s/it]
(66739, 14455)


In [6]:
df.sequence_id.duplicated().any()

False

In [7]:
#00Q4V31Thits,00Q4V31Tidentity,00Q4V31Talignment length,00Q4V31Tmismatches,00Q4V31Tgap opens,00Q4V31Tq. start,00Q4V31Tq. end,00Q4V31Ts. start,00Q4V31Ts. end,00Q4V31Tevalue,00Q4V31Tbit score,
print(df.shape)
columnVals = df.columns.map(lambda x:('bit score' in x) | ('sequence_id' in x)
#columnVals = df.columns.map(lambda x: ('hits' in x) | ('identity' in x) | ('alignment length' in x) | ('mismatches' in x) | ('gap opens' in x) | ('bit score in x' in x) | ('sequence_id' in x))
df = df.loc[:,columnVals]
df.shape

SyntaxError: invalid syntax (<ipython-input-7-202fd32f9920>, line 5)

In [None]:
%%time
train = pd.merge(train,df,how='left',on='sequence_id')
del df
train.shape

In [None]:
path = '../data/features/blast/test/blast_result.csv'
df = pd.read_csv(path)
columnVals = columnVals = df.columns.map(lambda x:('bit score' in x) | ('sequence_id' in x)
df = df.loc[:,columnVals]
print(df.shape)
test = pd.merge(test,df,how='left',on='sequence_id')
del df
test.shape

In [None]:
train.fillna(0,inplace=True)
test.fillna(0,inplace=True)

In [None]:
def top10_accuracy_scorer(classes,y_d, preds):
    
    top10_idx = np.argpartition(preds, -10, axis=1)[:, -10:]
    
    top10_preds = classes[top10_idx]

    mask = top10_preds == np.reshape(np.array(y_d.values.ravel()),(y_d.shape[0],1))
    
    top_10_accuracy = mask.any(axis=1).mean()
 
    return top_10_accuracy

In [None]:
# Rename our feature array
train.drop(['sequence','sequence_id'],inplace=True,axis=1)
test.drop(['sequence','sequence_id'],inplace=True,axis=1)

In [None]:
y = train['target']
X = train.drop('target',inplace=False,axis=1)

In [13]:
X.iloc[0:5,0:39]

Unnamed: 0,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,copy_number_low_copy,copy_number_unknown,growth_strain_ccdb_survival,growth_strain_dh10b,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## FEATURES SELECTION

In [None]:
fs = SelectKBest(score_func=chi2, k=400)

X = fs.fit_transform(X,y)
test = fs.transform(test)
print(X.shape)
print(test.shape)   

In [None]:
clf = ExtraTreesClassifier(n_estimators=250,max_depth=18,verbose=1,n_jobs=11,random_state=420)
clf = clf.fit(X, y)

model = SelectFromModel(clf, prefit=True)

X = model.transform(X)
test = model.transform(test)
print(X.shape)
print(test.shape)     

In [None]:
del model
del clf

## Search hyperparameters

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 100, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {
                'n_estimators': [100,500,1000],
               #'max_features': max_features,
               'max_depth': [5,10,20],
               #-'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf,
               #'bootstrap': bootstrap
               }
print(random_grid)

rf = RandomForestClassifier()

rf_random = GridSearchCV(estimator = rf, param_distributions = random_grid, cv = 5, verbose=1, random_state=420, n_jobs = 10)

rf_random.fit(X, y)

In [None]:
print(rf_random.best_params_)
print(rf_random.best_score_)

del rf
del rf_random

### TRAINING

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import torch

In [15]:
X_train, X_dev, y_train, y_dev = train_test_split(X, y,stratify=y, test_size=0.2)

In [16]:
model = TabNetClassifier(
    n_d=300,
    n_a=300,
    n_steps=10,
    cat_idxs=np.indices([39])[0].tolist(),
    lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params = {"gamma": 0.95,
                     "step_size": 20},
    scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15
)

Device used : cuda


In [17]:
model.fit(
    X_train=X_train.values, y_train=y_train.values,
    X_valid=X_dev.values, y_valid=y_dev.values,
    max_epochs=200, patience=10,
    batch_size=512, virtual_batch_size=64
)

Will train until validation stopping metric hasn't improved in 10 rounds.
---------------------------------------
| EPOCH |  train  |   valid  | total time (s)
| 1     | 0.05234 |  0.03944 |   74.7      
| 2     | 0.16157 |  0.15263 |   150.9     
| 3     | 0.18952 |  0.21801 |   237.7     
| 4     | 0.23041 |  0.23395 |   323.5     
| 5     | 0.23993 |  0.24374 |   406.3     
| 6     | 0.24666 |  0.26620 |   492.2     
| 7     | 0.27227 |  0.28777 |   578.6     
| 8     | 0.29292 |  0.30519 |   667.4     
| 9     | 0.30882 |  0.33113 |   751.2     
| 10    | 0.32165 |  0.28406 |   837.5     
| 11    | 0.33225 |  0.31616 |   920.6     
| 12    | 0.34198 |  0.33818 |   1003.6    
| 13    | 0.34919 |  0.36901 |   1084.5    
| 14    | 0.35002 |  0.35107 |   1164.4    
| 15    | 0.36056 |  0.36501 |   1243.8    


KeyboardInterrupt: 

In [None]:
plt.plot(model.history['train']['loss'])
plt.plot(model.history['valid']['loss'])

In [None]:
# plot accuracies
plt.plot([-x for x in model.history['train']['metric']])
plt.plot([-x for x in model.history['valid']['metric']])

In [None]:
probas = model.predict_proba(X_dev.values)
top = top10_accuracy_scorer(model.classes_,y_dev,probas)
top

## RF

In [None]:
K = 5
skf = StratifiedKFold(n_splits=K,shuffle=True,random_state=420)

In [None]:
test_preds = []
for i, (train_index, dev_index) in tqdm(enumerate(skf.split(X, y)),total=K):
    print('\n--------FOLD ',i+1)
    X_t, X_d = X.iloc[train_index], X.iloc[dev_index]
    y_t, y_d = y[train_index], y[dev_index]

    model = RandomForestClassifier(n_estimators=100,max_depth=15,verbose=0,n_jobs=11,random_state=420)

    model.fit(X_t, y_t)

    preds = model.predict(X_d)
    probas = model.predict_proba(X_d)

    acc = accuracy_score(y_d,preds)
    f1 = f1_score(y_d,preds,average='macro')
    top = top10_accuracy_scorer(model.classes_,y_d,probas)

    print('ACC: ',acc)
    print('F1: ', f1)
    print('TOP-10: ',top)

    test_preds.append(model.predict_proba(test))

## SUBMISSION

In [None]:
test_preds = np.array(test_preds)
probas=np.mean(test_preds,axis=0)
probas.shape

In [48]:
probas = model.predict_proba(test.values)

In [49]:
submission_format = pd.read_csv('../data/raw/submission_format.csv', index_col='sequence_id')

In [50]:
assert submission_format.shape == probas.shape
assert (model.classes_ == submission_format.columns).all()

In [51]:
my_submission = pd.DataFrame(data=probas, 
                             columns=model.classes_, 
                             index=submission_format.index)

In [52]:
my_submission.head()

Unnamed: 0_level_0,00Q4V31T,012VT4JK,028IO5W2,03GRNN7N,03Y3W51H,09MQV1TY,0A4AHRCT,0A9M05NC,0B9GCUVV,0CL7QVG8,...,ZQNGGY33,ZSHS4VJZ,ZT1IP3T6,ZU6860XU,ZU6TVFFU,ZU75P59K,ZUI6TDWV,ZWFD8OHC,ZX06ZDZN,ZZJVE4HO
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E0VFT,1.5150820000000001e-27,5.6259910000000006e-27,1.891753e-43,1.649788e-21,1.202046e-25,1.715993e-28,4.014787e-10,0.0,7.314778e-42,4.400077e-43,...,3.056437e-38,0.0,2.559326e-24,8.209619e-37,1.9084439999999998e-36,5.0745839999999995e-26,0.0,1.033698e-31,7.006492e-45,7.284635e-18
TTRK5,0.0,0.0,3.775939e-41,3.296548e-37,0.0,1.401298e-45,9.42934e-35,1.758771e-28,1.044531e-30,0.0,...,4.0160870000000003e-32,0.0,2.002628e-34,0.0,0.0,0.0,0.0,5.943285e-37,0.0,6.717345e-14
2Z7FZ,9.471412e-21,5.500528999999999e-20,4.124677e-34,3.916077e-16,2.158911e-25,6.725175e-16,0.002752208,6.953893999999999e-38,4.2523799999999996e-26,9.019391e-37,...,1.12151e-21,0.0,3.367852e-18,8.205101e-39,3.291074e-21,3.0681860000000004e-17,0.0,1.188096e-32,0.0,6.623164e-11
VJI6E,0.0,8.836620999999999e-38,2.400263e-39,0.0,4.932411e-31,0.0,5.3677079999999995e-36,0.0,6.043064e-06,1.401298e-45,...,4.400077e-43,1.401298e-45,8.398241e-12,0.0,7.787215e-35,2.963532e-33,0.0,7.804918e-18,5.802032e-16,4.3621300000000003e-23
721FI,7.235823e-24,9.087499999999999e-24,2.219892e-21,4.2679539999999997e-19,4.867489e-14,6.668676e-31,1.094942e-15,1.479028e-18,1.1449e-12,2.3221240000000003e-23,...,4.085907e-12,1.424093e-27,6.358644e-06,2.5272130000000002e-27,1.3440110000000002e-28,1.2889210000000002e-31,1.84406e-19,4.458698e-15,7.075241000000001e-33,2.122319e-15


In [53]:
my_submission.to_csv('../submissions/submission_tabnet.csv')