In [1]:
import pandas as pd 
import numpy as np 

import xgboost as xgb

import lightgbm

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from tqdm import tqdm

from sklearn.metrics import accuracy_score, f1_score

## LOAD DATA

In [2]:
train = pd.read_csv('../data/processed/train.csv')
test = pd.read_csv('../data/processed/test.csv')
print('Train: ',train.shape)
print('Test: ',test.shape)

Train:  (67447, 43)
Test:  (18816, 42)


In [3]:
train_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_train.csv')
test_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_test.csv')

print('Train: ',train_ngram_features.shape)
print('Test: ',test_ngram_features.shape)

Train:  (67447, 3906)
Test:  (18816, 3906)


In [4]:
train = pd.merge(train,train_ngram_features,on='sequence_id')
test = pd.merge(test,test_ngram_features,on='sequence_id')

print('Train: ',train.shape)
print('Test: ',test.shape)

Train:  (67447, 3948)
Test:  (18816, 3947)


In [5]:
def top10_accuracy_scorer(estimator, X, y):

    probas = estimator.predict_proba(X)
    
    top10_idx = np.argpartition(probas, -10, axis=1)[:, -10:]
    
    top10_preds = estimator.classes_[top10_idx]

    mask = top10_preds == np.reshape(np.array(y.values.ravel()),(y.shape[0],1))
    
    top_10_accuracy = mask.any(axis=1).mean()
 
    return top_10_accuracy

In [6]:
# Rename our feature array
train.drop(['sequence','sequence_id'],inplace=True,axis=1)
test.drop(['sequence','sequence_id'],inplace=True,axis=1)

In [7]:
y = train['target']
X = train.drop('target',inplace=False,axis=1)

## Feature selection

In [8]:
fs = SelectKBest(score_func=chi2, k=400)

X = fs.fit_transform(X,y)
test = fs.transform(test)
print(X.shape)
print(test.shape)   

(67447, 400)
(18816, 400)


In [8]:
clf = ExtraTreesClassifier(n_estimators=250,max_depth=18,verbose=1,n_jobs=11,random_state=420)
clf = clf.fit(X, y)

model = SelectFromModel(clf, prefit=True)

X = model.transform(X)
test = model.transform(test)
print(X.shape)
print(test.shape)     

[Parallel(n_jobs=11)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  28 tasks      | elapsed:   12.7s
[Parallel(n_jobs=11)]: Done 178 tasks      | elapsed:  1.4min
[Parallel(n_jobs=11)]: Done 250 out of 250 | elapsed:  2.1min finished
(67447, 1396)
(18816, 1396)


In [9]:
del model
del clf

## Search hyperparameters

In [12]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 100, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {
                'n_estimators': [100,500,1000],
               #'max_features': max_features,
               'max_depth': [5,10,20],
               #-'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf,
               #'bootstrap': bootstrap
               }
print(random_grid)

rf = RandomForestClassifier()

rf_random = GridSearchCV(estimator = rf, param_distributions = random_grid, cv = 5, verbose=1, random_state=420, n_jobs = 10)

rf_random.fit(X, y)

{'n_estimators': [100, 500, 1000], 'max_depth': [5, 10, 20]}


TypeError: __init__() got an unexpected keyword argument 'param_distributions'

In [None]:
print(rf_random.best_params_)
print(rf_random.best_score_)

del rf
del rf_random

## TRAINING

In [10]:
K = 5
skf = StratifiedKFold(n_splits=K,shuffle=True,random_state=420)

In [11]:
test_preds = []

for i, (train_index, dev_index) in tqdm(enumerate(skf.split(X, y)),total=K):
    print('\n--------FOLD ',i+1)
    X_t, X_d = X[train_index], X[dev_index]
    y_t, y_d = y[train_index], y[dev_index]

    #model = xgb.XGBClassifier(n_estimators=2,objective='multi:softprob',eval_metric="mlogloss",max_depth=3,tree_method='hist',gpu_id=0,verbosity=1,n_jobs=10,random_state=420)
    model = RandomForestClassifier(n_estimators=300,max_depth=20,verbose=0,n_jobs=11,random_state=420,max_features=None)
    """
    model = lightgbm.LGBMClassifier(
    objective='multiclass',
    boosting='dart',
    #learning_rate = 0.1,
    #max_depth = 20,
    n_jobs=-2,
    silent=True,
    random_state=420,
    #num_leaves = 400,
    #n_estimators = 400,
    #bagging_fraction = 0.8,
    #feature_fraction = 0.9
    )
    """

    model.fit(X_t, y_t)

    preds = model.predict(X_d)

    acc = accuracy_score(y_d,preds)
    f1 = f1_score(y_d,preds,average='macro')
    top = top10_accuracy_scorer(model, X_d, y_d)

    print('ACC: ',acc)
    print('F1: ', f1)
    print('TOP-10: ',top)

    test_preds.append(model.predict_proba(test))

0%|          | 0/5 [00:00<?, ?it/s]
--------FOLD  1
ACC:  0.4441808747220163
F1:  0.1705129295375663
TOP-10:  0.5824314306893995
 20%|██        | 1/5 [1:15:01<5:00:05, 4501.44s/it]
--------FOLD  2


## Submission

In [9]:
test_preds = np.array(test_preds)
probas=np.mean(test_preds,axis=0)
probas.shape

(18816, 1314)

In [15]:
submission_format = pd.read_csv('../data/submission_format.csv', index_col='sequence_id')

In [16]:
assert submission_format.shape == probas.shape
assert (model.classes_ == submission_format.columns).all()

In [17]:
my_submission = pd.DataFrame(data=probas, 
                             columns=model.classes_, 
                             index=submission_format.index)

In [18]:
my_submission.head()

Unnamed: 0_level_0,00Q4V31T,012VT4JK,028IO5W2,03GRNN7N,03Y3W51H,09MQV1TY,0A4AHRCT,0A9M05NC,0B9GCUVV,0CL7QVG8,...,ZQNGGY33,ZSHS4VJZ,ZT1IP3T6,ZU6860XU,ZU6TVFFU,ZU75P59K,ZUI6TDWV,ZWFD8OHC,ZX06ZDZN,ZZJVE4HO
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E0VFT,3e-06,4.6e-05,8.401008e-07,0.0,0.001139,2e-05,0.001222,1.6e-05,2.8e-05,0.0,...,4e-06,7e-05,3.145652e-07,0.0,5.7e-05,5e-06,0.001009,2.4e-05,4.003635e-07,9.6e-05
TTRK5,5.7e-05,9.9e-05,5.862148e-05,0.0,0.000302,3e-05,8.9e-05,9.9e-05,0.000403,0.0,...,6.3e-05,5.9e-05,1.930482e-05,0.0,0.000222,1e-05,0.000117,0.000478,1.332598e-05,6.7e-05
2Z7FZ,0.000181,0.000434,0.0001958461,0.0,0.000836,0.000427,0.001064,0.000524,0.000931,0.0,...,0.000341,7.7e-05,9.226794e-05,0.0,0.00111,5.2e-05,0.000209,0.001889,3.911472e-05,0.000717
VJI6E,2.1e-05,0.0,0.0,0.001045,0.000354,0.0,0.0,0.0,0.018758,5.1e-05,...,0.0,0.0,8.177017e-05,5.2e-05,0.0,0.0,0.0,0.0,0.0,0.000151
721FI,0.000305,0.001219,0.0003603261,0.0,0.002239,0.00016,0.000575,0.000927,0.002196,0.0,...,0.000245,0.000127,0.0001239933,0.0,0.00126,2.8e-05,0.000371,0.003329,7.929932e-05,0.000534


In [19]:
my_submission.to_csv('../submissions/submissionX.csv')