In [1]:
import pandas as pd

from sklearn.model_selection import StratifiedKFold

import collections

import os

In [2]:
# Read the raw competition files: training features, training labels, test features.
X_train = pd.read_csv('../data/raw/train_values.csv')
y_train = pd.read_csv('../data/raw/train_labels.csv')
X_test = pd.read_csv('../data/raw/test_values.csv')

# Report (rows, columns) of each frame as a quick sanity check.
for label, frame in (('Train: ', X_train),
                     ('Train labels: ', y_train),
                     ('Test: ', X_test)):
    print(label, frame.shape)

Train:  (63017, 41)
Train labels:  (63017, 1315)
Test:  (18816, 41)


In [3]:
# For every row of the label table, take the name of the column holding the
# row-wise maximum (ignoring the `sequence_id` column) as that row's lab id.
label_matrix = y_train.drop(columns='sequence_id')
lab_ids = label_matrix.idxmax(axis=1).values

print(lab_ids.shape)

(63017,)


In [4]:
# Replicate observations of labs that have fewer than `duplicate_th` observations.
# Each copy gets a unique sequence_id of the form "<original id>_<j>".
duplicate_th = 10


counts = collections.Counter(lab_ids)
X_train['target'] = lab_ids

replicate = []
# Only rows whose lab is below the threshold can produce copies, so iterate
# over those rows instead of the whole frame.
is_rare = X_train['target'].map(counts) < duplicate_th
for i, row in X_train[is_rare].iterrows():
    tmp = duplicate_th - counts[row['target']]
    s_id = row['sequence_id']
    # NOTE(review): range(1, tmp) yields tmp - 1 copies per row, so a lab with
    # c observations ends up with c * (duplicate_th - c) rows (e.g. 9 when
    # c == 1 or c == 9), which can stay below duplicate_th. Kept unchanged to
    # preserve the existing dataset -- confirm whether this is intended.
    for j in range(1, tmp):
        new_row = row.copy()
        # Assign by label: integer keys on a label-indexed Series are only
        # positional through a deprecated fallback.
        new_row['sequence_id'] = s_id + '_' + str(j)
        replicate.append(new_row)

if replicate:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent. The copies keep the index label of the row they came from.
    # infer_objects() restores the column dtypes lost when each row was pulled
    # out as an object-dtype Series.
    X_train = pd.concat([X_train, pd.DataFrame(replicate).infer_objects()])


print(X_train.shape)
print(X_train.sequence_id.duplicated().any())

(67447, 42)
False


In [5]:
# Store len() of each value in the `sequence` column as a new `sequence_len`
# column on both the train and the test frame.
for frame in (X_train, X_test):
    frame['sequence_len'] = frame['sequence'].map(len)

# Summary statistics of the new column, train first and then test.
for frame in (X_train, X_test):
    print(frame['sequence_len'].describe())

count    67447.000000
mean      4867.924785
std       3895.161052
min         20.000000
25%        910.000000
50%       4795.000000
75%       7506.000000
max      60099.000000
Name: sequence_len, dtype: float64
count    18816.000000
mean      4875.523810
std       4004.117614
min         19.000000
25%        894.750000
50%       4732.000000
75%       7341.250000
max      38638.000000
Name: sequence_len, dtype: float64


In [6]:
# Write the processed test and train frames to disk (index is not written).
for frame, out_path in ((X_test, '../data/processed/test.csv'),
                        (X_train, '../data/processed/train.csv')):
    frame.to_csv(out_path, index=False)

In [7]:
# Replace the index with a fresh 0..n-1 range; the previous index labels are
# kept as a new `index` column.
X_train = X_train.reset_index()

In [9]:
# Build stratified K-fold splits for several values of K and write each
# train/dev split (features and target) to ../data/folds/<K>/.
K = [3,5,10]

y = X_train['target']
X = X_train.drop('target',axis=1,inplace=False)

for k in K:
    print('K = ',k)

    path = '../data/folds/'+str(k)
    # exist_ok=True lets the cell be re-run without raising FileExistsError
    # when the fold directory is already there.
    os.makedirs(path, exist_ok=True)

    # Fixed seed so the same folds are produced on every run.
    sss = StratifiedKFold(n_splits=k, random_state=420,shuffle=True)

    for i, (train_index, dev_index) in enumerate(sss.split(X,y)):
        print('--------FOLD ',i+1)
        # split() yields positional indices, so select positionally for both
        # X and y; label-based y[...] is only correct when the index happens
        # to be exactly 0..n-1.
        X_t, X_d = X.iloc[train_index], X.iloc[dev_index]
        y_t, y_d = y.iloc[train_index], y.iloc[dev_index]

        X_t.to_csv(path+'/X_train_split_'+str(i+1)+'.csv',index=False)
        X_d.to_csv(path+'/X_dev_split_'+str(i+1)+'.csv',index=False)
        y_t.to_csv(path+'/y_train_split_'+str(i+1)+'.csv',index=False)
        y_d.to_csv(path+'/y_dev_split_'+str(i+1)+'.csv',index=False)

K =  3
--------FOLD  1
--------FOLD  2
--------FOLD  3
K =  5
--------FOLD  1
--------FOLD  2
--------FOLD  3
--------FOLD  4
--------FOLD  5
K =  10
--------FOLD  1
--------FOLD  2
--------FOLD  3
--------FOLD  4
--------FOLD  5
--------FOLD  6
--------FOLD  7
--------FOLD  8
--------FOLD  9
--------FOLD  10
