# Import Libraries and Data

In [1]:
import numpy as np
import pandas as pd
# Raise pandas' display limits so frames with up to 300 columns / 300 rows
# are rendered without truncation in cell outputs.
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

import matplotlib.pyplot as plt
%matplotlib inline

import os
from os.path import join as opj
import gc

# Relative path to the training CSV that is read into train_df below;
# it is resolved against the notebook's current working directory.
INPUT_PATH = '../../input/feedback-prize-2021/train.csv'

In [2]:
# Read the CSV at INPUT_PATH, then expose its 'id' column under the name 'essay_id'.
train_df = pd.read_csv(INPUT_PATH)
train_df = train_df.rename(columns={'id': 'essay_id'})
print(f'train_df.shape =  {train_df.shape}')

train_df.shape =  (144293, 8)


# Create Folds

In [3]:
# Preview the first five rows of the loaded frame.
train_df.head(n=5)

Unnamed: 0,essay_id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [4]:
# One entry per essay, in sorted order, so the positions handed to the fold
# splitter do not depend on the row order of train_df.
unique_ids = np.array(sorted(pd.unique(train_df['essay_id'])))
unique_ids

array(['0000D23A521A', '00066EA9880D', '000E6DE9E817', ...,
       'FFF868E06176', 'FFFD0AF13501', 'FFFF80B8CC2F'], dtype='<U12')

In [5]:
import numpy as np
from sklearn.model_selection import KFold

def get_fold_ids_list(n_folds, ids, seed):
    """Split ``ids`` into ``n_folds`` shuffled train/validation partitions.

    Parameters
    ----------
    n_folds : int
        Number of folds, passed to ``KFold`` as ``n_splits``.
    ids : array-like
        One-dimensional collection of identifiers to partition. It is
        converted to a NumPy array, so lists and pandas Series are accepted
        as well as arrays.
    seed : int
        ``random_state`` for the shuffled ``KFold`` split.

    Returns
    -------
    (trn_ids_list, val_ids_list) : tuple of two lists
        Each list has ``n_folds`` NumPy arrays; entry ``k`` holds the ids on
        the training / validation side of fold ``k``.
    """
    # KFold yields *positional* indices. Indexing a NumPy array guarantees
    # positional selection; a plain list would raise and a pandas Series
    # would be looked up by label instead.
    ids = np.asarray(ids)
    kfold = KFold(n_splits=n_folds,
                  shuffle=True,
                  random_state=seed)
    trn_ids_list = []
    val_ids_list = []
    # The split is not stratified, so no target argument is passed.
    for trn_idx, val_idx in kfold.split(ids):
        trn_ids_list.append(ids[trn_idx])
        val_ids_list.append(ids[val_idx])
    return trn_ids_list, val_ids_list

In [6]:
# Fold configuration: number of folds and the seed for the shuffled split.
N_FOLDS = 5
SEED = 2022

# Partition the essay ids; each returned list holds one id array per fold.
trn_ids_list, val_ids_list = get_fold_ids_list(N_FOLDS, unique_ids, SEED)

In [7]:
import joblib
import os

# Persist both fold-id lists under ./result (created if it does not exist yet).
os.makedirs('./result', exist_ok=True)
joblib.dump(value=trn_ids_list, filename='./result/trn_ids_list.joblib')
joblib.dump(value=val_ids_list, filename='./result/val_ids_list.joblib')

['./result/val_ids_list.joblib']

In [8]:
# Print the shape of the train / validation rows selected by each fold's essay ids.
for fold in range(N_FOLDS):
    print(f'fold =  {fold}')
    # reset_index() without drop=True keeps the old row labels as an extra 'index' column.
    trn_df = train_df.loc[train_df['essay_id'].isin(trn_ids_list[fold])].reset_index()
    val_df = train_df.loc[train_df['essay_id'].isin(val_ids_list[fold])].reset_index()
    print(f'trn_df.shape = {trn_df.shape}, val_df.shape = {val_df.shape}')
    print()

fold =  0
trn_df.shape = (115526, 9), val_df.shape = (28767, 9)

fold =  1
trn_df.shape = (115331, 9), val_df.shape = (28962, 9)

fold =  2
trn_df.shape = (115226, 9), val_df.shape = (29067, 9)

fold =  3
trn_df.shape = (115576, 9), val_df.shape = (28717, 9)

fold =  4
trn_df.shape = (115513, 9), val_df.shape = (28780, 9)



In [9]:
# Show the discourse_type counts of each fold's validation rows.
for fold in range(N_FOLDS):
    print(f'fold =  {fold}')
    val_df = train_df.loc[train_df['essay_id'].isin(val_ids_list[fold])].reset_index()
    display(val_df['discourse_type'].value_counts())
    print()

fold =  0


Claim                   10033
Evidence                 9097
Position                 3081
Concluding Statement     2696
Lead                     1843
Counterclaim             1154
Rebuttal                  863
Name: discourse_type, dtype: int64


fold =  1


Claim                   10071
Evidence                 9132
Position                 3087
Concluding Statement     2739
Lead                     1868
Counterclaim             1175
Rebuttal                  890
Name: discourse_type, dtype: int64


fold =  2


Claim                   10036
Evidence                 9200
Position                 3082
Concluding Statement     2694
Lead                     1877
Counterclaim             1236
Rebuttal                  942
Name: discourse_type, dtype: int64


fold =  3


Claim                   10011
Evidence                 9084
Position                 3081
Concluding Statement     2674
Lead                     1875
Counterclaim             1147
Rebuttal                  845
Name: discourse_type, dtype: int64


fold =  4


Claim                   10057
Evidence                 9189
Position                 3088
Concluding Statement     2702
Lead                     1842
Counterclaim             1105
Rebuttal                  797
Name: discourse_type, dtype: int64


