# Imports

In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score

import tensorflow as tf

from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# The quoted literal '__file__' is just a relative file name here, so realpath
# resolves it against the current working directory; taking its dirname
# therefore yields the directory the notebook is executed from.
curr_dir = os.path.dirname(os.path.realpath('__file__'))
# The project root is taken to be the parent of that directory.
proj_dir = os.path.abspath(os.path.join(curr_dir, '..'))

# Make the project root importable (needed for `src.*`). The membership check
# keeps sys.path free of duplicate entries when this cell is re-executed.
if proj_dir not in sys.path:
    sys.path.append(proj_dir)

from src.configuration import load_config

def seed_everything(seed=2023):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed handed to Python's `random`, NumPy's global RNG and
            TensorFlow's global RNG. It is also exported (as a string) through
            the `PYTHONHASHSEED` environment variable.
    """
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting
    # it here presumably only matters for child processes — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Apply the same seed to each library's global generator.
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)
    
# Seed the RNGs once, using seed_everything's default seed, before any
# stochastic step runs.
seed_everything()

# Load the project configuration; later cells read cfg['data']['inp'] from it
# to locate the input CSV files.
cfg = load_config()

2023-08-19 15:21:42.025176: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-19 15:21:56.282021: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-19 15:21:56.328747: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Read Data and Create Folds

In [2]:
class NbCfg:
    """Notebook-level settings, accessed as plain class attributes.

    The class is used as a namespace only (`NbCfg.lang`, `NbCfg.max_len`, ...)
    and is never instantiated in the visible cells.
    """

    # --- data -------------------------------------------------------------
    lang = 'assamese'            # sub-directory of the input folder holding the CSVs
    prefix_lang = 'A'            # language tag embedded in the train/test file names
    text_col = 'text'            # column with the raw input text
    target_col = 'task_1'        # column with the HOF / NOT label

    # --- model / tokenisation ---------------------------------------------
    model_name = 'bert-base-multilingual-cased'  # checkpoint name given to AutoTokenizer
    max_len = 256                # truncation length passed to the tokenizer
    dropout = 0.3
    target_size = 2              # presumably the number of output classes (HOF / NOT)
    apex = True                  # presumably toggles mixed precision — TODO confirm

    # --- cross-validation / reproducibility -------------------------------
    # NOTE(review): seed_everything() is called with its own default (2023),
    # not with this value.
    seed = 42
    n_splits = 5

    # --- optimisation -----------------------------------------------------
    epochs = 5
    batch_size = 16
    lr = 5e-5
    min_lr = 1e-7
    weigth_decay = 0.02          # misspelled name kept so existing references keep working
    weight_decay = weigth_decay  # correctly spelled alias; prefer this in new code
    scheduler = 'cosine'
    num_warmup_steps = 0
    num_cycles = 0.5
    n_accumulate = 1             # presumably gradient-accumulation steps — TODO confirm

    # --- runtime ----------------------------------------------------------
    num_workers = 3
    print_freq = 250
    train = True
    debug = True                 # when True the training frame is subsampled after loading

In [3]:
# All competition files for one language live in <input dir>/<language>/.
data_dir = os.path.join(cfg['data']['inp'], NbCfg.lang)

train_df = pd.read_csv(os.path.join(data_dir, f'train_{NbCfg.prefix_lang}_AH_HASOC2023.csv'))
test_df = pd.read_csv(os.path.join(data_dir, f'test_{NbCfg.prefix_lang}_AH_HASOC2023.csv'))
sub_df = pd.read_csv(os.path.join(data_dir, 'sample.csv'))

if NbCfg.debug:
    # Debug runs work on at most 100 rows. An explicit random_state makes the
    # subset identical every time this cell is executed, and the min() guard
    # avoids the ValueError that sample() raises when asked for more rows
    # than the frame holds.
    train_df = train_df.sample(n=min(100, len(train_df)), random_state=NbCfg.seed)

In [4]:
# Label encoding for the binary task: string label -> integer id, plus the
# inverse lookup for turning ids back into label strings.
tar2num = dict(HOF=0, NOT=1)
num2tar = {idx: name for name, idx in tar2num.items()}

# Replace the string labels with their integer ids. Values that are not keys
# of tar2num become NaN, so re-running this cell on an already-encoded frame
# wipes the column; re-read the data first in that case.
train_df[NbCfg.target_col] = train_df[NbCfg.target_col].map(tar2num)

In [5]:
# Load the tokenizer that matches the chosen checkpoint. `from_pt` is a
# model-loading option (PyTorch weights -> TensorFlow model) and is not a
# tokenizer argument, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(NbCfg.model_name)

# Pads each batch to its longest sequence. The collator returns PyTorch
# tensors by default; this notebook is TensorFlow-based, so ask for TF tensors.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

In [6]:
class HasocDataLoader(Dataset):
    """Map-style container that tokenizes one row of a HASOC frame on access.

    Each item is a dict with the (unpadded) `input_ids` and `attention_mask`
    of the row's text; when `is_train` is true the row's label is added under
    the key `target`. Padding is not requested here and is left to the
    batch collator.

    NOTE(review): the base class is whatever `Dataset` is imported at the top
    of the notebook, and its initializer is never invoked because `__init__`
    does not call `super().__init__()` — confirm this base is intended.

    Args:
        df: DataFrame holding the column `NbCfg.text_col` and, for training
            data, `NbCfg.target_col`.
        tokenizer: Tokenizer exposing `encode_plus`.
        max_length: Maximum number of tokens; longer texts are truncated.
        is_train: Whether labels are available and should be returned.
    """

    def __init__(self, df, tokenizer, max_length, is_train=True):
        self.df = df
        self.max_len = max_length
        self.text = df[NbCfg.text_col].values
        self.tokenizer = tokenizer
        self.is_train = is_train
        # Labels only exist for training / validation frames.
        if is_train:
            self.targets = df[NbCfg.target_col].values

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the text at position `index` and return it as a dict."""
        text = self.text[index]
        # Use the tokenizer given to this instance rather than a module-level
        # `tokenizer`, so the object does not depend on notebook state.
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
        )
        item = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        if self.is_train:
            item['target'] = self.targets[index]
        return item

In [7]:
# Stratified folds keep the HOF / NOT ratio similar in every split. The number
# of splits and the shuffle seed come from the notebook config instead of
# being repeated here as literals.
skf = StratifiedKFold(n_splits=NbCfg.n_splits, shuffle=True, random_state=NbCfg.seed)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df[NbCfg.target_col])):
    print('\n')
    print(80*'=', f'FOLD: {fold}', 80*'=')

    # skf.split yields positional indices, hence .iloc.
    trn_df = train_df.iloc[train_idx]
    val_df = train_df.iloc[val_idx]

    # Train on this fold's training rows only; building the training set from
    # the full frame would leak the validation rows into training.
    train_ds = HasocDataLoader(trn_df, tokenizer, NbCfg.max_len)
    val_ds = HasocDataLoader(val_df, tokenizer, NbCfg.max_len)











