In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, f1_score

import keras
import tensorflow as tf
import keras.layers as L
from keras import layers
import keras.backend as K
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.losses import binary_crossentropy
from keras.utils import pad_sequences
from keras.layers import Layer, Dropout, Dense, Input, Embedding, Bidirectional, LSTM, Concatenate

curr_dir = os.path.dirname(os.path.realpath('__file__'))
proj_dir = os.path.abspath(os.path.join(curr_dir, '..'))

sys.path.append(proj_dir)

from src.configuration import load_config

def seed_everything(seed=2023):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    
seed_everything()

cfg = load_config()
print(cfg)

2023-08-19 12:44:25.626979: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-19 12:44:33.983418: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-19 12:44:34.034584: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'data': {'inp': './../inp', 'out': './../out'}, 'logs': './../logs', 'hyps': {'n_split': 5, 'max_len': {'assamese': 256}, 'random_state': [42, 2023]}}


In [2]:
train_df = pd.read_csv(os.path.join(cfg['data']['inp'], 'assamese/train_A_AH_HASOC2023.csv'))
test_df = pd.read_csv(os.path.join(cfg['data']['inp'], 'assamese/test_A_AH_HASOC2023.csv'))
sub_df = pd.read_csv(os.path.join(cfg['data']['inp'], 'assamese/sample.csv'))

In [3]:
MAXLEN = 256
VOCAB_SIZE = 20000
EMBED_DIM = 128
BATCH_SIZE = 128
EPOCHS = 5
CLASSES = 2

TEXT_COL = "text"
TARGET_COL = "task_1"

In [4]:
print("Length of train: ", len(train_df))
print("Length of test: ", len(test_df))

tar2num = {'HOF' : 0, 'NOT' : 1}
num2tar = {0 : 'HOF', 1 : 'NOT'}

tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters='!"#$&(),.:;?@[\\]^_`{|}\t\n')
tokenizer.fit_on_texts(list(train_df[TEXT_COL]) + list(test_df[TEXT_COL]))
word_idx = tokenizer.word_index

X_train = tokenizer.texts_to_sequences(list(train_df[TEXT_COL]))
y_train = train_df[TARGET_COL].map(tar2num)

X_test = tokenizer.texts_to_sequences(list(test_df[TEXT_COL]))

X_train = pad_sequences(X_train, maxlen=MAXLEN)
X_test = pad_sequences(X_test, maxlen=MAXLEN)

Length of train:  4036
Length of test:  1009


In [5]:
class AttentionBlock(keras.Model):

    def __init__(self, units):
        super(AttentionBlock, self).__init__()
        self.W1 = Dense(units=units)
        self.W2 = Dense(units=units)
        self.V = Dense(1)

    def call(self, features, hidden):
        hidden_t = K.expand_dims(hidden, 1)
        # additive attention
        score = K.tanh(self.W1(features) + self.W2(hidden_t))

        attn_weights = K.softmax(self.V(score), axis=1)

        context = attn_weights * features
        context = tf.reduce_sum(context, axis=1)

        return context, attn_weights
        pass

    pass


def build_model(max_len, max_features, embed_size, attn_units=20, num_classes=4, rnn_cell_size=32):
    seq_inp = Input(shape=max_len, dtype="int32")
    embedded_seq = Embedding(max_features, embed_size)(seq_inp)
    lstm = Bidirectional(LSTM(
        rnn_cell_size, return_sequences=True
    ), name="bilstm_0")(embedded_seq)

    lstm, f_h, f_c, b_h, b_c = Bidirectional(LSTM(
        rnn_cell_size, return_sequences=True, return_state=True
    ), name="bilstm_1")(lstm)

    h_ = Concatenate()([f_h, b_h])
    c_ = Concatenate()([f_c, b_c])

    context, attn_weights = AttentionBlock(attn_units)(lstm, h_)

    fc_pre = Dense(num_classes * 4, activation="relu")(context)
    do = Dropout(0.05)(fc_pre)
    output = Dense(1, activation="sigmoid")(do)

    return keras.Model(inputs=seq_inp, outputs=output)
    pass

In [6]:
model = build_model(max_len=MAXLEN, max_features=VOCAB_SIZE, embed_size=EMBED_DIM, num_classes=CLASSES)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
             optimizer="adam",
             metrics=[tf.keras.metrics.binary_crossentropy])
model.summary()

2023-08-19 12:46:15.226977: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-19 12:46:15.693371: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 256, 128)             2560000   ['input_1[0][0]']             
                                                                                                  
 bilstm_0 (Bidirectional)    (None, 256, 64)              41216     ['embedding[0][0]']           
                                                                                                  
 bilstm_1 (Bidirectional)    [(None, 256, 64),            24832     ['bilstm_0[0][0]']            
                              (None, 32),                                                     

In [7]:
cv_splits = list(StratifiedKFold(n_splits=5).split(X_train, y_train))

oof_preds = np.zeros((X_train.shape[0],))
test_preds = np.zeros((X_test.shape[0],))

for fold in range(5):
    K.clear_session()
    train_idx, val_idx = cv_splits[fold]
    print("\n======FOLD {}=====".format(fold))
    print()
    model.fit(X_train[train_idx], y_train[train_idx],
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                validation_data=(X_train[val_idx], y_train[val_idx]))

    oof_preds[val_idx] += model.predict(X_train[val_idx])[:, 0]
    test_preds += model.predict(X_test)[:, 0]



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [8]:
print("Training Finished...")
print("Performance in training Data...")
oof_pred = tf.math.greater_equal(oof_preds, 0.5)
val_f1_score = f1_score(y_true=y_train, y_pred=oof_pred, average='macro')
print("F1 Score for Training: ", val_f1_score)
print("Classification report for training: \n", classification_report(y_true=y_train, y_pred=oof_pred))

Training Finished...
Performance in training Data...
F1 Score for Training:  0.9079356826428233
Classification report for training: 
               precision    recall  f1-score   support

           0       0.93      0.91      0.92      2347
           1       0.88      0.91      0.89      1689

    accuracy                           0.91      4036
   macro avg       0.91      0.91      0.91      4036
weighted avg       0.91      0.91      0.91      4036



In [9]:
y_preds = test_preds / 5

print("Evaluation on test data...")
y_pred = tf.math.greater_equal(y_preds, 0.5).numpy().astype(int)

Evaluation on test data...


In [10]:
test_df['task_1'] = y_pred
test_df['task_1'] = test_df['task_1'].map(num2tar)
test_df['task_1'].value_counts()

task_1
HOF    589
NOT    420
Name: count, dtype: int64

In [11]:
test_df[['S. No.', 'task_1']].to_csv(f'./../out/baseline_lstm_submission_{val_f1_score}localF1.csv', index=False)