In [1]:
# Mount Google Drive at /content/drive; the CSV inputs and model checkpoints
# used further down are read from (and written to) paths below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install scikit-learn
!pip install pandas
!pip install numpy
!pip install tensorflow-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 4.8 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 71.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 80.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.

In [3]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf

from collections import defaultdict
from sklearn.model_selection import train_test_split
from transformers import TFBertModel, BertTokenizerFast, create_optimizer
from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification, AutoConfig, BertModel, RobertaForSequenceClassification, TFRobertaForSequenceClassification, TFXLMRobertaForSequenceClassification
from transformers import create_optimizer

In [4]:
# All input files live in one Drive folder; the individual paths are derived from it.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/data'

PATH_TO_LABEL_TOP_100 = f'{DATA_DIR}/label.csv'
PATH_TO_FEATURE_VECTOR = f'{DATA_DIR}/feature_vector_received_by.csv'
# NOTE(review): the constant is named "light clean" but points at text_heavy.csv —
# confirm which preprocessing variant is intended.
PATH_TO_TEXT_LIGHT_CLEAN = f'{DATA_DIR}/text_heavy.csv'

# Optional subsampling switches read by the data-loading cell:
# `nrows` draws that many random rows, `idx` selects explicit row positions.
# With both left at None every row is used.
nrows = idx = None

In [5]:
# Load the three CSV exports; they are joined on the shared `id` column below.
df_features = pd.read_csv(PATH_TO_FEATURE_VECTOR, sep=',')
df_labels = pd.read_csv(PATH_TO_LABEL_TOP_100, dtype={'id': int, 'label': str, 'label_encoded': int}, sep=',')
df_texts = pd.read_csv(PATH_TO_TEXT_LIGHT_CLEAN, sep=',')
df_texts = df_texts.rename(columns={'requestId': 'id'})

# Fill missing values BEFORE building `text`: "str + NaN" evaluates to NaN, so
# concatenating first would (after the fill) blank the whole text of every row
# where only the subject or only the description is missing.
# NOTE(review): filling df_features with '' turns a numeric column containing NaN
# into mixed types — confirm the feature file has no missing values.
df_features = df_features.fillna('')
df_labels = df_labels.fillna('')
df_texts = df_texts.fillna('')

# Strip so that a row with both parts missing still ends up as '' rather than ' '.
df_texts['text'] = (df_texts['subject'] + " " + df_texts['description']).str.strip()

# Keep only the ids that occur in all three tables.
ids = list(set(df_features.id.to_list()) & set(df_labels.id.to_list()) & set(df_texts.id.to_list()))
ids = np.unique(np.array(ids))

df_features = df_features[df_features.id.isin(ids)]
df_labels = df_labels[df_labels.id.isin(ids)]
df_texts = df_texts[df_texts.id.isin(ids)]

# One row per id, sorted by id, so that position i refers to the same id in
# every table once they are converted to plain arrays.
df_features = df_features.drop_duplicates(subset='id')
df_labels = df_labels.drop_duplicates(subset='id')
df_texts = df_texts.drop_duplicates(subset='id')

df_features = df_features.sort_values(by='id')
df_labels = df_labels.sort_values(by='id')
df_texts = df_texts.sort_values(by='id')

# The arrays below are matched purely by position, so verify the alignment.
assert df_features['id'].tolist() == df_labels['id'].tolist() == df_texts['id'].tolist(), \
    'feature, label and text rows are not aligned by id'

# arr_subject = df_texts[['subject']].to_numpy()
# arr_description = df_texts[['description']].to_numpy()

arr_labels = df_labels[['label_encoded']].to_numpy()
arr_description = df_texts[['text']].to_numpy()

arr_features = df_features.drop(['id'], axis=1)
arr_features = arr_features.values

if nrows is not None:
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available rows.
    idx = random.sample(range(0, len(arr_labels)), min(nrows, len(arr_labels)))

if idx is not None:
    # Apply the same row positions to every array to keep them aligned.
    arr_labels = [arr_labels[i] for i in idx]
    arr_description = [arr_description[i] for i in idx]
    arr_features = [arr_features[i] for i in idx]
    # arr_subject = [arr_subject[i] for i in idx]


# Labels and texts were extracted as single-column 2-D arrays; flatten them to 1-D.
arr_y = np.asarray(arr_labels).reshape(-1)
arr_x_description = np.asarray(arr_description).reshape(-1)
arr_x_features = np.asarray(arr_features)
# arr_x_subject = np.asarray(arr_subject).reshape(-1)

In [6]:
# Fixed seed so the 80/20 split — and therefore the validation metrics — is
# identical on every Restart & Run All instead of changing from run to run.
SPLIT_SEED = 42

# train_test_split returns a (train, validation) pair per input, in input order.
(
    train_features,
    validation_features,
    train_description,
    validation_description,
    train_label,
    validation_label,
) = train_test_split(
    # arr_x_subject.tolist(),
    arr_x_features.tolist(),
    arr_x_description.tolist(),
    arr_y.tolist(),
    test_size=.2,
    shuffle=True,
    random_state=SPLIT_SEED,
)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")


def _tokenize(sentences, max_length, padding):
    """Encode `sentences` with the shared tokenizer.

    Truncation is always on; `padding` and `max_length` are forwarded unchanged
    and the result is returned as TensorFlow tensors.
    """
    encode_options = dict(
        truncation=True,
        padding=padding,
        max_length=max_length,
        return_tensors="tf",
    )
    return tokenizer(sentences, **encode_options)


def tokenize_subject(sentences, max_length=128, padding='max_length'):
    """Tokenize subject lines; by default padded/truncated to 128 tokens."""
    return _tokenize(sentences, max_length=max_length, padding=padding)


def tokenize_description(sentences, max_length=512, padding='max_length'):
    """Tokenize description texts; by default padded/truncated to 512 tokens."""
    return _tokenize(sentences, max_length=max_length, padding=padding)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [8]:
def _build_text_branch(prefix, sequence_length,
                       checkpoint="/content/drive/MyDrive/Colab Notebooks/XLM-RoBERTa"):
    """Create the Keras inputs and the output tensor of one text branch.

    Loads a TFXLMRobertaForSequenceClassification from `checkpoint`, creates two
    int32 inputs named '<prefix>.input_ids' and '<prefix>.attention_mask' of
    length `sequence_length`, runs them through the model and applies dropout
    (rate 0.1) to element 0 of the model output.

    Returns:
        (input_ids, attention_mask, output) — the two input layers and the
        dropout output tensor.
    """
    encoder = TFXLMRobertaForSequenceClassification.from_pretrained(checkpoint)

    token_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name=f'{prefix}.input_ids')
    token_mask = tf.keras.layers.Input((sequence_length,), dtype=tf.int32, name=f'{prefix}.attention_mask')

    # NOTE(review): element 0 of a sequence-classification output is presumably
    # the logits of the checkpoint's classification head — confirm this is the
    # intended representation.
    encoder_output = encoder([token_ids, token_mask])[0]
    branch_output = tf.keras.layers.Dropout(rate=0.1)(encoder_output)

    return token_ids, token_mask, branch_output


def get_layer_for_description(MAX_SEQUENCE_LENGTH=512):
    """Build the description branch (inputs named 'description.*')."""
    return _build_text_branch('description', MAX_SEQUENCE_LENGTH)


def get_layer_for_subject(MAX_SEQUENCE_LENGTH=128):
    """Build the subject branch (inputs named 'subject.*')."""
    return _build_text_branch('subject', MAX_SEQUENCE_LENGTH)

In [None]:
def _encode_split(features, descriptions):
    """Build the model-input dict for one data split.

    The descriptions are tokenized once and the result is reused for both
    'description.input_ids' and 'description.attention_mask' (previously the
    tokenizer ran twice per split, once for each key).

    A plain dict is returned so that a mistyped key raises KeyError instead of
    silently yielding an empty list.
    """
    encoded = tokenize_description(descriptions)
    return {
        'features': tf.convert_to_tensor(features),
        'description.input_ids': encoded['input_ids'],
        'description.attention_mask': encoded['attention_mask'],
    }


# Keys match the names of the Keras Input layers of the model.
tokenized_train = _encode_split(train_features, train_description)
tokenized_validation = _encode_split(validation_features, validation_description)

In [None]:
# Build the description branch: its two Keras inputs and its output tensor.
# input_ids_subject, attention_mask_subject, output_subject = get_layer_for_subject()
(
    input_ids_description,
    attention_mask_description,
    output_description,
) = get_layer_for_description()

In [None]:

# Width of the feature vector, taken from the first training row.
n_features = len(train_features[0])

# Feature branch: one Dense layer as wide as its input.
# NOTE(review): softmax as the activation of a hidden layer is unusual — confirm
# this is intended rather than e.g. relu.
features_layer = tf.keras.Input(shape=(n_features,), dtype=tf.float64, name='features')
features = tf.keras.layers.Dense(n_features, activation='softmax')(features_layer)

# Fuse the feature branch with the description branch and classify into 100 classes.
output = tf.keras.layers.Concatenate()([features, output_description])
output = tf.keras.layers.Dense(100, activation='softmax')(output)

# Input order: feature vector, description token ids, description attention mask.
model = tf.keras.Model(
    inputs=[features_layer, input_ids_description, attention_mask_description],
    outputs=[output],
)

In [None]:
BATCH_SIZE = 32
NUM_EPOCHS = 10
WARMUP_STEPS = 10000

# Dataset.batch() keeps the final partial batch, so one epoch consists of
# ceil(n / BATCH_SIZE) optimizer steps (floor division undercounted it).
batches_per_epoch = (len(train_label) + BATCH_SIZE - 1) // BATCH_SIZE
total_train_steps = int(batches_per_epoch * NUM_EPOCHS)

# create_optimizer decays the learning rate over (num_train_steps - num_warmup_steps)
# steps. With a fixed 10000-step warm-up and a short run that length becomes
# non-positive and training ends before the peak learning rate is ever reached.
# Keep the configured warm-up when it fits, otherwise fall back to 10% of the run.
if WARMUP_STEPS < total_train_steps:
    num_warmup_steps = WARMUP_STEPS
else:
    num_warmup_steps = total_train_steps // 10

optimizer, _ = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=total_train_steps,
)

# The final Dense layer already applies softmax, hence from_logits=False.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=optimizer,
    metrics=['accuracy'],
)

model.summary()

# Each element is (dict of named model inputs, label); the dict keys match the
# names of the model's Input layers.
X = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_train),
    train_label
)).batch(BATCH_SIZE).prefetch(1)

V = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_validation),
    validation_label
)).batch(BATCH_SIZE).prefetch(1)

In [None]:
# X already yields (inputs, label) batches, so no separate `y` is passed.
model.fit(
    x=X,
    y=None,
    epochs=NUM_EPOCHS,
    validation_data=V,
)

# `model` is a plain tf.keras.Model wrapping the transformer, not a Hugging Face
# PreTrainedModel, so it has no `save_pretrained`; calling it would raise
# AttributeError only after the whole training run has finished. Use the Keras
# saving API instead.
model.save('/content/drive/MyDrive/Colab Notebooks/XLM-RoBERTa Ensemble', overwrite=True)