# Importing necessary libraries

In [19]:
import numpy as np
import pandas as pd
import math
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model

#### Reading in the files from the folder

In [7]:
auto_df = pd.read_csv("files/autos.csv")
career_df = pd.read_csv("files/career.csv")
education_df = pd.read_csv("files/education.csv")
health_df = pd.read_csv("files/health.csv")

  auto_df = pd.read_csv("files/autos.csv")


#### Labelling each according to the classes

In [8]:
auto_df["label"] = 'automobiles'
career_df["label"] = "careers"
education_df["label"] = "education"
health_df["label"] = "health"

#### Making a copy of the data in case something goes wrong during the process of building a model

In [9]:
auto_df_1 = auto_df.copy()
career_df_1 = career_df.copy()
education_df_1 = education_df.copy()
health_df_1 = health_df.copy()

#### Extracting the needed labels from each data

In [10]:
auto_df_1 = auto_df_1[["Text", "label"]]
career_df_1 = career_df_1[["Text", "label"]]
education_df_1 = education_df_1[["Text", "label"]]
health_df_1 =health_df_1[["Text", "label"]]

#### Combining all the data together to form a single data to be used for both training and testing

In [11]:
full_df = pd.concat([auto_df_1, career_df_1, education_df_1, health_df_1], axis=0)

In [12]:
print(full_df.info())
full_df.describe()
full_df.dropna(how='any',inplace=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 915854 entries, 0 to 155017
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Text    914337 non-null  object
 1   label   915854 non-null  object
dtypes: object(2)
memory usage: 21.0+ MB
None


In [20]:
x = full_df['Text']
y = full_df['label']

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [22]:
MAX_LENGTH = math.ceil((x_train.apply(lambda x: len(str(x).split())).mean()))+2
MAX_LENGTH

133

## Adding a token [PAD_TOKEN]  and an End Of Line token [EOS_TOKEN]

In [None]:
PAD_TOKEN = "<|pad|>"
EOS_TOKEN = "<|endoftext|>"

# this will download and initialize the pre trained tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2",
    pad_token=PAD_TOKEN,
    eos_token=EOS_TOKEN,
    max_length=MAX_LENGTH,
    is_split_into_words=True)

## Adding the EOS at the end of each text

In [None]:
x_train = [str(ex) + EOS_TOKEN for ex in x_train]
x_test = [str(ex) + EOS_TOKEN for ex in x_test]

## Passing them on to the tokenizer and padding them to the max length

In [None]:
x_train_ = [tokenizer(str(x), return_tensors='tf', max_length=MAX_LENGTH, truncation=True, pad_to_max_length=True, add_special_tokens=True)['input_ids'] for x in x_train]
x_test_ = [tokenizer(str(x), return_tensors='tf', max_length=MAX_LENGTH, truncation=True, pad_to_max_length=True, add_special_tokens=True)['input_ids'] for x in x_test]

x_train_in = tf.squeeze(tf.convert_to_tensor(x_train_), axis=1)
x_test_in = tf.squeeze(tf.convert_to_tensor(x_test_), axis=1)

In [None]:
x_train_mask_ = [tokenizer(str(x), return_tensors='tf', max_length=MAX_LENGTH, truncation=True, pad_to_max_length=True, add_special_tokens=True)["attention_mask"] for x in x_train]
x_test_mask_ = [tokenizer(str(x), return_tensors='tf', max_length=MAX_LENGTH, truncation=True, pad_to_max_length=True, add_special_tokens=True)["attention_mask"] for x in x_test]

x_train_mask = tf.squeeze(tf.convert_to_tensor(x_train_mask_), axis=1)
x_test_mask = tf.squeeze(tf.convert_to_tensor(x_test_mask_), axis=1)

## Building the model

In [None]:
model = TFGPT2Model.from_pretrained("gpt2", use_cache=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id)
model.training = True

In [None]:
model.resize_token_embeddings(len(tokenizer))

## Set the GPT2 pre-trained layers as non trainable

In [None]:
for layer in model.layers:
    layer.trainable = False

## Model Summary

In [None]:
model.summary()

# Building on GPT2. 

### The model takes in tokens and mask tensors. The outputs are the last hidden states of the last layer in the transformer. These are reduced using the mean over the sequence length, passed through 2 dense layers with dropout in between. 

### The output layer has three nodes, (softmax activation function for probabilities) for the four(4) classes we want to predict (Health, Education, Career and Automobile).

In [None]:
input = tf.keras.layers.Input(shape=(None,), dtype='int32')
mask = tf.keras.layers.Input(shape=(None,), dtype='int32')
x = model(input, attention_mask=mask)
x = tf.reduce_mean(x.last_hidden_state, axis=1)
x = tf.keras.layers.Dense(16, activation='relu')(x)
x = tf.keras.layers.Dropout(0.3)(x)
output = tf.keras.layers.Dense(4, activation='softmax')(x)

In [None]:
clf = tf.keras.Model([input, mask], output)

In [None]:
clf.summary()

# Model compilation

### Compiling the model, choosing the learning rate, loss function and the metric to monitor and also a callback function.

In [None]:
base_learning_rate = 0.0005
optimizer=tf.keras.optimizers.Adam(learning_rate=base_learning_rate)
#loss=tf.keras.losses.BinaryCrossentropy()
loss=tf.keras.losses.SparseCategoricalCrossentropy()

clf.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
callbacks = tf.keras.callbacks.EarlyStopping(
        monitor="accuracy", verbose=1, patience=3, restore_best_weights=True)

# The target tensors

In [25]:
def map_sentiment(value):
    if value == 'automobiles':
        return 0
    if value == 'careers':
        return 1
    if value == 'education':
        return 2
    if value == "health":
        return 3

In [27]:
# Applying the function above to the y_train and x_train
y_train_ = y_train.map(map_sentiment)
y_test_ = y_test.map(map_sentiment)

In [None]:
y_train_in = tf.constant(y_train_, dtype=tf.int32)
y_test_in = tf.constant(y_test_, dtype=tf.int32)

In [None]:
tf.config.experimental_run_functions_eagerly(True)

# Training the model and passing the number of epochs, batch_size and validation split.

In [None]:
history = clf.fit([X_train_in, X_train_mask], y_train_in, epochs=30, batch_size=32, validation_split=0.2, callbacks=callbacks)

# Model Evaluation

In [None]:
clf.evaluate([X_test_in, X_test_mask], y_test_in)

In [None]:
clf.training = False
y_pred = clf.predict([X_test_in, X_test_mask])

In [None]:
y_pred_out = tf.math.argmax(y_pred, axis=-1)
y_pred_out

# Testing the model on the test dataset

### Obtaining the classification report

In [None]:
print(classification_report(y_test_in, y_pred_out))

# Obtaining the CONFUSION MATRIX 
# Plotting it using a heatmap for better visualization of the model

In [None]:
confusion_df = pd.DataFrame(confusion_matrix(y_test_in, y_pred_out))
confusion_df.index = ['Actual -1', 'Actual 0', 'Actual 1']
confusion_df.columns = ['Predicted -1', 'Predicted 0', 'Predicted 1']
confusion_df

In [None]:
plt.figure(figsize=(12, 8))
sns.heatmap(confusion_df, annot=True, fmt='d', linewidths=0.5) 
plt.yticks(rotation=0)
plt.xticks(rotation=45)
plt.show()