In [None]:
import numpy as np
import matplotlib.pyplot as plt
import glob
import cv2
import pandas as pd
import re
import torch
import transformers

import nltk
from nltk.stem import WordNetLemmatizer

# Downloads the WordNet corpus into the local nltk_data folder.
# NOTE(review): presumably needed by WordNetLemmatizer (imported above) — confirm where the lemmatizer is used.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Set to False when running outside Google Colab; the Drive mount and directory change below are then skipped.
googleColab = True

if googleColab:
  # google.colab is imported inside the branch so it is only required when the flag is set
  from google.colab import drive
  drive.mount('/content/drive')
  # CHANGE THIS PATH IF USING COLAB: it must point at the assignment folder, because the
  # relative paths used later (./Dataset, ./datasetBothModels) are resolved from this directory.
  %cd "/content/drive/MyDrive/Msc Artificial Intelligence/Semester 1/Applied Artificial Intelligence/Assignment"

Mounted at /content/drive
/content/drive/MyDrive/Msc Artificial Intelligence/Semester 1/Applied Artificial Intelligence/Assignment


In [None]:
# Reads the Twitter dataset into a pandas DataFrame.
# Column names are passed explicitly through `names`, and the file is decoded as ISO-8859-1.
dataframe = pd.read_csv(
    "./Dataset/training.csv",
    names=['target','ids','date','flag','user','tweet'],
    encoding='ISO-8859-1',
)

In [None]:
# Maps the positive label 4 to 1 and every other value to 0, so the target suits binary classification.
# A label that is already 1 stays 1, which makes the cell safe to re-run: with a plain `== 4` test,
# a second execution found no 4s left and silently turned every label into 0.
dataframe['target'] = np.where(dataframe['target'].isin([4, 1]), 1, 0)

In [None]:
# Keeps a reproducible random 6.25% of the rows (seeded with random_state=2).
# The original note puts this at 100,000 data points; the exact count depends on the size of training.csv.
# NOTE: the cell reassigns `dataframe`, so running it again samples the already-reduced frame.
dataframe = dataframe.sample(
    random_state=2,
    frac=0.0625,
)

In [None]:
# Shows the sampled frame as the cell output for a quick visual check
dataframe

Unnamed: 0,target,ids,date,flag,user,tweet
520816,0,2192225041,Tue Jun 16 06:42:38 PDT 2009,NO_QUERY,Calvinrockstar,Chris just farted in the UJFM studio... thats ...
21413,0,1557274263,Sun Apr 19 03:04:38 PDT 2009,NO_QUERY,georgeharito,Grrr can't center a div in Safari for some rea...
394704,0,2055698901,Sat Jun 06 09:47:41 PDT 2009,NO_QUERY,Kristinanana,is wearing gloves inside as she is so coldddd
1029363,1,1932761306,Tue May 26 22:09:34 PDT 2009,NO_QUERY,Scyranth,@smackthis hey baby!
1376248,1,2051721356,Fri Jun 05 22:38:44 PDT 2009,NO_QUERY,tini_oreo,@mileycyrus can't wait to hear the full song o...
...,...,...,...,...,...,...
504904,0,2188253651,Mon Jun 15 21:33:55 PDT 2009,NO_QUERY,icstephaniex,"@brianlee87 im done packing, but i have to cle..."
970567,1,1828046615,Sun May 17 11:58:57 PDT 2009,NO_QUERY,asimplicity,Time to work on the yard. Will be back later
721199,0,2261118837,Sat Jun 20 20:17:30 PDT 2009,NO_QUERY,yourfavealison,going to pick @surahurvey up frm work.... i mi...
491331,0,2183788671,Mon Jun 15 14:59:19 PDT 2009,NO_QUERY,ellietricity,why does keeping trim have to be so bloody lon...


## Pre-Processing

In [None]:
# Project-local helper: textPreProcessing.py has to be importable from the working directory
from textPreProcessing import DataProcessor

cleaner = DataProcessor()

# Overwrites the tweet column with the cleaned text returned by CleanTextData
dataframe = dataframe.assign(tweet=cleaner.CleanTextData(dataframe["tweet"]))

## Transformer Embedding Generation

### DistilBERT

In [None]:
# Loads the DistilBERT tokenizer and model from Hugging Face for embedding generation.
# The checkpoint name and the two classes keep their own names so they stay available to later cells.
pretrainedWeights = 'distilbert-base-uncased'
tokenizerClass = transformers.DistilBertTokenizer
modelClass = transformers.DistilBertModel

distillTokenizer = tokenizerClass.from_pretrained(pretrainedWeights)
distillModel = modelClass.from_pretrained(pretrainedWeights)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Turns every tweet into a fixed-length list of DistilBERT token ids: special tokens are added,
# longer inputs are truncated and shorter ones are padded, so each list holds exactly 512 ids.
# NOTE(review): 512 is likely much longer than a cleaned tweet needs — confirm before reducing it.
distillTokenized = dataframe['tweet'].apply(
    lambda tweetText: distillTokenizer.encode(
        tweetText,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=512,
    )
)

In [None]:
# Stacks the padded id lists into a single tensor with one row per tweet
input_ids = torch.tensor(distillTokenized.tolist())
# The mask is 1 for every non-zero id and 0 wherever the id is 0, which this code treats as padding
attention_mask = (input_ids != 0).long()
# Plain Python list of the binary targets, in the same row order as input_ids
labels = dataframe["target"].tolist()

In [None]:
def runOneInferenceDistill(inputIds, attentionMask, labels, fileCount):
  """Embeds one batch with DistilBERT and writes the embeddings and labels to disk.

  Args:
    inputIds: token-id tensor for the batch; copied to the GPU here.
    attentionMask: attention-mask tensor matching inputIds; copied to the GPU here.
    labels: labels belonging to the batch, stored unchanged with np.save.
    fileCount: batch number used to build the two output file names.

  Uses the notebook-level `distillModel`. Nothing is returned; the results are the files
  batch{fileCount}.npy and batch{fileCount}labels.npy in ./datasetBothModels/distillBert/batches/.
  """
  gpuIds = inputIds.cuda()
  gpuMask = attentionMask.cuda()

  # Evaluation mode and no gradient tracking: this is inference only
  distillModel.eval()
  with torch.no_grad():
    modelOutput = distillModel(gpuIds, attention_mask=gpuMask)
    # First output element, sliced at sequence position 0 (the CLS token), then moved back to NumPy
    batchEmbeddings = modelOutput[0][:, 0, :].cpu().numpy()

  np.save(f"./datasetBothModels/distillBert/batches/batch{fileCount}.npy", batchEmbeddings)
  np.save(f"./datasetBothModels/distillBert/batches/batch{fileCount}labels.npy", labels)

In [None]:
# Runs DistilBERT over every sampled tweet in batches of 256 and saves each batch to disk
batchSize = 256
distillModel = distillModel.cuda()
numSamples = input_ids.shape[0]

# Stepping by start offset keeps the final, shorter batch. The earlier
# range(int(numSamples / batchSize)) floored the batch count and never embedded the leftover rows.
for i, start in enumerate(range(0, numSamples, batchSize)):
  end = start + batchSize  # slicing past the end just yields the shorter last batch

  runOneInferenceDistill(input_ids[start:end], attention_mask[start:end], labels[start:end], i)

### MiniLM

In [None]:
# Loads the MiniLM tokenizer and model from Hugging Face; both come from the same checkpoint
miniLMCheckpoint = 'sentence-transformers/all-MiniLM-L12-v2'

miniLMTokenizer = transformers.AutoTokenizer.from_pretrained(miniLMCheckpoint)
miniLMModel = transformers.AutoModel.from_pretrained(miniLMCheckpoint)

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

In [None]:
# Collects the binary targets as a plain Python list, in dataframe row order
labels = dataframe["target"].tolist()

In [None]:
def runOneInferenceMiniLM(inputs, labels, fileCount):
  """Embeds one tokenized batch with MiniLM and writes the embeddings and labels to disk.

  Args:
    inputs: tokenizer output for the batch; unpacked as keyword arguments for the model.
      The caller is responsible for placing it on the same device as the model.
    labels: labels belonging to the batch, stored unchanged with np.save.
    fileCount: batch number used to build the two output file names.

  Uses the notebook-level `miniLMModel`. Nothing is returned; the results are the files
  batch{fileCount}.npy and batch{fileCount}labels.npy in ./datasetBothModels/miniLM/batches/.
  """
  # Evaluation mode and no gradient tracking: this is inference only
  miniLMModel.eval()
  with torch.no_grad():
    modelOutput = miniLMModel(**inputs)
    # First output element, sliced at sequence position 0 (the CLS token), then moved back to NumPy
    batchEmbeddings = modelOutput[0][:, 0, :].cpu().numpy()

  np.save(f"./datasetBothModels/miniLM/batches/batch{fileCount}.npy", batchEmbeddings)
  np.save(f"./datasetBothModels/miniLM/batches/batch{fileCount}labels.npy", labels)

In [None]:
# Iterates over the whole dataset in batches of 1024, saving the MiniLM outputs of each batch
batchSize = 1024
miniLMModel = miniLMModel.cuda()

# Converts the tweets to a list for the tokenizer
data = dataframe['tweet'].tolist()

# The batch count comes from the real data length rather than a hard-coded 100000, and stepping
# by start offset keeps the final, shorter batch that int(100000 / batchSize) used to drop.
for i, start in enumerate(range(0, len(data), batchSize)):
  end = start + batchSize  # slicing past the end just yields the shorter last batch
  # Tokenizes the tweets of this batch (padded/truncated to 256 tokens) and sends them to the GPU
  inputs = miniLMTokenizer(data[start:end], add_special_tokens=True, truncation=True, padding='max_length', max_length=256, return_tensors='pt').to('cuda')
  runOneInferenceMiniLM(inputs, labels[start:end], i)