<a href="https://colab.research.google.com/github/Will170393/MSc-Project---Stance-Detection/blob/master/(view)_msc_project_bert_processing_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing libraries**

In [0]:
# libraries for processing data
import numpy as np
import pandas as pd
import re

# libraries for loading files from drive
from google.colab import drive
drive.mount('/content/gdrive')

# libraries for natural language processing
from nltk import FreqDist, word_tokenize
import nltk
nltk.download('punkt')

import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Reading data into pandas DataFrames and merging Stances and Bodies**

In [0]:
# Read the stance and body CSV files from the mounted Google Drive into DataFrames.
# The train/test split already exists as separate files, so nothing is split here.
# NOTE(review): the paths are relative ('gdrive/...') while the drive is mounted at
# '/content/gdrive', so these reads rely on the working directory being /content — confirm.
trainStances = pd.read_csv('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_stances.csv')
trainBodies = pd.read_csv('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_bodies.csv')
testStances = pd.read_csv('gdrive/My Drive/Colab Notebooks/MSC_project_data/competition_test_stances.csv')
testBodies = pd.read_csv('gdrive/My Drive/Colab Notebooks/MSC_project_data/competition_test_bodies.csv')

In [0]:
def mergeStances_Bodies(stances, bodies):
  """Join stance rows to their article bodies.

  Performs an inner merge on the 'Body ID' column, which must be present in
  both DataFrames; rows whose 'Body ID' has no match on the other side are dropped.

  Returns a new DataFrame; neither input is modified.
  """
  merged = stances.merge(bodies, on='Body ID', how='inner')
  return merged

In [0]:
# Build the merged training and test DataFrames: each stance row is joined to its
# article body through the shared 'Body ID' column (inner merge, see mergeStances_Bodies).
train = mergeStances_Bodies(trainStances, trainBodies)
test = mergeStances_Bodies(testStances, testBodies)

**Data Cleaning**

In [0]:
def data_cleaning(data, col):
  """Normalise the text stored in data[col], writing the result back in place.

  For every value in the column:
    1. convert it to str;
    2. delete each run of characters that is neither a word character
       (letter, digit or underscore) nor whitespace -- matches are removed,
       not replaced by a space;
    3. split the remainder with nltk's word_tokenize and lower-case each token;
    4. join the tokens back into one space-separated string.

  Returns None; the DataFrame column is overwritten.
  """
  not_word_or_space = re.compile(r'[^\w\s]+')

  def _clean(value):
    stripped = not_word_or_space.sub('', str(value))
    lowered = [token.lower() for token in word_tokenize(stripped)]
    return ' '.join(lowered)

  data[col] = [_clean(value) for value in data[col].tolist()]

In [0]:
# Clean the Headline and articleBody text of both DataFrames.
# data_cleaning assigns the cleaned text back into the given column, so `train`
# and `test` are modified in place and the raw text is no longer available afterwards.
data_cleaning(train, 'Headline')
data_cleaning(train, 'articleBody')
data_cleaning(test, 'Headline')
data_cleaning(test, 'articleBody')

In [0]:
# Load the pre-defined stop-word file from Google Drive into a list, one entry per line.
# 'utf-8-sig' strips a leading byte-order mark so the first word is not polluted by it.
# The `with` block guarantees the file is closed; the previous bare `f.close`
# (without parentheses) only referenced the method and never closed the file.
with open('gdrive/My Drive/Colab Notebooks/MSC_project_data/project_stopwords_final.txt', encoding='utf-8-sig') as f:
  stop_words = f.read().split('\n')

In [0]:
def remove_stop_words(data, col, stop_words):
  """Remove stop words from the text in data[col], writing the result back in place.

  Each value is split on whitespace, tokens that appear in `stop_words` are
  dropped, and the surviving tokens are re-joined with single spaces.

  Args:
    data: DataFrame whose column `col` holds strings.
    col: name of the column to filter.
    stop_words: iterable of stop-word strings (e.g. the list read from file).

  Returns None; the DataFrame column is overwritten.
  """
  # Build the lookup once: set membership is constant time, whereas testing
  # against the original list rescans it for every token of every document.
  stop_set = frozenset(stop_words)
  data[col] = [
      ' '.join(word for word in text.split() if word not in stop_set)
      for text in data[col]
  ]

In [0]:
# Remove stop words from the Headline and articleBody columns of both DataFrames.
# remove_stop_words overwrites the column it is given, so `train` and `test` change in place.
remove_stop_words(train, 'Headline', stop_words)
remove_stop_words(train, 'articleBody', stop_words)
remove_stop_words(test, 'Headline', stop_words)
remove_stop_words(test, 'articleBody', stop_words)

**BERT**

In [0]:
# NOTE(review): pytorch_pretrained_bert is already imported in the first code cell, so on a
# fresh runtime that import would fail unless the package is installed beforehand —
# run this cell first or move it above the imports.
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 2.8MB/s 
Collecting regex (from pytorch-pretrained-bert)
[?25l  Downloading https://files.pythonhosted.org/packages/6f/4e/1b178c38c9a1a184288f72065a65ca01f3154df43c6ad898624149b8b4e0/regex-2019.06.08.tar.gz (651kB)
[K     |████████████████████████████████| 655kB 8.8MB/s 
Building wheels for collected packages: regex
  Building wheel for regex (setup.py) ... [?25l[?25hdone
  Created wheel for regex: filename=regex-2019.6.8-cp36-cp36m-linux_x86_64.whl size=604146 sha256=7d39a2d4b4c0d9870ab6ebb1f32ba7e1936782e70300ee529ad8a70735b5cd6b
  Stored in directory: /root/.cache/pip/wheels/35/e4/80/abf3b33ba89cf65cd262af8a22a5a999cc28fbfabea6b38473
Successfully built regex
Installing collected packages: regex, pytorch-pretrained-ber

In [0]:
# Load the pre-trained tokenizer (vocabulary) for 'bert-base-uncased'.
# The module-level `tokenizer` is used by tokenize_text and add_indices below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 414959.03B/s]


In [0]:
def add_special_tokens(data, col):
  """Wrap every document in data[col] with BERT's start and end markers.

  "[CLS] " is prepended and " [SEP]" is appended to each string of the
  column; the column is overwritten in place and nothing is returned.
  """
  start_marker, end_marker = "[CLS] ", " [SEP]"
  data[col] = start_marker + data[col] + end_marker

In [0]:
# Add the "[CLS]" / "[SEP]" markers to the Headline and articleBody columns of both DataFrames.
# add_special_tokens overwrites the column in place, so re-running this cell would wrap the text again.
add_special_tokens(train, 'Headline')
add_special_tokens(train, 'articleBody')
add_special_tokens(test, 'Headline')
add_special_tokens(test, 'articleBody')

In [0]:
# Preview the first rows of the training DataFrame after the special tokens were added.
# NOTE(review): the saved output below still shows mixed-case text with punctuation, so it was
# presumably produced without the cleaning cells having run — confirm and re-run if stale.
train.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,[CLS] Police find mass graves with at least '1...,712,unrelated,[CLS] Danny Boyle is directing the untitled fi...
1,[CLS] Seth Rogen to Play Apple’s Steve Wozniak...,712,discuss,[CLS] Danny Boyle is directing the untitled fi...
2,[CLS] Mexico police find mass grave near site ...,712,unrelated,[CLS] Danny Boyle is directing the untitled fi...
3,[CLS] Mexico Says Missing Students Not Found I...,712,unrelated,[CLS] Danny Boyle is directing the untitled fi...
4,[CLS] New iOS 8 bug can delete all of your iCl...,712,unrelated,[CLS] Danny Boyle is directing the untitled fi...


In [0]:
def tokenize_text(data):
  """Tokenize every headline and article body, capping over-long token lists.

  Uses the module-level BERT `tokenizer` on the 'Headline' and 'articleBody'
  columns of each row. A token list longer than 512 entries is cut down to its
  first 510 tokens followed by its original last token (the closing marker
  when add_special_tokens has been applied), i.e. 511 tokens in total.
  Lists of 512 tokens or fewer are kept unchanged.

  Returns:
    (tokenized_heads, tokenized_bodies): two lists of token lists, in row order.
  """
  def _cap(tokens):
    # Within the limit: nothing to do.
    if len(tokens) <= 512:
      return tokens
    # Keep the head of the sequence and re-attach the original final token.
    return tokens[:510] + [tokens[-1]]

  tokenized_heads, tokenized_bodies = [], []
  for _, row in data.iterrows():
    tokenized_heads.append(_cap(tokenizer.tokenize(row['Headline'])))
    tokenized_bodies.append(_cap(tokenizer.tokenize(row['articleBody'])))
  return tokenized_heads, tokenized_bodies

In [0]:
# Tokenize every headline and article body of the training DataFrame
# (one token list per row for each of the two columns).
tokenized_train_heads, tokenized_train_bodies = tokenize_text(train)

In [0]:
# Tokenize every headline and article body of the test DataFrame
# (one token list per row for each of the two columns).
tokenized_test_heads, tokenized_test_bodies = tokenize_text(test)

In [0]:
def add_indices(tokenized_data):
  """Map every token of every document to its vocabulary index.

  Args:
    tokenized_data: iterable of token lists, one per document.

  Returns:
    A list with one entry per document, each the result of the module-level
    `tokenizer.convert_tokens_to_ids` for that document's tokens.
  """
  # The unused (and misspelt) `indexed_tokenis` placeholder list was dropped.
  return [tokenizer.convert_tokens_to_ids(document) for document in tokenized_data]

In [0]:
# Convert the tokenized training headlines and bodies into lists of vocabulary indices,
# one index list per document.
index_tokens_train_heads = add_indices(tokenized_train_heads)
index_tokens_train_bodies = add_indices(tokenized_train_bodies)

In [0]:
# Convert the tokenized test headlines and bodies into lists of vocabulary indices,
# one index list per document.
index_tokens_test_heads = add_indices(tokenized_test_heads)
index_tokens_test_bodies = add_indices(tokenized_test_bodies)

In [0]:
# Spot check: print the (token, vocabulary index) pairs of the training body at position 251.
for tup in zip(tokenized_train_bodies[251], index_tokens_train_bodies[251]):
  print(tup)

In [0]:
# Load the pre-trained 'bert-base-uncased' model weights.
# `model` is passed to get_sentence_vectors in the cells below.
model = BertModel.from_pretrained('bert-base-uncased')

100%|██████████| 407873900/407873900 [00:35<00:00, 11347640.26B/s]


In [0]:
def get_sentence_vectors(index_token_list, model):
  """Compute one sentence vector per document by mean-pooling model hidden states.

  Args:
    index_token_list: iterable of vocabulary-index lists, one per document.
    model: callable taking (tokens_tensor, segments_tensor) and returning a pair
      whose first item is an indexable collection of per-layer hidden states;
      `model.eval()` is called once before any document is processed.

  Returns:
    A list with one torch tensor per document: the mean over dimension 1 of
    the hidden states at layer index 11.
  """
  model.eval()
  sentence_vectors = []

  # Each document is run through the model on its own (batch of one).
  for index_list in index_token_list:
    # Model input: the token ids plus a same-length list of segment ids, all set to 1.
    tokens_tensor = torch.tensor([index_list])
    segments_tensors = torch.tensor([[1] * len(index_list)])

    # Gradients are not needed for feature extraction.
    with torch.no_grad():
      encoded_layers, _ = model(tokens_tensor, segments_tensors)

    # encoded_layers is indexed as [layer][batch][token]; individual word
    # embeddings could be read from it that way, but only the pooled vector of
    # layer index 11 is kept here (presumably the final layer of the 12-layer
    # base model -- confirm if a different model is passed in).
    pooled = torch.mean(encoded_layers[11], 1)
    sentence_vectors.append(pooled)

  return sentence_vectors

In [0]:
# Compute one sentence vector per training headline.
# The result is a list of torch tensors (see get_sentence_vectors), not yet a NumPy array.
train_heads_sentence_vectors = get_sentence_vectors(index_tokens_train_heads, model)

In [0]:
# Split the training-body index lists into five consecutive slices: four of 10,000 documents
# and a final one with everything from position 40,000 onwards. Each slice is embedded and
# saved by its own cell below (presumably to limit memory/runtime per cell — confirm).
index_tokens_train_bodies_1 = index_tokens_train_bodies[:10000]
index_tokens_train_bodies_2 = index_tokens_train_bodies[10000:20000]
index_tokens_train_bodies_3 = index_tokens_train_bodies[20000:30000]
index_tokens_train_bodies_4 = index_tokens_train_bodies[30000:40000]
index_tokens_train_bodies_5 = index_tokens_train_bodies[40000:]

In [0]:
# Split the test-body index lists into three consecutive slices: two of 10,000 documents
# and a final one with everything from position 20,000 onwards.
index_tokens_test_bodies_1 = index_tokens_test_bodies[:10000]
index_tokens_test_bodies_2 = index_tokens_test_bodies[10000:20000]
index_tokens_test_bodies_3 = index_tokens_test_bodies[20000:]

In [0]:
# Save the training-headline sentence vectors to Google Drive.
# The vectors come from get_sentence_vectors as torch tensors, so each one is converted to a
# NumPy array first, as the other save cells do. The previous code passed the undefined name
# `heads_sentence_vectors`, which raises NameError on a fresh kernel.
# The conversion is done inline so that re-running this cell keeps working.
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_heads_sentence_vectors.npy',
        [t.numpy() for t in train_heads_sentence_vectors])

In [0]:
# Embed the first slice of training bodies, convert each torch tensor to a NumPy array
# and save the result to Google Drive.
# NOTE(review): the naming is reversed relative to the cells for slices 2-5 — here
# `train_bodies_sentence_vectors_1` holds the tensors and `bodies_sentence_vectors_1` the arrays.
train_bodies_sentence_vectors_1 = get_sentence_vectors(index_tokens_train_bodies_1, model)
bodies_sentence_vectors_1 = [t.numpy() for t in train_bodies_sentence_vectors_1]
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_bodies_sentence_vectors_1.npy', bodies_sentence_vectors_1)

In [0]:
# Embed the second slice of training bodies, convert each torch tensor to a NumPy array
# and save the result to Google Drive.
bodies_sentence_vectors_2 = get_sentence_vectors(index_tokens_train_bodies_2, model)
train_bodies_sentence_vectors_2 = [t.numpy() for t in bodies_sentence_vectors_2]
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_bodies_sentence_vectors_2.npy', train_bodies_sentence_vectors_2)


In [0]:
# Embed the third slice of training bodies, convert each torch tensor to a NumPy array
# and save the result to Google Drive.
bodies_sentence_vectors_3 = get_sentence_vectors(index_tokens_train_bodies_3, model)
train_bodies_sentence_vectors_3 = [t.numpy() for t in bodies_sentence_vectors_3]
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_bodies_sentence_vectors_3.npy', train_bodies_sentence_vectors_3)

In [0]:
# Embed the fourth slice of training bodies, convert each torch tensor to a NumPy array
# and save the result to Google Drive.
bodies_sentence_vectors_4 = get_sentence_vectors(index_tokens_train_bodies_4, model)
train_bodies_sentence_vectors_4 = [t.numpy() for t in bodies_sentence_vectors_4]
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_bodies_sentence_vectors_4.npy', train_bodies_sentence_vectors_4)

In [0]:
# Embed the fifth (final) slice of training bodies, convert each torch tensor to a NumPy array
# and save the result to Google Drive.
bodies_sentence_vectors_5 = get_sentence_vectors(index_tokens_train_bodies_5, model)
train_bodies_sentence_vectors_5 = [t.numpy() for t in bodies_sentence_vectors_5]
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_bodies_sentence_vectors_5.npy', train_bodies_sentence_vectors_5)

In [0]:
# Embed the first slice of test bodies, replace the list of torch tensors with NumPy arrays
# (same variable name) and save the result to Google Drive.
test_bodies_sentence_vectors_1 = get_sentence_vectors(index_tokens_test_bodies_1, model)
test_bodies_sentence_vectors_1 = [t.numpy() for t in test_bodies_sentence_vectors_1]
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/test_bodies_sentence_vectors_1.npy', test_bodies_sentence_vectors_1)

In [0]:
# Embed the second slice of test bodies, replace the list of torch tensors with NumPy arrays
# (same variable name) and save the result to Google Drive.
test_bodies_sentence_vectors_2 = get_sentence_vectors(index_tokens_test_bodies_2, model)
test_bodies_sentence_vectors_2 = [t.numpy() for t in test_bodies_sentence_vectors_2]
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/test_bodies_sentence_vectors_2.npy', test_bodies_sentence_vectors_2)

In [0]:
# Embed the third (final) slice of test bodies, replace the list of torch tensors with NumPy
# arrays (same variable name) and save the result to Google Drive.
test_bodies_sentence_vectors_3 = get_sentence_vectors(index_tokens_test_bodies_3, model)
test_bodies_sentence_vectors_3 = [t.numpy() for t in test_bodies_sentence_vectors_3]
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/test_bodies_sentence_vectors_3.npy', test_bodies_sentence_vectors_3)

In [0]:
# Embed all test headlines in one pass, replace the list of torch tensors with NumPy arrays
# (same variable name) and save the result to Google Drive.
test_heads_sentence_vectors = get_sentence_vectors(index_tokens_test_heads, model)
test_heads_sentence_vectors = [t.numpy() for t in test_heads_sentence_vectors]
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/test_heads_sentence_vectors.npy', test_heads_sentence_vectors)

In [0]:
# Load the saved sentence vectors back from Google Drive as NumPy arrays.
# The in-memory variables of the same names (lists produced by the cells above) are overwritten.

# headline vectors (training set)
train_heads_sentence_vectors = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_heads_sentence_vectors.npy')

# headline vectors (test set)
test_heads_sentence_vectors = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/test_heads_sentence_vectors.npy')

# article-body vectors (training set, five slices)
train_bodies_sentence_vectors_1 = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_bodies_sentence_vectors_1.npy')
train_bodies_sentence_vectors_2 = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_bodies_sentence_vectors_2.npy')
train_bodies_sentence_vectors_3 = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_bodies_sentence_vectors_3.npy')
train_bodies_sentence_vectors_4 = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_bodies_sentence_vectors_4.npy')
train_bodies_sentence_vectors_5 = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/train_bodies_sentence_vectors_5.npy')

# article-body vectors (test set, three slices)
test_bodies_sentence_vectors_1 = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/test_bodies_sentence_vectors_1.npy')
test_bodies_sentence_vectors_2 = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/test_bodies_sentence_vectors_2.npy')
test_bodies_sentence_vectors_3 = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/test_bodies_sentence_vectors_3.npy')


In [0]:
def reduce_dimension(array):
  """Return `array` with every length-1 axis removed (numpy.squeeze).

  Note that all singleton axes are dropped, so an input holding a single
  vector also loses its leading dimension.
  """
  squeezed = np.squeeze(array, axis=None)
  return squeezed

In [0]:
# Remove the length-1 axes from every loaded array (reduce_dimension is np.squeeze).
# NOTE(review): presumably each array is (n_docs, 1, hidden_size) and becomes
# (n_docs, hidden_size) — confirm against the saved files.
train_heads_sentence_vectors = reduce_dimension(train_heads_sentence_vectors)
test_heads_sentence_vectors = reduce_dimension(test_heads_sentence_vectors)

train_bodies_sentence_vectors_1 = reduce_dimension(train_bodies_sentence_vectors_1)
train_bodies_sentence_vectors_2 = reduce_dimension(train_bodies_sentence_vectors_2)
train_bodies_sentence_vectors_3 = reduce_dimension(train_bodies_sentence_vectors_3)
train_bodies_sentence_vectors_4 = reduce_dimension(train_bodies_sentence_vectors_4)
train_bodies_sentence_vectors_5 = reduce_dimension(train_bodies_sentence_vectors_5)

test_bodies_sentence_vectors_1 = reduce_dimension(test_bodies_sentence_vectors_1)
test_bodies_sentence_vectors_2 = reduce_dimension(test_bodies_sentence_vectors_2)
test_bodies_sentence_vectors_3 = reduce_dimension(test_bodies_sentence_vectors_3)

In [0]:
# Concatenate the five training-body arrays along the first axis, in slice order (1 to 5),
# which is the order the slices were cut from index_tokens_train_bodies.
train_bodies = np.concatenate((train_bodies_sentence_vectors_1, train_bodies_sentence_vectors_2, train_bodies_sentence_vectors_3,
                              train_bodies_sentence_vectors_4, train_bodies_sentence_vectors_5))


In [0]:
# Concatenate the three test-body arrays along the first axis, in slice order (1 to 3),
# which is the order the slices were cut from index_tokens_test_bodies.
test_bodies = np.concatenate((test_bodies_sentence_vectors_1, test_bodies_sentence_vectors_2, test_bodies_sentence_vectors_3))

In [0]:
# Save the final arrays back to Google Drive under the bert_* file names:
# the squeezed headline vectors and the concatenated body vectors, for train and test.
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/bert_train_heads.npy', train_heads_sentence_vectors)
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/bert_train_bodies.npy', train_bodies)
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/bert_test_heads.npy', test_heads_sentence_vectors)
np.save('gdrive/My Drive/Colab Notebooks/MSC_project_data/bert_test_bodies.npy', test_bodies)

In [0]:
# Reload the final headline and body embedding arrays from Google Drive into bert_* variables.
bert_train_heads = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/bert_train_heads.npy')
bert_train_bodies = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/bert_train_bodies.npy')
bert_test_heads = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/bert_test_heads.npy')
bert_test_bodies = np.load('gdrive/My Drive/Colab Notebooks/MSC_project_data/bert_test_bodies.npy')