<a href="https://colab.research.google.com/github/ShaanHossain/NLPFinalProject/blob/master/Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive
# from google.colab import drive # import drive from google colab
# ROOT = "/content/drive"     # default location for the drive
# print(ROOT)                 # print content of ROOT (Optional
# drive.mount(ROOT)           # we mount the google drive at /content/drive

# 1. Importing Dependencies

In [2]:
#import files
import sys
from csv import reader
from typing import List
from nltk.tokenize import RegexpTokenizer
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation
import numpy as np
import inflect
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch 
import torch.nn as nn
import scipy.io as sp
import sklearn.metrics
# import numpy as np
# import sys
import matplotlib.pyplot as plt

# 2. Initial Setup - Defining the driving variables

In [3]:
# Determines which dataset use and how much to use :
# HateSpeech: Column-0 : Sentence, Column-1 : Label [noHate-0, Hate-1]
# either 'HateSpeech' or 'KaggleTwitter' or 'TDavidson'
dataset_to_use = "HateSpeech"
dataset_percentage = 100  # percentage range 1 to 100

# Initializes file path, column of csv file to parse and
# the delimiter for parsing
training_file = ""
test_file = ""
sentence_column_to_parse = None
label_column_to_parse = None
lancaster = LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
delimiter = ","
if dataset_to_use == "HateSpeech":
    training_file = "datasets/hate-speech/train.txt"
    test_file = "datasets/hate-speech/test.txt"
    delimiter = "\t"
    sentence_column_to_parse = 0
    label_column_to_parse = 1
elif dataset_to_use == "KaggleTwitter":
    training_file = "datasets/kaggle-twitter/train.csv"
    test_file = "datasets/kaggle-twitter/test.csv"
    sentence_column_to_parse = 2
    label_column_to_parse = 1
elif dataset_to_use == "TDavidson":
    training_file = "datasets/t-davidson-hate-speech/labeled_data.csv"
    # TODO: Update test path for this dataset
    # test_file = "datasets/kaggle-twitter/test.csv"
    sentence_column_to_parse = 6
    label_column_to_parse = 2
else:
    print("Invalid Dataset specified")
    sys.exit(1)

# 1. Preprocessing the Data

### Seven tasks are done as part of this:
  - lower word case
  - remove stopwords
  - remove punctuation
  - convert numbers to texts
  - perform stemming
  - Add - \<s> and \</s> for every sentence

In [4]:
def replace_numbers(sentence:List[str]) -> List[str]:
    """Replace all interger occurrences in list of tokenized words
    with textual representation"""
    p = inflect.engine()
    new_words = []  
    for word in sentence.split():
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return " ".join(new_words)

def stem_words(sentence: List[str]) -> List[str]:
    """Stems the given sentence

    Args:
        sentence (list): words to be stemmed

    Returns:
        str: stemmed sentence
    """
    stemmed_words = []
    for word in sentence.split():
        stemmed_words.append(lancaster.stem(word))
    return " ".join(stemmed_words)

def preprocessing(running_lines: List[str]) -> List[str]:
    """This function takes in the running test and return back the
    preprocessed text. Six tasks are done as part of this:
      1. lower word case
      2. remove stopwords
      3. remove punctuation
      4. convert numbers to texts
      5. perform stemming

    Args:
        sentence (List[str]): list of lines

    Returns:
        List[str]: list of sentences which are processed
    """
    preprocessed_lines = []
    tokenizer = RegexpTokenizer(r"\w+")
    for line in running_lines:
        # lower case
        lower_case_data = line.lower()
        # remove stop words
        data_without_stop_word = remove_stopwords(lower_case_data)
        # remove punctunation
        data_without_punct = strip_punctuation(data_without_stop_word)
        # replace numbers '1' to 'one'
        processed_data = replace_numbers(data_without_punct)
        # stem words
        processed_data = stem_words(processed_data)
        # add start and stop tags
        # processed_data.insert(0, "<s>")
        # processed_data.append("</s>")
        preprocessed_lines.append(processed_data)
    return preprocessed_lines

# 2. Parsing the data

In [5]:
def parse_data(training_file_path: str, percentage: int,
               sentence_column: int, label_column: int,
               delimit: str):
    """This function is used to parse input lines
    and returns a the provided percent of data.

    Args:
        lines (List[str]): list of lines
        percentage (int): percent of the dataset needed
        sentence_column (int): sentence column from the dataset
        label_column (int): label column from the dataset
        delimit (str): delimiter
    Returns:
        List[str], List[str]: examples , labels -> [percentage of dataset]
    """
    percentage_sentences = []
    percentage_labels = []
    with open(training_file_path, "r", encoding="utf8",
              errors="ignore") as csvfile:
        read_sentences = []
        label_sentences = []
        csv_reader = reader(csvfile, delimiter=delimit)
        # skipping header
        header = next(csv_reader)
        # line_length = len(list(csv_reader_copy))
        if header is not None:
            for row in csv_reader:
                read_sentences.append(row[sentence_column])
                label_sentences.append(row[label_column])
        end_of_data = int(len(read_sentences) * percentage * .01)
        percentage_sentences = read_sentences[0:end_of_data]
        percentage_labels = label_sentences[0:end_of_data]
    return percentage_sentences, percentage_labels

In [6]:
train_sentences, train_labels = parse_data(training_file,
                                           dataset_percentage,
                                           sentence_column_to_parse,
                                           label_column_to_parse,
                                           delimiter)
# parse and preprocess the data
processed_train_sentences = preprocessing(train_sentences)
# verify the processed sentences
# for sentence in sentences:
#     print(sentence)
# This is the baseline classifier
print(
    f"Performing Improved - BiLSTM on {dataset_to_use}"
    f" with {dataset_percentage} % data ")

Performing Improved - BiLSTM on HateSpeech with 100 % data 


# 3. Building Model

### 3.1 Generating word embeddings

For this step, we intend to use the Keras library to build a recurrent neural network based on bidirectional LSTMs. The model will take word embeddings as input so we will use pre-trained GloVe embeddings to make the embedding dictionary.

In [7]:
def convert_sentence_word_embeddings(X_train_sentences:List[str]):
    """Converts the sentences into word embeddings.

    Args:
        X_train_sentences (List[str]): list of training sentences

    Returns:
        tuple: word embeddings for each sentence, vocab size and embedding dictionary
    """
    tokenizer = Tokenizer()
    text = np.array(X_train_sentences)
    tokenizer.fit_on_texts(X_train_sentences)
    # pickle.dump(tokenizer, open('text_tokenizer.pkl', 'wb'))
    # Uncomment above line to save the tokenizer as .pkl file 
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    embeddings_dict = {}
    file_embeddings = open("utils/glove.twitter.27B.50d.txt", encoding="utf8")
    for embedding_line in file_embeddings:
        values = embedding_line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = coefs
    file_embeddings.close()
    print('Total %s word vectors.' % len(embeddings_dict))
    return (text, word_index, embeddings_dict)

In [None]:
X_train_Glove_s, word_index_s, embeddings_dict_s = convert_sentence_word_embeddings(processed_train_sentences)

#First one is sentence embeddings
#Second one is size of vocab
#Third is embeddings dictionary

In [20]:
## Check function
x_train_sample = ["Lorem Ipsum is simply dummy text of the printing and typesetting industry", "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout"]
# X_train_Glove_s, word_index_s, embeddings_dict_s = convert_sentence_word_embeddings(x_train_sample)
# print("\n X_train_Glove_s \n ", X_train_Glove_s)
print("\n Word index of the word testing is : ", word_index_s["industry"])
print("\n Embedding for the word want \n \n", embeddings_dict_s["want"])
len(embeddings_dict_s["want"])

word_index_to_embedding = {}

for i in embeddings_dict_s.keys():
    print(i)
    word_index_to_embedding[word_index_s[i]] =  embeddings_dict_s[i]

print(word_index_to_embedding[0])
    


 Word index of the word testing is :  3039

 Embedding for the word want 
 
 [ 3.7413e-01  7.1623e-01 -3.5810e-01  7.7834e-02 -9.2033e-01 -3.8558e-01
  5.3116e-01 -5.1657e-01  1.2817e-01  2.6236e-01 -2.7895e-01  2.5293e-01
 -5.5937e+00 -5.7123e-01 -9.1184e-01  1.3920e-01 -5.0184e-01 -1.3487e-01
 -6.3133e-01  1.4302e-01  6.3659e-01  2.1426e-01  3.9087e-01  8.3384e-01
  7.4350e-01  3.7236e-01  5.9994e-01  2.7211e-01  5.7034e-02 -1.3236e+00
  1.5121e-01  4.6511e-01 -1.3622e-03  3.6893e-02  7.5773e-01 -5.9615e-01
  8.3311e-02 -3.2657e-01 -1.1019e-02  2.9374e-01 -1.1935e+00 -3.9529e-01
  1.8498e-01 -6.4563e-01  6.0677e-01 -7.2177e-01  7.4921e-01  9.1771e-02
 -9.6784e-02  3.3105e-01]


TypeError: 'builtin_function_or_method' object is not iterable

In [16]:
def converge_to_binary(x):
    x = np.where(x < .5, -1, 1)
    return x

def converge_to_prob(x):
    x = np.where(x < 0., 0., 1.)
    return x

# Defining input size, hidden layer size, output size and batch size respectively
n_in, n_h, n_out, hidden_layers, activation_function = 50, 10, 1, 1, 'lstm'

x = torch.tensor(X_train_Glove_s).float()
y = torch.tensor(train_labels).float()

class MyModule(nn.Module):
    def __init__(self, 
            n_in, 
            neurons_per_hidden, 
            n_out,
            hidden_layers, 
            activation_function,
            embeddings
        ):

        super(MyModule, self).__init__()

        self.n_in = n_in
        self.n_h = neurons_per_hidden
        self.n_out = n_out
        self.h_l = hidden_layers

        self.a_f = activation_function

        self.embeddings = embeddings

        # Inputs to hidden layer linear transformation
        self.input = nn.Linear(n_in, self.n_h)

        # Embedding layer
        self.embedding_layer = nn.Embedding(num_embedding=len(self.embeddings), embedding_dim=n_in)
        # Load weights into embedding layer
        self.embedding_layer.load_state_dict({'weight': self.embeddings})
        # Do not train embedding layer
        self.weight.requires_grad = False

        #probably need a variable for recurrent dropout and dropout layer dropout

        # Defaults to Relu if activation_function is improperly sp
        self.activation_layer = nn.ReLU()

        if activation_function == 'relu':
            self.activation_layer = nn.ReLU()
        elif activation_function == 'tanh':
            self.activation_layer = nn.Tanh()
        elif activation_function == 'sigmoid':
            self.activation_layer == nn.Sigmoid()
        elif activation_function == 'identity':
            self.activation_layer == nn.Identity()
        elif activation_function == 'lstm':
            self.activation_layer == nn.LSTM(bidirectional=True, dropout=0.2)
        else:
            print("Invalid activation function specified")
            sys.exit(1)

        # self.linears = nn.ModuleList([nn.Linear(self.n_h, self.n_h) for i in range(self.h_l - 1)])
        self.activation_layers = nn.ModuleList([self.activation_layer for i in range(self.h_l - 1)])
        self.dropout_layers = nn.ModuleList([nn.Dropout(0.5) for i in range(self.h_l - 1)])
        
        # Output layer, 10 units - one for each digit
        # self.output = nn.Linear(self.n_h, n_out)
        self.output == nn.ReLU()
        # Define sigmoid output
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):

        x = self.input(x)
        x = self.activation_layer(x)

        # ModuleList can act as an iterable, or be indexed using ints
        for i, l in enumerate(self.linears):
            # x = self.linears[i // 2](x) + l(x)
            x = self.activation_layers[i // 2](x) + l(x)
            x = self.dropout_layers[i // 2](x) + l(x)

        x = self.output(x)
        x = self.sigmoid(x)

        return x

model = nn.Sequential(MyModule(n_in, n_h, n_out, hidden_layers, activation_function))
print(model)

# Construct the loss function
criterion = torch.nn.BCELoss()
# Construct the optimizer (Adam in this case)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.05)

# Optimization
for epoch in range(50):
   # Forward pass: Compute predicted y by passing x to the model
   y_pred = model(x)

   y_pred_numpy = y_pred.detach().numpy()
   y_pred_tanh_range = converge_to_binary(y_pred_numpy)
   y_numpy = y.detach().numpy()
#    y_sigmoid_range = converge_to_prob(y_numpy)

#    print(sklearn.metrics.accuracy_score(y_pred_numpy, y))

   # Compute and print loss
   loss = criterion(y_pred, torch.tensor(y_numpy).float())
#    print('epoch: ', epoch,' loss: ', loss.item())

   # Zero gradients, perform a backward pass, and update the weights.
   optimizer.zero_grad()

   # perform a backward pass (backpropagation)
   loss.backward()

   # Update the parameters
   optimizer.step()

# Plotting training data
# plot_data(X, Y)

y_pred_tanh_range_from_train = y_pred_tanh_range

y_pred = model(x_test)
y_pred_numpy = y_pred.detach().numpy()
y_pred_tanh_range = converge_to_binary(y_pred_numpy)

# print("Training Accuracy")
# print(sklearn.metrics.accuracy_score(y_pred_tanh_range_from_train, y))

# print("Testing Accuracy")
# print(sklearn.metrics.accuracy_score(y_pred_tanh_range, y_test))

# Plotting the test data
# plot_data(x_test, y_pred_tanh_range)

ValueError: too many dimensions 'str'