# Training - Cornell Dataset

## Imports

In [62]:
from tensorflow.keras import regularizers, layers, losses, preprocessing
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from helper_fxns import remove_url, clean_text
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sb
import pandas as pd
import numpy as np
import os, shutil
import string
import nltk
import sys
import re

import pydot

## Checking GPU

In [38]:
if tf.test.gpu_device_name():
  print("Default GPU Device: {}".format(tf.test.gpu_device_name()))
else:
  print('Please install GPU version of TF')

Default GPU Device: /device:GPU:0Metal device set to: Apple M1



2022-10-28 23:25:15.699859: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-10-28 23:25:15.700261: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-10-28 23:25:15.701613: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-10-28 23:25:15.701621: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)



systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



## Converting the Cornell Dataset into DataFrames

The codes in this subsection only needs to be run once, as the data has already been combined into 1 csv file and is saved. Hence, the next time round, this section can just be skipped and the .csv file can just be loaded in directly.

In [27]:
def create_df(tag):
    DIR = os.path.join("review_polarity/txt_sentoken/", tag)
    contents = os.listdir(DIR)

    text = []

    if ".DS_Store" in contents:
        contents.remove(".DS_Store")

    for file in contents:
        file_path = os.path.join(DIR, file)
        with open(file_path) as f:
            lines = f.read()
        lines = lines.replace("\n", " ")
        text.append(lines)
    
    num_texts = len(contents)
    label = [tag] * num_texts

    if len(text) == len(label):
        print("INFO: Correct Length of Text and Label!")
    else:
        print(f"ERROR: Mismatch of Lengths Text : {len(text)}, Label : {len(label)}")
        sys.exit(1)

    data_dict = {"text" : text, "sentiment" : label}

    data_df = pd.DataFrame(data_dict)
    return data_df

In [28]:
pos_df = create_df('pos')
neg_df = create_df('neg')
pos_df.head()

INFO: Correct Length of Text and Label!
INFO: Correct Length of Text and Label!


Unnamed: 0,text,sentiment
0,assume nothing . the phrase is perhaps one of...,pos
1,plot : derek zoolander is a male model . he i...,pos
2,i actually am a fan of the original 1961 or so...,pos
3,a movie that's been as highly built up as the ...,pos
4,""" good will hunting "" is two movies in one : ...",pos


In [29]:
neg_df.head()

Unnamed: 0,text,sentiment
0,bad . bad . bad . that one word seems to pre...,neg
1,isn't it the ultimate sign of a movie's cinema...,neg
2,""" gordy "" is not a movie , it is a 90-minute-...",neg
3,disconnect the phone line . don't accept the ...,neg
4,when robert forster found himself famous again...,neg


### Concatenating the DataFrames

In [30]:
df = pd.concat([pos_df, neg_df], ignore_index=True)
df

Unnamed: 0,text,sentiment
0,assume nothing . the phrase is perhaps one of...,pos
1,plot : derek zoolander is a male model . he i...,pos
2,i actually am a fan of the original 1961 or so...,pos
3,a movie that's been as highly built up as the ...,pos
4,""" good will hunting "" is two movies in one : ...",pos
...,...,...
1995,synopsis : when a meteorite crashlands in the ...,neg
1996,it's now the anniversary of the slayings of ju...,neg
1997,coinciding with the emerging popularity of mov...,neg
1998,and now the high-flying hong kong style of fil...,neg


### Writing the DataFrame into a csv

In [34]:
df.to_csv('cornell_polarity_combined.csv')

## Preprocessing

### Helper Functions to Remove URL and Clean Text

Importing the remove_url and clean_text fucntions to remove any URL in the text and to remove punctuations in the text.

In [41]:
df['text'] = df['text'].apply(remove_url)
df['text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,text,sentiment
0,assume nothing the phrase perhaps one the most...,pos
1,plot derek zoolander male model also very dumb...,pos
2,actually fan the original liveactiondisney fli...,pos
3,movie thats been highly built the truman show ...,pos
4,good will hunting two movies one independent t...,pos


In [32]:
print("Number of texts in each of the Classes: ")
df['sentiment'].value_counts()

Number of texts in each of the Classes: 


pos    1000
neg    1000
Name: sentiment, dtype: int64

By looking at the value counts, we can see that the dataset is balanced and there is an equal distribution between Positive and Negative sentiments.

## Tokenization

Updates a list on vocabulary based on the text. It kind of creates its Vocabulary. This is important before converting texts into sequences. 

In [44]:
#Splitting the dataset into (train+val), test
num_words = 1000

tokenizer = Tokenizer(num_words=num_words,oov_token="unk")
tokenizer.fit_on_texts(df['text'].tolist())

### Splitting Training Sets into Different Splits

In [52]:
#Prior to splitting
print(f"Length of dataset: {len(df)}")
print(f"Length of Train: {int(0.8 * len(df))}")
print(f"Length of Valid: {int(0.2 * len(df))}")

Length of dataset: 2000
Length of Train: 1600
Length of Valid: 400


For test set, we will use the dataset.csv file, which can be loaded in after the training of this model as well.

In [53]:
#splitting into training and validation data
X_train, X_valid, y_train, y_valid = train_test_split(df['text'].tolist(), 
                                                  df['sentiment'].tolist(), 
                                                  test_size=0.2, stratify=df['sentiment'].tolist(),
                                                  random_state=0)

In [56]:
print(f"Length of Train : {len(X_train)}")
print(f"Length of Valid : {len(X_valid)}")

Length of Train : 1600
Length of Valid : 400


In [57]:
#getting the class distribution
print("Class Distributions:")
print(f"Train: {str(Counter(y_train))}")
print(f"Valid: {str(Counter(y_valid))}")

Class Distributions:
Train: Counter({'neg': 800, 'pos': 800})
Valid: Counter({'neg': 200, 'pos': 200})


### Converting Texts to Sequences and Storing them in Arrays

Assigning an index to each word in the text sample. Do note that for the Tokenization, 0 is not assigned to any word, it is instead assigned to any new/unknown word that is not present in the vocabulary.

In [58]:
#Converts the texts to sequences and stores it in an array
x_train = np.array(tokenizer.texts_to_sequences(X_train))
x_valid = np.array(tokenizer.texts_to_sequences(X_valid))

  x_train = np.array(tokenizer.texts_to_sequences(X_train))
  x_valid = np.array(tokenizer.texts_to_sequences(X_valid))


### Finding the Longest Word Sequence

This is important as this kind of defines what the shape of the input should be.

In [61]:
#finding the longest word sequence
train_num_words_text = pd.DataFrame(X_train, columns=['text'])['text'].apply(lambda x:len(str(x).split()))
max_len_train = max(train_num_words_text)

valid_num_words_text = pd.DataFrame(X_valid, columns=['ext'])['text'].apply(lambda x:len(str(x).split()))
max_len_valid = max(valid_num_words_text)

test_num_words_text = pd.DataFrame(X_test, columns=['Text'])['Text'].apply(lambda x:len(str(x).split()))
max_len_test = max(test_num_words_text)

print('Longest Sentence in terms of words in Train: ', max_len_train)
print('Longest Sentence in terms of words in Valid: ', max_len_valid)
print('Longest Sentence in terms of words in Test: ', max_len_test)

Number of Words in the Longest Sentence in Train :  1512
Number of Words in the Longest Sentence in Valid :  1880


Therefore, in order to account for the both Trian and the Valid 