# 1. Document Classification

In [1]:
import tensorflow as tf
print(tf.__version__)
tf.config.list_physical_devices('GPU')

2.4.1


[]

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import string
import csv
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /home/sarita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sarita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## (a) Preprocess

In [3]:
def import_pretrained_embeddings(path):
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict

In [4]:
def find_closest_embeddings(embedding):
    return sorted(embeddings_dict.keys(), key=lambda word: spatial.distance.euclidean(embeddings_dict[word], embedding))

In [5]:
author_encode = {'fd':0, 'ja': 1, 'acd': 2}

In [6]:
def import_data(filepaths_dict, embedding):
    
    stop_words = set(nltk.corpus.stopwords.words('english'))
    
#     lst_all = []
    x_all = []
    y_all = []
    
    for author, filepath in filepaths_dict.items():
        # get rows from .txt file
        with open(filepath) as f:
            reader = f.read()
            reader = reader.split("\n\n")

            tokenized_paragraphs = []
            print(f"author {author} has {len(reader)} samples")
            # iterate through every text row to clean it up
            for sample_idx, paragraph in enumerate(reader):
                # 1. remove punctuations
                paragraph = paragraph.translate(str.maketrans('','',string.punctuation))
                paragraph = paragraph.replace('\n', ' ')
                # 2. tokenize
                tokens = nltk.word_tokenize(paragraph)
                # 3. remove stop words

                vectors = []
                
                for token in tokens:
                    if not token in stop_words:
                        try:
                            vector = embedding[token.lower()]
                            vectors.append(vector)
                        except KeyError:
                            continue
                            
#                 thisrow = [vectors, author]
#                 lst_all.append(thisrow)
                x_all.append(vectors)
                y_all.append(author_encode[author])
                
    
    return x_all, y_all



Run the functions defined above

In [7]:
embeddings_dict = import_pretrained_embeddings('glove.6B.50d.txt')

In [8]:
filepaths_dict = {'fd': './a4-data/q1/fd.txt',
            'acd': './a4-data/q1/acd.txt',
            'ja': './a4-data/q1/ja.txt'}

x, y = import_data(filepaths_dict, embeddings_dict)

author fd has 6055 samples
author acd has 2548 samples
author ja has 11495 samples


In [9]:
y = np.array(y)
y = tf.keras.utils.to_categorical(y, num_classes=3)

In [10]:
assert(len(x) == len(y) == 6055+2548+11495)

In [11]:
vocab_size = len(embeddings_dict.keys())

In [12]:
x = tf.keras.preprocessing.sequence.pad_sequences(x, maxlen=500)


### Split into training and testing

In [13]:
X_train, x, y_train, y = train_test_split(x, y, test_size=0.4, random_state=42)

In [14]:
X_val, X_test, y_val, y_test = train_test_split(x, y, test_size=0.25, random_state=28)

In [15]:
print("Sizes of sets:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_val:", X_val.shape)
print("y_val:", y_val.shape)
print("X_test", X_test.shape)
print("y_test", y_test.shape)

Sizes of sets:
X_train: (12058, 500, 50)
y_train: (12058, 3)
X_val: (6030, 500, 50)
y_val: (6030, 3)
X_test (2010, 500, 50)
y_test (2010, 3)


In [16]:
# assert(len(X_train) + len(X_val) + len(X_test) == len(x))

## (b) Build your models

### LSTM using only last hidden state

In [28]:
# define model
last_hs_model = tf.keras.Sequential()
# last_hs_model.add(tf.keras.layers.Embedding(vocab_size, 50, input_length=500))
# model.add(tf.keras.layers.LSTM(100, return_sequences=True))
last_hs_model.add(tf.keras.layers.LSTM(100, input_shape=(500,50)))
last_hs_model.add(tf.keras.layers.Dense(100, activation='relu'))
last_hs_model.add(tf.keras.layers.Dense(3, activation='softmax'))
print(last_hs_model.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dense_10 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 303       
Total params: 70,803
Trainable params: 70,803
Non-trainable params: 0
_________________________________________________________________
None


In [32]:
last_hs_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [35]:
class_weights = {0: 0.6985, 1:0.8732, 2:0.4280}

In [33]:
last_hs_history = last_hs_model.fit(x=X_train, y=y_train, validation_data=(X_val, y_val), batch_size=16, epochs=4)

### LSTM using the element wise average of all the hidden states

In [153]:
# define model
all_hs_model = tf.keras.Sequential()
# all_hs_model.add(tf.keras.layers.Embedding(vocab_size, 50))
all_hs_model.add(tf.keras.layers.LSTM(100, return_sequences=True))
all_hs_model.add(tf.keras.layers.GlobalAveragePooling1D())
all_hs_model.add(tf.keras.layers.Dense(100, activation='relu'))
all_hs_model.add(tf.keras.layers.Dense(3, activation='softmax'))
print(all_hs_model.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 50)          20000000  
_________________________________________________________________
lstm_5 (LSTM)                (None, None, 100)         60400     
_________________________________________________________________
global_average_pooling1d (Gl (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_7 (Dense)              (None, 3)                 303       
Total params: 20,070,803
Trainable params: 20,070,803
Non-trainable params: 0
_________________________________________________________________
None
