In [None]:
# Mount Google Drive at /content/gdrive; the cells below read the corpus and
# the GloVe vectors from paths under that mount point. This import only
# exists inside Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Strip Data

In [None]:
import glob
import pandas as pd

# Gather one (author, lines) record per training article and build the frame
# once at the end; growing a DataFrame one row at a time with .loc copies the
# accumulated data over and over.
train_records = []
# glob makes no promise about ordering, so sort to get a reproducible row order.
train_paths = sorted(glob.iglob("/content/gdrive/My Drive/C50/C50train/*/*.txt"))
for i, filename in enumerate(train_paths):
  if i % 100 == 0:
    print(i)
  # The author is the directory that directly contains the article; indexing
  # from the end keeps working if the corpus root is moved.
  author = filename.split("/")[-2]
  with open(filename) as file:
    # Deliberately kept as a list of lines: parse_article later splits on the
    # literal backslash-n sequence, which presumably comes from this list
    # being written to CSV in its string form.
    text = file.readlines()
  train_records.append((author, text))
train_df = pd.DataFrame(train_records, columns = ["Author", "Article"])

In [None]:
# Gather one (author, lines) record per test article and build the frame once
# at the end; growing a DataFrame one row at a time with .loc copies the
# accumulated data over and over.
test_records = []
# glob makes no promise about ordering, so sort to get a reproducible row order.
test_paths = sorted(glob.iglob("/content/gdrive/My Drive/C50/C50test/*/*.txt"))
for i, filename in enumerate(test_paths):
  if i % 100 == 0:
    print(i)
  # The author is the directory that directly contains the article; indexing
  # from the end keeps working if the corpus root is moved.
  author = filename.split("/")[-2]
  with open(filename) as file:
    # Deliberately kept as a list of lines: parse_article later splits on the
    # literal backslash-n sequence, which presumably comes from this list
    # being written to CSV in its string form.
    text = file.readlines()
  test_records.append((author, text))
test_df = pd.DataFrame(test_records, columns = ["Author", "Article"])

In [None]:
import os

# Persist both frames next to the corpus so the slow Drive crawl above does
# not have to be repeated in later sessions.
os.chdir("/content/gdrive/My Drive/C50/")
for frame, csv_name in ((train_df, "train_df.csv"), (test_df, "test_df.csv")):
  frame.to_csv(csv_name, index = False)

Preprocessing

In [None]:
import re

def parse_sentence(sentence):
  """Split a sentence into lower-cased words stripped of punctuation.

  The sentence is split on whitespace; every character that is not a letter,
  digit or underscore is removed from each token, and tokens that end up
  empty are dropped.

  Args:
    sentence: the text to tokenise.

  Returns:
    A list of cleaned, lower-cased words (possibly empty).
  """
  words = []
  for token in sentence.split():
    # Raw string: "\W" in a plain string literal is an invalid escape sequence
    # that newer Python versions warn about.
    cleaned = re.sub(r"\W", "", token).lower()
    if cleaned:
      words.append(cleaned)
  return words

In [None]:
def parse_article(text):
  """Break an article into tokenised sentences and their word counts.

  The text is cut at every literal backslash-n sequence (two characters, not
  a real newline) and each piece goes through parse_sentence. Pieces that
  yield no words are discarded.

  Args:
    text: the article as a single string.

  Returns:
    A pair (sentences, sentence_lengths): the list of word lists, and the
    number of words in each of them, in the same order.
  """
  tokenised = (parse_sentence(piece) for piece in text.split("\\n"))
  sentences = [words for words in tokenised if words]
  sentence_lengths = [len(words) for words in sentences]
  return sentences, sentence_lengths

In [None]:
import numpy as np

# Map each GloVe token to its vector. Every line of the file holds a token
# followed by its whitespace-separated float components.
glove_embeddings = {}
# Explicit encoding so the result does not depend on the platform default.
with open("/content/gdrive/My Drive/C50/glove.6B.50d.txt", encoding = "utf-8") as embedding_file:
  for line in embedding_file:
    embedding = line.split()
    if not embedding:
      # Tolerate a blank line instead of failing on embedding[0].
      continue
    word = embedding[0]
    vector = np.asarray(embedding[1:], "float32")
    glove_embeddings[word] = vector

In [None]:
import torch

# Vocabulary built lazily by preprocess(): word -> row in the embedding matrix.
# Row 0 is never assigned to a word; it stands for padding and unknown words.
word_to_index = {}

def preprocess(article, return_lengths = False):
  """Turn an article into a zero-padded tensor of vocabulary indices.

  Args:
    article: the article text, handed unchanged to parse_article.
    return_lengths: when True, skip the tensor and return only the list of
      per-sentence word counts.

  Returns:
    The list of sentence lengths if return_lengths is True. Otherwise an
    int64 tensor with one row per sentence and one column per word position,
    as wide as the longest sentence. Words found in glove_embeddings receive
    an index starting at 1 the first time they are seen (recorded in
    word_to_index); all other words, and the padding after the end of a
    sentence, are 0. An article without any sentence gives a (0, 0) tensor.
  """
  sentences, sentence_lengths = parse_article(article)
  if return_lengths:
    return sentence_lengths
  # default = 0 keeps an article with no usable sentence from raising in max().
  max_sentence_length = max(sentence_lengths, default = 0)
  # Starts out all zeros, so the padding positions need no further writes.
  article_tensor = torch.zeros(len(sentences), max_sentence_length, dtype = torch.int64)
  for i, sentence in enumerate(sentences):
    indices = []
    for word in sentence:
      index = word_to_index.get(word)
      if index is None:
        if word in glove_embeddings:
          index = len(word_to_index) + 1
          word_to_index[word] = index
        else:
          # No pretrained vector: share the padding row.
          index = 0
      indices.append(index)
    # One write per sentence instead of one per word.
    article_tensor[i, :len(indices)] = torch.tensor(indices, dtype = torch.int64)
  return article_tensor

In [None]:
import os
import pandas as pd

# Reload the frames written earlier so this section can start from the CSV
# files instead of re-reading every article from Drive.
os.chdir("/content/gdrive/My Drive/C50/")
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train_df.csv", "test_df.csv"))

In [None]:
# Class label for every author, numbered in order of first appearance.
authors = train_df["Author"].unique()
author_dict = {author:index for (index, author) in enumerate(authors)}

# Move 80% of each author's test articles into the training set.
# NOTE: this cell is not idempotent; running it again moves another 80% of
# whatever is left in test_df.
moved_frames = [
  test_df[test_df["Author"] == author].sample(frac = 0.8, random_state = 87)
  for author in authors
]
# DataFrame.append was removed in pandas 2.0; a single concat also avoids
# copying the growing training frame once per author.
train_df = pd.concat([train_df] + moved_frames, ignore_index = True)
moved_index = [label for moved in moved_frames for label in moved.index]
test_df = test_df.drop(index = moved_index)
# Later cells address test rows by position through .loc, so renumber from 0.
test_df.index = range(test_df.shape[0])

In [None]:
# Index tensors and sentence lengths for both splits. The training frame must
# be handled first: preprocess assigns vocabulary indices in the order words
# are encountered.
for frame in (train_df, test_df):
  frame["Tensors"] = frame["Article"].apply(preprocess)
  frame["Sentence Lengths"] = frame["Article"].apply(preprocess, return_lengths = True)

In [None]:
# Embedding matrix aligned with word_to_index. Row 0 is left at zero because
# no word is ever given index 0; the 50 columns match the glove.6B.50d file
# loaded above.
initial_embeddings = torch.zeros((len(word_to_index) + 1, 50), dtype = torch.float32)
for word, index in word_to_index.items():
  initial_embeddings[index, :] = torch.from_numpy(glove_embeddings[word])

Sentence Level GRU

In [None]:
from torch import nn

class SentenceGRU(nn.Module):
  """Predicts an author for each sentence from the final state of a GRU.

  Words are embedded with a trainable layer initialised from pretrained
  vectors, the padded sentences are packed and run through a single-layer
  GRU, and the last hidden state is projected to class scores.
  """

  def __init__(self, hidden_size, initial_embeddings, num_classes = 50):
    """Build the layers.

    Args:
      hidden_size: size of the GRU hidden state.
      initial_embeddings: 2-D float tensor, one row per vocabulary index; row
        0 is treated as padding.
      num_classes: number of output scores (authors). Defaults to 50, the
        value that used to be hard-coded.
    """
    super().__init__()
    self.embedding_dim = initial_embeddings.size(1)
    # freeze = False lets the pretrained vectors be fine-tuned.
    self.embedding = nn.Embedding.from_pretrained(initial_embeddings, freeze = False, padding_idx = 0)
    self.gru = nn.GRU(self.embedding_dim, hidden_size, batch_first = True)
    self.linear = nn.Linear(hidden_size, num_classes)

  def forward(self, article_tensor, sentence_lengths):
    """Score every sentence of one article.

    Args:
      article_tensor: integer tensor of word indices, one padded sentence per
        row.
      sentence_lengths: true (unpadded) length of each row, in row order.

    Returns:
      Tensor with one row of unnormalised class scores per sentence.
    """
    embeddings = self.embedding(article_tensor)
    # enforce_sorted = False: sentences arrive in document order, not sorted
    # by length.
    gru_inputs = nn.utils.rnn.pack_padded_sequence(
      embeddings, sentence_lengths, batch_first = True, enforce_sorted = False)
    gru_outputs, hidden_state = self.gru(gru_inputs)
    # hidden_state[0] is the final state of the only GRU layer.
    predictions = self.linear(hidden_state[0, :, :])
    return predictions

In [None]:
from torch import optim

# Hyper-parameters for the sentence-level model.
hidden_size = 300
learning_rate = 0.02
num_epochs = 10

# Model on the GPU, cross-entropy over author labels, plain SGD.
model = SentenceGRU(hidden_size = hidden_size, initial_embeddings = initial_embeddings)
model = model.cuda()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr = learning_rate)

In [None]:
# Train the sentence-level model. One "batch" is one article: all of its
# sentences are scored together and all carry the article's author as label.
model.train()
# Shuffled once, up front; every epoch then visits articles in this order.
train_loop_df = train_df.sample(frac = 1, random_state = 2).reset_index()
batches = train_df.shape[0]
for epoch in range(num_epochs):
  print(f"Epoch {epoch}")
  for batch in range(batches):
    tensor = train_loop_df.loc[batch, "Tensors"]
    sentence_lengths = train_loop_df.loc[batch, "Sentence Lengths"]
    author = train_loop_df.loc[batch, "Author"]

    # One target per sentence (row) of the article.
    batch_size = tensor.size(0)
    author_tensor = torch.LongTensor([author_dict[author]] * batch_size).cuda()

    optimizer.zero_grad()
    inputs = tensor.cuda()
    outputs = model(inputs, sentence_lengths)
    loss = loss_fn(outputs, author_tensor)
    loss.backward()
    optimizer.step()

    if batch % 100 == 0:
      print(f"Batch {batch}, loss is {loss.item()}")

Epoch 0
Batch 0, loss is 4.066736698150635
Batch 100, loss is 4.0660529136657715
Batch 200, loss is 4.077706336975098
Batch 300, loss is 3.672333240509033
Batch 400, loss is 3.476971387863159
Batch 500, loss is 3.8298075199127197
Batch 600, loss is 3.8775546550750732
Batch 700, loss is 3.6111555099487305
Batch 800, loss is 3.7829079627990723
Batch 900, loss is 3.6850340366363525
Batch 1000, loss is 4.061131954193115
Batch 1100, loss is 3.8233444690704346
Batch 1200, loss is 3.7961812019348145
Batch 1300, loss is 4.101495265960693
Batch 1400, loss is 3.7292892932891846
Batch 1500, loss is 3.773738384246826
Batch 1600, loss is 3.635838270187378
Batch 1700, loss is 3.7024927139282227
Batch 1800, loss is 3.5741124153137207
Batch 1900, loss is 3.458127498626709
Batch 2000, loss is 3.308328151702881
Batch 2100, loss is 3.692006826400757
Batch 2200, loss is 3.437983989715576
Batch 2300, loss is 3.9135892391204834
Batch 2400, loss is 4.361871242523193
Batch 2500, loss is 3.5496573448181152
Bat

In [None]:
# Evaluate the sentence-level model: collect, for every sentence of every test
# article, the true author label and the predicted class probabilities.
model.eval()
batches = test_df.shape[0]
# Chunks are gathered in lists and concatenated once at the end; calling
# torch.cat on the running result in every iteration re-copies everything
# accumulated so far.
label_chunks = []
prob_chunks = []
for batch in range(batches):
  tensor = test_df.loc[batch, "Tensors"]
  sentence_lengths = test_df.loc[batch, "Sentence Lengths"]

  inputs = tensor.cuda()

  with torch.no_grad():
    outputs = model(inputs, sentence_lengths)
  prob_tensor = nn.functional.softmax(outputs, 1)

  # Every sentence (row) of the article shares the article's author.
  batch_size = tensor.shape[0]
  author = test_df.loc[batch, "Author"]
  label_tensor = torch.LongTensor([author_dict[author]] * batch_size)

  label_chunks.append(label_tensor)
  prob_chunks.append(prob_tensor)

# Stay None for an empty test set, as before.
final_labels = torch.cat(label_chunks) if label_chunks else None
final_probs = torch.cat(prob_chunks) if prob_chunks else None

In [None]:
from sklearn import metrics

# Sentence-level accuracy: the predicted class is the most probable one.
labels = final_labels.numpy()
probs = final_probs.cpu().numpy()
preds = probs.argmax(axis = 1)

metrics.accuracy_score(labels, preds)

0.39871491376394996

Article Level GRU

In [None]:
def pad_article_tensor(tensor, second_sequence_length):
  """Force the second dimension of a 2-D tensor to a fixed width.

  Args:
    tensor: 2-D tensor, one sentence per row.
    second_sequence_length: the required number of columns.

  Returns:
    The first second_sequence_length columns when the tensor is wider, the
    tensor itself when it already has that width, and otherwise a new tensor
    with zero columns appended on the right.
  """
  missing = second_sequence_length - tensor.size(1)
  if missing < 0:
    return tensor[:, :second_sequence_length]
  if missing == 0:
    return tensor
  # Append all missing columns in one go rather than one column per
  # concatenation; the padding takes the input's dtype and device.
  padding = torch.zeros((tensor.size(0), missing), dtype = tensor.dtype, device = tensor.device)
  return torch.cat([tensor, padding], 1)

In [None]:
def pad_batch(tensor, article_lengths):
  """Pad a 2-D tensor with zero rows up to the longest article of a batch.

  Args:
    tensor: 2-D tensor, one sentence per row.
    article_lengths: number of sentences of every article in the batch; must
      not be empty.

  Returns:
    The tensor itself when it already has at least max(article_lengths) rows,
    otherwise a new tensor with zero rows appended at the bottom.
  """
  missing = max(article_lengths) - tensor.size(0)
  if missing <= 0:
    return tensor
  # Append all missing rows in one go rather than one row per concatenation;
  # the padding takes the input's dtype and device.
  padding = torch.zeros((missing, tensor.size(1)), dtype = tensor.dtype, device = tensor.device)
  return torch.cat([tensor, padding], 0)

In [None]:
from torch import nn

class ArticleGRU(nn.Module):
  """Predicts an author for each article from a GRU over its sentences.

  Every sentence is reduced to the average of its word embeddings; the
  resulting sequence of sentence vectors is packed and run through a
  single-layer GRU whose final hidden state is projected to class scores.
  """

  def __init__(self, sequence_length, hidden_size, initial_embeddings, num_classes = 50):
    """Build the layers.

    Args:
      sequence_length: number of word positions per sentence; the pooling
        window covers exactly this many positions.
      hidden_size: size of the GRU hidden state.
      initial_embeddings: 2-D float tensor, one row per vocabulary index; row
        0 is treated as padding.
      num_classes: number of output scores (authors). Defaults to 50, the
        value that used to be hard-coded.
    """
    super().__init__()
    self.embedding_dim = initial_embeddings.size(1)
    # freeze = False lets the pretrained vectors be fine-tuned.
    self.embedding = nn.Embedding.from_pretrained(initial_embeddings, freeze = False, padding_idx = 0)
    # Window of (sequence_length, 1): average over word positions, leave the
    # embedding axis untouched.
    self.pool = nn.AvgPool2d((sequence_length, 1))
    self.gru = nn.GRU(self.embedding_dim, hidden_size, batch_first = True)
    self.linear = nn.Linear(hidden_size, num_classes)

  def forward(self, articles, article_lengths):
    """Score a batch of articles.

    Args:
      articles: integer tensor of word indices, indexed as
        (article, sentence, word position).
      article_lengths: true number of sentences of each article, in batch
        order.

    Returns:
      Tensor with one row of unnormalised class scores per article.
    """
    embeddings = self.embedding(articles)
    # NOTE(review): the mean runs over all word positions of the window, so
    # padded positions appear to be counted in it as well - confirm this is
    # intended.
    pooling_outputs = self.pool(embeddings)
    # The pooled word axis has size 1; index it away to get one vector per
    # sentence. enforce_sorted = False: articles are not sorted by length.
    gru_inputs = nn.utils.rnn.pack_padded_sequence(
      pooling_outputs[:, :, 0, :], article_lengths, batch_first = True, enforce_sorted = False)
    gru_outputs, hidden_state = self.gru(gru_inputs)
    # hidden_state[0] is the final state of the only GRU layer.
    predictions = self.linear(hidden_state[0, :, :])
    return predictions

In [None]:
from torch import optim

# Hyper-parameters for the article-level model. second_sequence_length is the
# fixed number of word positions every sentence is cut or padded to.
second_sequence_length = 25
hidden_size = 300
learning_rate = 0.1
num_epochs = 20
batch_size = 10

# Model on the GPU, cross-entropy over author labels, plain SGD.
model = ArticleGRU(
  sequence_length = second_sequence_length,
  hidden_size = hidden_size,
  initial_embeddings = initial_embeddings)
model = model.cuda()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr = learning_rate)

In [None]:
# Train the article-level model on mini-batches of batch_size articles.
model.train()
# Shuffled once, up front; every epoch then visits articles in this order.
train_loop_df = train_df.sample(frac = 1, random_state = 2).reset_index()
# Whole batches only: a trailing partial batch is not trained on.
batches = train_df.shape[0] // batch_size
for epoch in range(num_epochs):
  print(f"Epoch {epoch}")
  for batch in range(batches):
    # .loc label slices include both endpoints, hence the "- 1".
    batch_start = batch * batch_size
    batch_end = batch_start + batch_size - 1
    batch_tensors = train_loop_df.loc[batch_start:batch_end, "Tensors"]
    batch_lengths = train_loop_df.loc[batch_start:batch_end, "Sentence Lengths"]
    batch_authors = train_loop_df.loc[batch_start:batch_end, "Author"]

    # Bring every article to the same (sentences, words) shape, then stack
    # them along a new leading batch axis.
    article_lengths = [len(sentence_lengths) for sentence_lengths in batch_lengths]
    padded_tensors = [
      pad_batch(pad_article_tensor(tensor, second_sequence_length), article_lengths)
      for tensor in batch_tensors
    ]
    articles_tensor = torch.cat([tensor.unsqueeze(0) for tensor in padded_tensors], 0)
    author_tensor = torch.LongTensor([author_dict[author] for author in batch_authors]).cuda()

    optimizer.zero_grad()
    inputs = articles_tensor.cuda()
    outputs = model(inputs, article_lengths)
    loss = loss_fn(outputs, author_tensor)
    loss.backward()
    optimizer.step()

    if batch % 25 == 0:
      print(f"Batch {batch}, loss is {loss.item()}")

Epoch 0
Batch 0, loss is 3.9354217052459717
Batch 25, loss is 3.911816120147705
Batch 50, loss is 3.7800402641296387
Batch 75, loss is 3.904961347579956
Batch 100, loss is 3.9620633125305176
Batch 125, loss is 3.8098087310791016
Batch 150, loss is 3.8419299125671387
Batch 175, loss is 3.8408584594726562
Batch 200, loss is 3.8589859008789062
Batch 225, loss is 3.8566062450408936
Batch 250, loss is 3.8191380500793457
Batch 275, loss is 3.769514560699463
Batch 300, loss is 3.9169399738311768
Batch 325, loss is 3.932772159576416
Batch 350, loss is 3.8119895458221436
Batch 375, loss is 3.8819470405578613
Batch 400, loss is 3.683781862258911
Batch 425, loss is 3.5658202171325684
Epoch 1
Batch 0, loss is 3.5090394020080566
Batch 25, loss is 3.373082399368286
Batch 50, loss is 3.2425904273986816
Batch 75, loss is 3.154761552810669
Batch 100, loss is 3.9709160327911377
Batch 125, loss is 3.1432266235351562
Batch 150, loss is 3.629978656768799
Batch 175, loss is 3.400782823562622
Batch 200, loss

In [None]:
# Evaluate the article-level model: collect the true author label and the
# predicted class probabilities for every test article.
model.eval()
# Round up so a trailing partial batch is evaluated too; flooring silently
# left those articles out of the reported accuracy.
batches = (test_df.shape[0] + batch_size - 1) // batch_size
# Chunks are gathered in lists and concatenated once at the end; calling
# torch.cat on the running result in every iteration re-copies everything
# accumulated so far.
label_chunks = []
prob_chunks = []
for batch in range(batches):
  # .loc label slices include both endpoints, hence the "- 1". The last slice
  # may reach past the final row and then simply comes back shorter.
  batch_start = batch * batch_size
  batch_end = batch_start + batch_size - 1
  batch_tensors = test_df.loc[batch_start:batch_end, "Tensors"]
  batch_lengths = test_df.loc[batch_start:batch_end, "Sentence Lengths"]

  # Bring every article to the same (sentences, words) shape, then stack them
  # along a new leading batch axis.
  padded_tensors = [pad_article_tensor(tensor, second_sequence_length) for tensor in batch_tensors]
  article_lengths = [len(sentence_lengths) for sentence_lengths in batch_lengths]
  articles_tensor = torch.cat([pad_batch(tensor, article_lengths).unsqueeze(0) for tensor in padded_tensors], 0)

  inputs = articles_tensor.cuda()

  with torch.no_grad():
    outputs = model(inputs, article_lengths)
  prob_tensor = nn.functional.softmax(outputs, 1)

  batch_authors = test_df.loc[batch_start:batch_end, "Author"]
  label_tensor = torch.LongTensor([author_dict[author] for author in batch_authors])

  label_chunks.append(label_tensor)
  prob_chunks.append(prob_tensor)

# Stay None for an empty test set, as before.
final_labels = torch.cat(label_chunks) if label_chunks else None
final_probs = torch.cat(prob_chunks) if prob_chunks else None

In [None]:
from sklearn import metrics

# Article-level accuracy: the predicted class is the most probable one.
labels = final_labels.numpy()
probs = final_probs.cpu().numpy()
preds = probs.argmax(axis = 1)

metrics.accuracy_score(labels, preds)

0.562