In [5]:
import pandas as pd
import torch
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm 
import codecs
import random
from matplotlib import pyplot as plt
import os
import zipfile
import re
from torch.utils.data import Dataset, DataLoader


# Seed the Python, NumPy and PyTorch random number generators with one shared
# value so that repeated runs give consistent results.
SEED = 234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Optional: enabling the two cuDNN settings below makes runs deterministic,
# but may have a performance impact.
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False

# Load data - from GitHub.
# Five tab-separated OffensEval 2019 files: the training set, the trial set
# (used for validation) and the released test sets for tasks A, B and C.
train_url = 'https://raw.githubusercontent.com/JeanKaddour/OffensEval2019-TeamMDJ/master/data/start_kit/training-v1/offenseval-training-v1.tsv'
val_url   = 'https://raw.githubusercontent.com/JeanKaddour/OffensEval2019-TeamMDJ/master/data/start_kit/trial-data/offenseval-trial.txt'
test1_url = 'https://raw.githubusercontent.com/JeanKaddour/OffensEval2019-TeamMDJ/master/data/test_a_release/testset-taska.tsv'
test2_url = 'https://raw.githubusercontent.com/JeanKaddour/OffensEval2019-TeamMDJ/master/data/test_b_release/testset-taskb.tsv'
test3_url = 'https://raw.githubusercontent.com/JeanKaddour/OffensEval2019-TeamMDJ/master/data/test_c_release/test_set_taskc.tsv'

# Every file is read with the same options. na_filter=False turns off
# missing-value detection, so label cells such as 'NULL' stay plain strings.
train_set, val_set, test_set1, test_set2, test_set3 = (
  pd.read_csv(url, sep='\t', na_filter=False)
  for url in (train_url, val_url, test1_url, test2_url, test3_url)
)

# Alternative to rebuilding the embedding matrix: load a previously saved copy.
#wvecs = np.load("wvecs.npy")
print(train_set.head())

"""Tokenizes and preprocesses a corpus. Adapted from Romain Paulus and Jeffrey\
Pennington from https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb but
changed it to make it work"""
def get_tokenized_corpus(corpus, verbose=False):
  """Tokenize and preprocess a corpus of tweets.

  Adapted from the GloVe Twitter preprocessing script by Romain Paulus and
  Jeffrey Pennington
  (https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb).

  Every sentence is split on single spaces and each piece is rewritten by
  the following rules, in this order:

  * URLs are prefixed with ``<url>`` (the URL text itself is kept).
  * ``@USER`` and ``@mentions`` become ``<user>``.
  * Emoticons written with a nose character become ``<smile>``,
    ``<lolface>``, ``<sadface>`` or ``<neutralface>``; ``<3`` becomes
    ``<heart>``.
  * Tokens consisting only of a number become ``<number>``.
  * Hashtags are wrapped in ``<hashtag> ... <hashtag_end>``; mixed-case
    hashtags are split into their words.
  * Repeated trailing punctuation becomes one mark followed by ``<repeat>``;
    a single trailing punctuation mark is split off the word.
  * A run of two or more capital letters is followed by ``<allcaps>``.
  * Finally everything is lower-cased and split on spaces again.

  Args:
    corpus: iterable of sentence strings.
    verbose: when True, print each token rewritten by the URL, hashtag,
      punctuation and all-caps rules.

  Returns:
    A list holding, for every input sentence, a list of token strings.
  """
  tokenized_corpus = []
  for sentence in tqdm(corpus):
    tokenized_sentence = []
    for token in sentence.split(' '):
      token = token.strip()
      if token == '':
        continue
      # Detect URLs. All patterns are raw strings: in an ordinary literal
      # "\b" is a backspace character rather than a word boundary, which
      # prevented the http(s) branch from ever matching.
      match = re.search(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", token)
      if match:
        token = '<url> ' + match[0]
        if verbose: print("url: ", token)
      token = token.replace("@USER", "<user>")
      token = re.sub(r"@\w+", "<user>", token)
      # Emoticons: eyes, then a mandatory nose character, then the mouth.
      token = re.sub(r"[8:=;]['`\-][)d]+|[)d]+['`\-][8:=;]", " <smile>", token)
      token = re.sub(r"[8:=;]['`\-]p+", " <lolface>", token)
      token = re.sub(r"[8:=;]['`\-]\(+|\)+['`\-][8:=;]", " <sadface>", token)
      token = re.sub(r"[8:=;]['`\-][\/|l*]", " <neutralface>", token)
      token = re.sub(r"<3", " <heart>", token)
      token = re.sub(r"^[-+]?[.\d]*[\d]+[:,.\d]*$", " <number>", token)
      # Split hashtags on uppercase letters. The whole token is replaced,
      # so anything in it outside the hashtag is dropped.
      hashtag = re.search(r"#\S+", token)
      if hashtag:
        hashtag_body = hashtag[0][1:]
        if hashtag_body == hashtag_body.upper():
          # No lower-case letters: keep the body in one piece.
          token = "<hashtag> " + hashtag_body + " <hashtag_end>"
        else:
          # Only chunks of optional capitals followed by lower-case letters
          # are kept.
          parts = re.findall(r"[A-Z]*[a-z]+", hashtag_body)
          token = " ".join(["<hashtag>", *parts, "<hashtag_end>"])
        if verbose: print("hashtag: ", token)
      # Mark punctuation repetitions (e.g. "!!!" => "! <repeat>").
      match = re.search(
          r"(?P<word>[^!.,?:;]*)(?P<punctuation>[!.,?:;]){2,}$", token)
      if match:
        token = match["word"] + " " + match["punctuation"] + " <repeat>"
        if verbose: print("rep punct: ", token)
      # Split single punctuation:
      else:
        match = re.search(r"(?P<word>.*)(?P<punctuation>[!.,?:;])$", token)
        if match:
          token = match["word"] + " " + match["punctuation"]
          if verbose: print("single punct: ", token)
      # The greedy prefix makes "w" the end of the last run of capitals, so
      # <allcaps> is inserted directly after that run.
      match = re.search(r"(?P<p>.* *)(?P<w>[A-Z]{2,})(?P<r>.*)", token)
      if match:
        if match["r"] != " ":
          token = match["p"] + match["w"] + " <allcaps> " + match["r"]
        else:
          token = match["p"] + match["w"] + " <allcaps>"
        if verbose: print("allcaps: ", token)
      # The rules above may have inserted spaces: split again and drop the
      # empty pieces.
      tokenized_sentence.extend(
          piece for piece in token.lower().strip().split(" ") if piece != "")
    tokenized_corpus.append(tokenized_sentence)
  return tokenized_corpus

def print_corpus_tokens(tokenized_corpus, corpus, sentences):
  """Print raw text and tokens side by side for the selected rows.

  Args:
    tokenized_corpus: sequence of token lists, indexed like the rows of
      ``corpus``.
    corpus: table object whose ``.values[row, 1]`` holds the raw text.
    sentences: iterable of row indices to display.

  For every index the raw text is printed first, then the token list, then
  an empty line.
  """
  for row in sentences:
    print(corpus.values[row, 1])
    # end="\n\n" supplies the blank separator line after the token list.
    print(tokenized_corpus[row], end="\n\n")
    
def get_word2idx(tokenized_corpus):
  """Build a token -> integer index mapping for a tokenized corpus.

  Tokens are numbered from 1 in order of first appearance. Index 0 is
  reserved for the ``'<pad>'`` placeholder token, which is always present in
  the result.

  Args:
    tokenized_corpus: iterable of sentences, each an iterable of token
      strings.

  Returns:
    A dict mapping every distinct token to a unique index. The indices are
    exactly ``0 .. len(result) - 1``.
  """
  word2idx = {}
  for sentence in tokenized_corpus:
    for token in sentence:
      # Dict membership is O(1), so the build is linear in the corpus size.
      # A literal '<pad>' in the corpus is never numbered: it has to map to
      # the reserved index 0, and numbering it would leave a gap that pushes
      # the largest index out of range for a table of len(word2idx) rows.
      if token != '<pad>' and token not in word2idx:
        word2idx[token] = len(word2idx) + 1
  # we reserve the 0 index for the placeholder token
  word2idx['<pad>'] = 0
  return word2idx

# Tokenize the training tweets (column 1 of the training table) and index the
# resulting vocabulary.
tokenized_train_corpus = get_tokenized_corpus(train_set.values[:,1])
word2idx = get_word2idx(tokenized_train_corpus)

# One 100-dimensional row per vocabulary index. Rows of words that are not
# found in the GloVe file below stay all-zero.
wvecs = np.zeros((len(word2idx), 100))

with codecs.open('glove.twitter.27B/glove.twitter.27B.100d.txt', 'r','utf-8') as f:
  for line in tqdm(f.readlines()):
    # Parse each line once: first field is the word, the rest is its vector.
    fields = line.split()
    # Lines with three fields or fewer cannot hold a full vector; skip them.
    if len(fields) > 3:
      word = fields[0]
      if word in word2idx:
        wvecs[word2idx[word]] = list(map(float, fields[1:]))
      # NOTE(review): the result of this draw is discarded, so it has no
      # effect on wvecs. It may have been meant to initialise rows of
      # out-of-vocabulary words - confirm. It is kept because removing it
      # would change the state of the seeded global NumPy RNG.
      np.random.normal(scale=0.01, size=(100, ))

  1%|          | 103/13240 [00:00<00:12, 1026.54it/s]

      id                                              tweet subtask_a  \
0  86426  @USER She should ask a few native Americans wh...       OFF   
1  90194  @USER @USER Go home you’re drunk!!! @USER #MAG...       OFF   
2  16820  Amazon is investigating Chinese employees who ...       NOT   
3  62688  @USER Someone should'veTaken" this piece of sh...       OFF   
4  43605  @USER @USER Obama wanted liberals &amp; illega...       NOT   

  subtask_b subtask_c  
0       UNT      NULL  
1       TIN       IND  
2      NULL      NULL  
3       UNT      NULL  
4      NULL      NULL  


100%|██████████| 13240/13240 [00:03<00:00, 3990.88it/s]
100%|██████████| 1193515/1193515 [00:18<00:00, 64071.23it/s]


In [6]:
# Write the embedding matrix to disk in NumPy's .npy format.
np.save(file="wvecs.npy", arr=wvecs)