In [1]:
import os
import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl

In [3]:
import gensim.downloader as dl
from gensim.models import KeyedVectors

In [8]:
pretrained_weights_name = "word2vec-google-news-300"
# Path where the archive is expected once gensim's downloader has fetched it.
model_dl_path = os.path.join(
    dl.BASE_DIR, pretrained_weights_name, f"{pretrained_weights_name}.gz")

# dl.load() is used in both cases (it is what fetches the model when it is
# missing), so the existence check only selects the status message.
if os.path.exists(model_dl_path):
    print(f"Loading model from {model_dl_path}")
else:
    print(f"Model will be downloaded at {model_dl_path}")

# Load through the single configured name so the path check above and the
# model actually loaded can never drift apart.
gnews_embeddings = dl.load(pretrained_weights_name)


Loading model from /Users/shawon/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [10]:
# Keys of the loaded embeddings, taken from `index_to_key`.
vocabulary = gnews_embeddings.index_to_key
# Number of entries in the vocabulary; shown as the cell output.
vocab_len = len(vocabulary)
vocab_len

3000000

In [17]:
# https://github.com/Oneplus/Tweebank

# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
train_file = "/Users/shawon/Downloads/Tweebank-dev/converted/en-ud-tweet-train.fixed.conllu"
# The corpus contains non-ASCII characters (e.g. "…"), so decode it explicitly
# as UTF-8 instead of relying on the platform's default encoding.
with open(train_file, encoding="utf-8") as f:
    data = f.readlines()

# Preview only the first lines rather than dumping the whole corpus.
data[:20]

['# tweet_id = feb_jul_16.1463316480\n',
 "# text = RT @USER991: Dear diary,       I've been rapping in 3 accents and no longer know which one is truly mine. I am a sadting - Drake URL217…\n",
 '1\tRT\trt\tX\t_\t_\t10\tdiscourse\t_\t_\n',
 '2\t@USER991\t@USER\tX\t_\t_\t1\tdiscourse\t_\tSpaceAfter=No\n',
 '3\t:\t:\tPUNCT\t_\t_\t1\tpunct\t_\t_\n',
 '4\tDear\tdear\tADJ\t_\t_\t5\tamod\t_\t_\n',
 '5\tdiary\tdiary\tNOUN\t_\t_\t10\tvocative\t_\tSpaceAfter=No\n',
 '6\t,\t,\tPUNCT\t_\t_\t10\tpunct\t_\t_\n',
 '7\tI\ti\tPRON\t_\t_\t10\tnsubj\t_\tSpaceAfter=No\n',
 "8\t've\t've\tAUX\t_\t_\t10\taux\t_\t_\n",
 '9\tbeen\tbe\tAUX\t_\t_\t10\taux\t_\t_\n',
 '10\trapping\trap\tVERB\t_\t_\t0\troot\t_\t_\n',
 '11\tin\tin\tADP\t_\t_\t13\tcase\t_\t_\n',
 '12\t3\tNUMBER\tNUM\t_\t_\t13\tnummod\t_\t_\n',
 '13\taccents\taccent\tNOUN\t_\t_\t10\tobl\t_\t_\n',
 '14\tand\tand\tCCONJ\t_\t_\t17\tcc\t_\t_\n',
 '15\tno\tno\tADV\t_\t_\t16\tadvmod\t_\t_\n',
 '16\tlonger\tlonger\tADV\t_\t_\t17\tadvmod\t_\t_\n',
 '17\tknow\

In [19]:
# Group the raw lines into tweets: a blank line ("\n") ends the current tweet.
tweets = list()
buffer = list()
for tw in data:
    if tw == "\n":
        # Separator reached: close the current tweet. Skip empty buffers so
        # consecutive blank lines do not produce empty tweets.
        if buffer:
            tweets.append(buffer)
            buffer = []
    else:
        # Still inside the current tweet: keep collecting its lines.
        buffer.append(tw)

# Flush the last tweet when the file does not end with a blank line;
# otherwise it would be silently dropped.
if buffer:
    tweets.append(buffer)
    buffer = []

tweets[0]

['# tweet_id = feb_jul_16.1463316480\n',
 "# text = RT @USER991: Dear diary,       I've been rapping in 3 accents and no longer know which one is truly mine. I am a sadting - Drake URL217…\n",
 '1\tRT\trt\tX\t_\t_\t10\tdiscourse\t_\t_\n',
 '2\t@USER991\t@USER\tX\t_\t_\t1\tdiscourse\t_\tSpaceAfter=No\n',
 '3\t:\t:\tPUNCT\t_\t_\t1\tpunct\t_\t_\n',
 '4\tDear\tdear\tADJ\t_\t_\t5\tamod\t_\t_\n',
 '5\tdiary\tdiary\tNOUN\t_\t_\t10\tvocative\t_\tSpaceAfter=No\n',
 '6\t,\t,\tPUNCT\t_\t_\t10\tpunct\t_\t_\n',
 '7\tI\ti\tPRON\t_\t_\t10\tnsubj\t_\tSpaceAfter=No\n',
 "8\t've\t've\tAUX\t_\t_\t10\taux\t_\t_\n",
 '9\tbeen\tbe\tAUX\t_\t_\t10\taux\t_\t_\n',
 '10\trapping\trap\tVERB\t_\t_\t0\troot\t_\t_\n',
 '11\tin\tin\tADP\t_\t_\t13\tcase\t_\t_\n',
 '12\t3\tNUMBER\tNUM\t_\t_\t13\tnummod\t_\t_\n',
 '13\taccents\taccent\tNOUN\t_\t_\t10\tobl\t_\t_\n',
 '14\tand\tand\tCCONJ\t_\t_\t17\tcc\t_\t_\n',
 '15\tno\tno\tADV\t_\t_\t16\tadvmod\t_\t_\n',
 '16\tlonger\tlonger\tADV\t_\t_\t17\tadvmod\t_\t_\n',
 '17\tknow\

In [26]:
# Format of a token line: 10 tab-separated fields, as seen in the data above:
# index - word - lemma - POS tag - _ - _ - head index - relation - _ - misc
# NOTE(review): this appears to be the standard CoNLL-U column layout
# (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC) — confirm.

'4\tDear\tdear\tADJ\t_\t_\t5\tamod\t_\t_\n'.split("\t")


['dear', 'ADJ', '_', '_', '5', 'amod', '_', '_\n']

In [41]:
# We need the fields at indices 1, 2 and 3: word, lemma and POS tag.

class ConlluRowInfo:
    """Word-level information kept from a single token line.

    Attributes:
        word: The token as it appears in the text.
        lemma: The lemma given for the token.
        pos: The part-of-speech tag given for the token.
    """
    word: str
    lemma: str
    pos: str

    def __init__(self, word: str, lemma: str, pos: str) -> None:
        """Store the word, lemma and POS tag of one token."""
        self.word = word
        self.lemma = lemma
        self.pos = pos

    def __str__(self) -> str:
        """Return the three fields as a labelled, ordered mapping."""
        # A dict (not a set) keeps the fields in a fixed order and does not
        # collapse equal values, e.g. when the word and its lemma are identical.
        rep = {
            "word": self.word,
            "lemma": self.lemma,
            "pos": self.pos,
        }
        return str(rep)

    def __repr__(self) -> str:
        """Return an unambiguous representation, used when printing containers."""
        return (
            f"{type(self).__name__}(word={self.word!r}, "
            f"lemma={self.lemma!r}, pos={self.pos!r})"
        )

In [42]:
from typing import List

class ConlluRow:
    """One tweet: its raw text plus the per-token information parsed from it.

    Attributes:
        info: Per-token entries of the tweet, in the order they were given.
        text: The text of the tweet.
    """
    # Forward references (quoted) so this class can be defined without
    # ConlluRowInfo already existing at class-creation time.
    info: List["ConlluRowInfo"]
    text: str

    def __init__(self, infos: List["ConlluRowInfo"], text: str) -> None:
        """Store the token entries and the tweet text."""
        self.info = infos
        self.text = text

    def __str__(self) -> str:
        """Return a readable summary of the token entries and the text."""
        # Formatting a list directly uses each element's repr(); render every
        # entry through str() so the entries show up in readable form.
        rendered = ", ".join(str(entry) for entry in self.info)
        return f"info : [{rendered}] text: {self.text}"

In [46]:
# Convert every raw tweet (a list of lines) into a ConlluRow.
structured_tweets = list()

for tweet in tweets:
    # The first two lines are read as the "# tweet_id = ..." and "# text = ..."
    # headers; anything shorter cannot be parsed, so report it and move on.
    if len(tweet) < 2:
        print(tweet)
        continue

    # Drop the header prefix and the trailing newline left over from readlines().
    text = tweet[1].replace("# text = ", "", 1).rstrip("\n")

    info_in_tweet = list()
    for infos in tweet[2:]:
        fields = infos.rstrip("\n").split("\t")
        # Indices 1, 2 and 3 hold word, lemma and POS tag. Lines with fewer
        # fields are printed for inspection and skipped.
        if len(fields) < 4:
            print(fields)
            continue
        word, lemma, tag = fields[1], fields[2], fields[3]
        info_in_tweet.append(ConlluRowInfo(word, lemma, tag))

    structured_tweets.append(ConlluRow(info_in_tweet, text))

In [47]:
# Print the text of the first structured tweet.
print(structured_tweets[0].text)


RT @USER991: Dear diary,       I've been rapping in 3 accents and no longer know which one is truly mine. I am a sadting - Drake URL217…



In [48]:
# Keep the first structured tweet as a sample and show its text as the cell output.
sample = structured_tweets[0]
sample.text

"RT @USER991: Dear diary,       I've been rapping in 3 accents and no longer know which one is truly mine. I am a sadting - Drake URL217…\n"

In [50]:
# Print each token of the sample tweet next to its lemma.
for inf in sample.info:
    print(inf.word, inf.lemma)

RT rt
@USER991 @USER
: :
Dear dear
diary diary
, ,
I i
've 've
been be
rapping rap
in in
3 NUMBER
accents accent
and and
no no
longer longer
know know
which which
one one
is be
truly truly
mine mine
. .
I i
am be
a a
sadting sadting
- -
Drake drake
URL217 URL
… …
