In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import regex as re
import pickle
import operator
from textblob import TextBlob

import warnings
warnings.filterwarnings('ignore')

from utils import *
import helper_dict

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tylerpoore/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tylerpoore/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tylerpoore/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# load embeddings
def load_embed(file):
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float')
    
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))
    return embeddings_index

def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
            pass
        
    print("found embeddings for {:.2%} of vocab".format(len(known_words) / len(vocab)))
    print("found embeddings for {:.2%} of all text".format(nb_known_words / ((nb_known_words + nb_unknown_words))))
    
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]
    
    return unknown_words

def add_lower(embedding, vocab):
    count = 0
    for word in vocab:
        if word in embedding and word.lower() not in embedding:
            embedding[word.lower()] = embedding[word]
            count += 1
    print(f"Added {count} words to embedding")
    
def known_contrations(embed, contraction_mapping):
    known = []
    for contraction in contraction_mapping:
        if contraction in embed:
            known.append(contraction)
    return known

def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

def expand_contractions(text, mapping):
    for word in text.split():
        if word.lower() in mapping.keys():
            text.replace(word, mapping[word.lower()])
    return text

def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': '', '\x0e': ''} 
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text

def unknown_punct(embed, punct):
    unknown = ''
    for p in punct:
        if p not in embed:
            unknown += p
            unknown += ' '
    return unknown

In [3]:
%%time
glove = "/Volumes/Data/embeddings/glove.840B.300d.txt"
print('Extracting GloVe embedding')
embed_glove = load_embed(glove)

Extracting GloVe embedding
CPU times: user 3min 22s, sys: 31.4 s, total: 3min 54s
Wall time: 11min 39s


In [4]:
train = pd.read_pickle('data/train.pkl')
test = pd.read_pickle('data/test.pkl')

In [5]:
# # combine for vocab
# train['all_text'] = train['title'] + " " + train['selftext']

In [6]:
vocab = build_vocab(train['title'] + " " + train['selftext'])

In [7]:
oov_glove = check_coverage(vocab, embed_glove)

found embeddings for 32.04% of vocab
found embeddings for 86.39% of all text


In [8]:
add_lower(embed_glove, vocab)

Added 1161 words to embedding


In [9]:
oov_glove = check_coverage(vocab, embed_glove)

found embeddings for 32.12% of vocab
found embeddings for 86.41% of all text


In [10]:
oov_glove[:15]

[("What's", 7287),
 ('says,', 4134),
 ('"I', 4036),
 ('said,', 3608),
 ('Me:', 2053),
 ("he's", 1641),
 ('common?', 1493),
 ("couldn't", 1425),
 ("they're", 1394),
 ('"I\'m', 1285),
 ('replies,', 1274),
 ('"What', 1257),
 ('me,', 1158),
 ('"You', 1146),
 ('says:', 1137)]

In [11]:
contraction_mapping = pd.read_pickle('helper_contractions.pickle')
known_contrations(embed_glove, contraction_mapping)

["can't",
 "'cause",
 "didn't",
 "doesn't",
 "don't",
 "I'd",
 "I'll",
 "I'm",
 "I've",
 "i'd",
 "i'll",
 "i'm",
 "i've",
 "it's",
 "ma'am",
 "o'clock",
 "that's",
 "you'll",
 "you're"]

In [12]:
train[['title', 'selftext']] = train[['title', 'selftext']].applymap(lambda x: clean_contractions(x, contraction_mapping))
train[['title', 'selftext']] = train[['title', 'selftext']].applymap(lambda x: expand_contractions(x, contraction_mapping))

In [13]:
vocab = build_vocab(train['title'] + " " + train['selftext'])
oov_glove = check_coverage(vocab, embed_glove)

found embeddings for 32.13% of vocab
found embeddings for 87.03% of all text


In [14]:
punct_mapping = pd.read_pickle('helper_punct_dict.pickle')
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
train[['title', 'selftext']] = train[['title', 'selftext']].applymap(lambda x: clean_special_chars(x, punct, punct_mapping))
train[['title', 'selftext']] = train[['title', 'selftext']].applymap(lambda x: expand_contractions(x, contraction_mapping))
vocab = build_vocab(train['title'] + " " + train['selftext'])
oov_glove = check_coverage(vocab, embed_glove)

found embeddings for 84.55% of vocab
found embeddings for 99.67% of all text


In [15]:
len(embed_glove)

2197177

In [16]:
len(vocab)

78321

In [17]:
oov_glove

[('TIFU', 140),
 ('Brexit', 100),
 ('vaxxers', 83),
 ('COVID', 73),
 ('clickbait', 72),
 ('vaxxer', 67),
 ('Covid', 39),
 ('sleevies', 37),
 ('spaghetto', 31),
 ('teethbrush', 26),
 ('vaxx', 23),
 ('Rhazim', 23),
 ('Denephew', 23),
 ('jalapeo', 22),
 ('incel', 20),
 ('hunat', 19),
 ('Rubbit', 19),
 ('Clenches', 18),
 ('prozzies', 17),
 ('Clickbait', 17),
 ('ABCDEFGHIJK', 17),
 ('labracadabrador', 15),
 ('justwater', 15),
 ('impasta', 15),
 ('irrelephant', 15),
 ('whalecum', 14),
 ('thicc', 14),
 ('NaBrO', 13),
 ('brexit', 13),
 ('tweetment', 13),
 ('neverlands', 13),
 ('Reintarnation', 13),
 ('Spaghetto', 13),
 ('Vaxxer', 12),
 ('Elephino', 12),
 ('1023MB', 12),
 ('K9P', 12),
 ('SJWs', 12),
 ('Labracadabrador', 12),
 ('WEREN', 11),
 ('Konichihuahua', 11),
 ('myshelf', 11),
 ('covfefe', 11),
 ('koalafications', 11),
 ('oinkment', 11),
 ('6ix9ine', 10),
 ('rubbit', 10),
 ('Homiecide', 10),
 ('incels', 10),
 ('knotsies', 10),
 ('CIEIO', 10),
 ('Riceless', 10),
 ('Whatll', 10),
 ('Vaxxers'

NameError: name 'model' is not defined

In [19]:
train.columns

Index(['type', 'id', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw',
       'created_utc', 'permalink', 'domain', 'url', 'selftext', 'title',
       'score', 'selftext_len', 'title_len', 'target'],
      dtype='object')