In [52]:
# Imports
# General
import pandas as pd
import numpy as np

# NLP
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Pytorch - ML


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\48694\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [38]:
# Read the train and test CSV files from the working directory in one pass,
# then display the training frame as the cell output.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
train_df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [39]:
# Count how many rows carry each value of the `label` column.
train_df.loc[:, 'label'].value_counts()

label
0    29720
1     2242
Name: count, dtype: int64

# NLP

In [49]:
# Vocabulary container: filled in place with token -> embedding vector entries
words = {}

def load_embedding(dictionary: dict, filename) -> None:
    """Fill ``dictionary`` in place with word vectors read from a text file.

    Each line is expected to hold a token followed by its vector components,
    all separated by single spaces (``token v1 v2 ... vn``).

    Args:
        dictionary: Mapping that receives ``token -> np.ndarray`` (float)
            entries. Existing entries with the same token are overwritten.
        filename: Path of the UTF-8 encoded embedding file.

    Returns:
        None. The result is delivered through ``dictionary``.

    Notes:
        Lines without any vector component (blank lines, a lone token) and
        lines whose components cannot be parsed as floats are skipped, so a
        few malformed rows do not abort the whole load.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate over the file object so the whole file is never held in
        # memory at once (embedding files can be very large).
        for raw_line in f:
            # Drop only the line terminator, and split on a plain space: a
            # bare .split() could mangle tokens that are themselves
            # whitespace-like characters.
            token, *values = raw_line.rstrip('\n').split(' ')
            if not values:
                # Blank or token-only line: there is no vector to store.
                continue
            try:
                dictionary[token] = np.array(values, dtype=float)
            except ValueError:
                # Non-numeric component (e.g. a token containing a space
                # shifts the columns): skip this row, keep loading the rest.
                continue
                
# Fill the module-level `words` mapping from the GloVe text file and report
# how many tokens were loaded.
# NOTE(review): a later recorded output shows 50-dimensional vectors, so the
# kernel may last have loaded a different GloVe file than the one named here
# -- confirm with Restart & Run All.
load_embedding(dictionary=words, filename='glove.6B.100d.txt')
print(len(words))

400000


In [53]:
# Sample text used to sanity-check the preprocessing steps
sentence_ = 'bihday your majesty'

# Regex tokenizer: each run of word characters (\w+) becomes one token
tokenizer = nltk.RegexpTokenizer(r'\w+')

# WordNet lemmatizer: reduces a word to its dictionary (lemma) form
lemmatizer = WordNetLemmatizer()

# Show the raw tokens of the sample before any normalisation
print(f'TOKENS: {tokenizer.tokenize(sentence_)}')

def to_tokens(sentences: str, vector_words: dict=words) -> list:
    """Normalise text into the tokens that have an entry in ``vector_words``.

    The text is split with the module-level ``tokenizer``, each token is
    lower-cased and lemmatised with the module-level ``lemmatizer``, and
    tokens missing from ``vector_words`` are dropped.

    Args:
        sentences: Raw text to tokenise.
        vector_words: Vocabulary used for filtering; only membership of its
            keys is checked. The default is the ``words`` object that exists
            when this definition is executed, so in-place updates to that
            dict are seen here, while rebinding ``words`` to a new object
            requires re-running this cell.

    Returns:
        List of normalised tokens, in their original order, that are present
        in ``vector_words``.
    """
    lowered = (token.lower() for token in tokenizer.tokenize(sentences))
    lemmas = (lemmatizer.lemmatize(token) for token in lowered)
    # Filter against the vocabulary that was passed in rather than the
    # global one, so a caller-supplied mapping is actually honoured.
    return [lemma for lemma in lemmas if lemma in vector_words]

# Run the full normalisation on the sample sentence; tokens that are not in
# the loaded vocabulary are dropped by to_tokens.
print(f'STANDARD TOKENS: {to_tokens(sentence_)}')


TOKENS: ['bihday', 'your', 'majesty']
STANDARD TOKENS: ['your', 'majesty']


In [55]:
# Word embed
def embed(sentences: str, vector_words: dict=words) -> np.ndarray:
    """Convert text into a matrix of word vectors, one row per known token.

    Args:
        sentences: Raw text to embed.
        vector_words: Mapping from token to its embedding vector. The default
            is the ``words`` object that exists when this definition is
            executed.

    Returns:
        A 2-D float array with one row per token that is returned by
        ``to_tokens`` and present in ``vector_words``, in text order. When no
        token qualifies, an empty array of shape ``(0, dim)`` is returned,
        where ``dim`` is the length of the first vector stored in
        ``vector_words`` (0 if the mapping is empty), so the result is always
        two-dimensional.
    """
    # Forward the caller's vocabulary so a custom mapping is used end to end.
    tokens = to_tokens(sentences, vector_words)

    # Membership is re-checked here so the lookup can never raise KeyError,
    # whatever filtering to_tokens applied.
    vectors = [vector_words[token] for token in tokens if token in vector_words]

    if not vectors:
        # np.array([]) would be 1-D with shape (0,); build an explicit
        # (0, dim) matrix so callers can stack or pad it like any other result.
        sample_vector = next(iter(vector_words.values()), ())
        return np.empty((0, len(sample_vector)), dtype=float)

    return np.array(vectors, dtype=float)

# Sanity check on the sample sentence: matrix shape first, then the values.
print(embed(sentence_).shape, embed(sentence_), sep='\n')

(2, 50)
[[-2.9163e-02  8.1769e-01  3.8470e-01 -7.7857e-01  1.1049e+00 -1.3655e-01
  -2.4691e-02 -5.1103e-02  7.7950e-01  5.1357e-02 -3.5748e-01  1.1748e+00
  -9.8244e-02  3.3111e-01  4.0426e-01  5.8685e-01 -6.2536e-01  9.4833e-02
   9.7024e-01 -1.1437e+00  1.3826e-01  2.8136e-01  4.6693e-01  3.5226e-01
   6.8916e-01 -1.9819e+00 -1.4000e+00  1.7001e-01  1.5929e+00 -1.0086e+00
   3.6499e+00  1.3949e+00 -7.8823e-01  4.0404e-01 -3.6925e-01  7.3075e-01
   2.7513e-02 -1.1993e-01  7.3716e-01 -1.0365e+00  6.8659e-01 -3.0294e-01
  -5.5175e-01  9.6466e-01  5.3103e-02 -8.4807e-02  8.5120e-01 -5.4186e-01
   3.2453e-01  5.8425e-01]
 [ 1.6999e-01  9.4964e-01 -1.1559e+00 -6.6555e-01  6.5813e-01 -1.0987e+00
   8.3952e-01  6.2359e-01  3.4939e-01 -6.4611e-01  2.7352e-01  1.7612e+00
   4.6555e-01 -1.6568e-01 -4.8375e-02  3.3241e-01 -7.8166e-01  4.0905e-01
   2.1636e-01  1.8103e-01  8.2853e-01  5.2009e-01  3.6097e-02 -5.1258e-01
   3.1427e-01 -1.2165e+00 -1.0347e+00  3.7987e-02 -8.5888e-02  6.2726e-01
   