In [22]:
# Imports
# General
import pandas as pd
import numpy as np
from copy import deepcopy

# NLP
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Charts
import matplotlib.pyplot as plt

# Pytorch - ML
import torch
import torch.nn as nn

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\48694\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
_train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

split_part = int(len(_train_df) * 0.85)
val_df = _train_df[split_part:]
train_df = _train_df[:split_part]

train_df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
27162,27163,0,#model i love u take with u all the time in ...
27163,27164,0,how to create the life you want #success #lif...
27164,27165,0,be being you! thank you @user for this inspi...
27165,27166,0,maggie is just as as me for my #allages #cdr...


In [3]:
train_df['label'].value_counts()

label
0    25265
1     1902
Name: count, dtype: int64

# NLP

In [4]:
# Load word vectors
words = dict()

def load_embedding(dictionary: dict, filename):
    with open(filename, 'r', encoding='utf-8') as f:
        for lines in f.readlines():
            line = lines.split(' ')
            # print(f'WORD: {line[0]}')
            # print(f'EMB: {line[1:]} \nTYPE: {type(line[1:])}')
            # break
            try:
                dictionary[line[0]] = np.array(line[1:], dtype=float)
            except:
                continue
                
load_embedding(words, 'glove.6B.100d.txt')
VEC_DIM = words['the'].shape[0]
print(len(words))
print(f'Sequence length: {VEC_DIM}')

400000
Sequence length: 100


In [5]:
# Test sentence
sentence_ = 'bihday your majesty'

# Split sentence into words
tokenizer = nltk.RegexpTokenizer(r'\w+') # Split on words
print(f'TOKENS: {tokenizer.tokenize(sentence_)}')

# Take stem of a word
lemmatizer = WordNetLemmatizer()

def to_tokens(sentences: str, vector_words: dict=words) -> list:
    """
    Splits string into list of strings and standardized every word.
    """
    t = tokenizer.tokenize(sentences)
    t_lower = [s.lower() for s in t]
    t_lem = [lemmatizer.lemmatize(s) for s in t_lower]
    t = [s for s in t_lem if s in words]
    return t

print(f'STANDARD TOKENS: {to_tokens(sentence_)}')


TOKENS: ['bihday', 'your', 'majesty']
STANDARD TOKENS: ['your', 'majesty']


In [23]:
# Word embed
def embed(sentences: str, vector_words: dict=words) -> np.array:
    """
    Transforms sequences into list of embedded words.
    """
    tokens = to_tokens(sentences)
    vectors = []

    for token in tokens:
        if token not in vector_words:
            continue

        token_vector = vector_words[token]
        vectors.append(token_vector)

    return torch.tensor(vectors, dtype=torch.float32).unsqueeze(0)

print(embed(sentence_).shape)
print(embed(sentence_))

torch.Size([1, 2, 100])
tensor([[[-0.5718,  0.0463,  0.8673, -0.5903, -0.6493,  0.6588, -0.8279,
           0.2257, -0.0198,  0.2122,  0.3753,  0.1748,  0.2859,  0.2292,
          -0.1048, -0.3686, -0.0976,  0.5299, -0.0240,  0.4108, -0.6807,
          -0.1125, -0.3501, -0.2945,  0.3823,  0.9398, -0.7363, -1.0142,
           0.6061, -0.4646,  0.8398,  1.2551,  0.4999,  0.0418, -0.3125,
           0.3882, -0.6621,  0.0026,  0.6252, -0.7410,  0.5719, -0.3599,
           0.2182, -0.7801, -0.7755,  0.1330, -0.9260, -0.3226, -0.0252,
          -1.3961,  0.2293,  0.2695, -0.1612,  0.8521, -0.0786, -2.2859,
           0.8719,  0.2970,  2.3800, -0.1992,  0.5504,  1.6426, -0.3106,
          -0.0293,  0.9394,  0.5702,  0.5862, -0.2594,  0.3897, -0.5606,
           0.0991,  0.4679,  0.5448, -0.9599,  0.6297,  0.4391, -0.2858,
          -0.4836, -0.4151,  0.1803,  0.4332,  0.5131, -0.3704, -0.1585,
          -1.5992, -0.2138, -0.1418, -0.1423, -0.1247, -0.0529, -0.6274,
           0.8598,  1.2702,

  return torch.tensor(vectors, dtype=torch.float32).unsqueeze(0)


In [7]:
# Transform into X and y for ML
def transform_X_y(df: pd.DataFrame):
    """
    Takes dataframe and splits it into arguments and targets for ML algorithm.
    """
    y = df['label'].to_numpy().astype(int)

    X = []
    for message in df['tweet']:
        vectorized_message = embed(message)

        if vectorized_message.shape[0] == 0:
            vectorized_message = np.zeros(shape=(1, VEC_DIM)) # for now it is as single word

        X.append(vectorized_message)

    return X, np.array(y)


X, y = transform_X_y(train_df)

In [8]:
# Create padding to make sequences the same length
def pad_X(ls: list, length: int):
    ls_copy = deepcopy(ls)

    for i, x in enumerate(ls):
        x_len = x.shape[0]

        if x_len > length:
            ls_copy[i] = x[:length]
        else:
            seq_diff = length - x_len
            paddding = np.zeros(shape=(seq_diff, VEC_DIM))
            ls_copy[i] = np.concatenate([x, paddding])
        
    return np.array(ls_copy).astype(float)

X = pad_X(X, 57)
print("X:", X.shape)
print("y:", y.shape)

X: (27167, 57, 100)
y: (27167,)


In [11]:
# Validation
X_val, y_val = transform_X_y(val_df)
X_val = pad_X(X_val, 57)

# # Test ---- Test do not have label column
# X_test, y_val = transform_X_y(test_df)
# X_test = pad_X(X_test, 57)


# ML Model

In [45]:
class Sentiment(nn.Module):
    def __init__(self, embedding_dim, hidden, layers, dropout):
        super().__init__()

        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden,
                            num_layers=layers,
                            dropout=dropout)
        self.linear = nn.Linear(in_features=hidden, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = embed(x)
        out, hidden = self.lstm(out)
        out = self.linear(out[:, -1, :])
        return self.sigmoid(out)
        
model = Sentiment(100, 64, 4, 0.2)

torch.Size([1, 1])
tensor([[0.5039]], grad_fn=<SigmoidBackward0>)
