# Workflow:
#### 1. Import Data
#### 2. Prepare the input data
#### 3. Import pre-trained W2V
#### 4. Create Neural Network Pipeline
#### 5. Train The Model
#### 6. Evaluate results

<br>
____________________________________________________________________________________________________________________________

### 1. Import Data

In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train_path = r"https://raw.githubusercontent.com/lukasgarbas/nlp-text-emotion/master/data/data_train.csv"
test_path = r"https://raw.githubusercontent.com/lukasgarbas/nlp-text-emotion/master/data/data_test.csv"

In [3]:
data_train = pd.read_csv(train_path, encoding='utf-8')
data_test = pd.read_csv(test_path, encoding='utf-8')

#### Checking Data

In [4]:
data_train.head(2)

Unnamed: 0,Emotion,Text
0,neutral,There are tons of other paintings that I thin...
1,sadness,"Yet the dog had grown old and less capable , a..."


In [5]:
data_test.head(2)

Unnamed: 0,Emotion,Text
0,sadness,I experienced this emotion when my grandfather...
1,neutral,"when I first moved in , I walked everywhere ...."


#### Chekcing Null Values

In [6]:
data_train.isna().sum()

Emotion    0
Text       0
dtype: int64

In [7]:
data_test.isna().sum()

Emotion    0
Text       0
dtype: int64

#### Value Counts

In [8]:
data_train.Emotion.value_counts()

sadness    1641
joy        1619
neutral    1616
anger      1566
fear       1492
Name: Emotion, dtype: int64

In [9]:
data_test.Emotion.value_counts()

joy        707
anger      693
fear       679
sadness    676
neutral    638
Name: Emotion, dtype: int64

#### Train and Test

In [10]:
X_train = data_train.Text
X_test = data_test.Text

y_train = data_train.Emotion
y_test = data_test.Emotion

#### Merging Train and Test data

In [11]:
data = data_train.append(data_test, ignore_index=True)
data.head()

Unnamed: 0,Emotion,Text
0,neutral,There are tons of other paintings that I thin...
1,sadness,"Yet the dog had grown old and less capable , a..."
2,fear,When I get into the tube or the train without ...
3,fear,This last may be a source of considerable disq...
4,anger,She disliked the intimacy he showed towards so...


#### Variable Initialization

In [12]:
# Number of Labels: joy, anger, fear, sadness, neutral
num_classes = 5

# Number of dimenstion for word embedding
embed_num_dims = 300

# Max input length (max num of words)
max_seq_len = 800

class_names = ['joy', 'anger', 'fear', 'sadness', 'neutral']

<br>
____________________________________________________________________________________________________________________________

### 2. Prepare Input Data

- Tokenize our texts and count unique tokens
- Padding: each input (sentence or text) has to be of the same lenght
- Labels have to be converted to integeres and categorized

In [13]:
from nltk.tokenize import word_tokenize
def clean_data(data):
    
    # Removing the unwanted @ and #
    data = re.sub(r"(#[\d\w\.]+)", '', data)
    data = re.sub(r"(@[\d\w\.]+)", '', data)
    
    # tekenization using nltk
    data = word_tokenize(data)
    
    return data

In [14]:
texts = [' '.join(clean_data(text)) for text in data.Text]

texts_train = [' '.join(clean_data(text)) for text in X_train]
texts_test = [' '.join(clean_data(text)) for text in X_test]

In [15]:
print(texts_train[100])

Playing NOW on Hardest : BYZPO Radio Show Session Tune in , listen and enjoy .


#### Tokenization + fitting using keras

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

tok = Tokenizer()
tok.fit_on_texts(texts)

seq_train = tok.texts_to_sequences(texts_train)
seq_test = tok.texts_to_sequences(texts_test)

index_of_words = tok.word_index

# Vocab size is number of unique words + reserved 0 index of padding
voc_size = len(index_of_words)+1

print(f"Number of unique words:{len(index_of_words)}")

Number of unique words:12087


#### Padding: each input has the same length 

In [17]:
X_train_pad = pad_sequences(seq_train, maxlen=max_seq_len)
X_test_pad = pad_sequences(seq_test, maxlen=max_seq_len)

In [18]:
X_train_pad

array([[    0,     0,     0, ...,   119,    51,   345],
       [    0,     0,     0, ...,    37,   277,   154],
       [    0,     0,     0, ...,    16,     2,  1210],
       ...,
       [    0,     0,     0, ...,   876,     4,   909],
       [    0,     0,     0, ...,     1,     6,   117],
       [    0,     0,     0, ..., 10258,   173,    13]])

In [19]:
X_test_pad

array([[    0,     0,     0, ...,   397,   141,   120],
       [    0,     0,     0, ...,   172,   663, 10259],
       [    0,     0,     0, ...,     5,   389,   582],
       ...,
       [    0,     0,     0, ...,    12,   194,    23],
       [    0,     0,     0, ...,   106,    16,    59],
       [    0,     0,     0, ...,     9,     2,   534]])

#### Categorize Labels

In [20]:
encoding = {'joy':0, 'fear': 1, 'anger': 2, 'sadness': 3, 'neutral': 4}

# Integer Lables
y_train = [encoding[x] for x in data_train.Emotion]
y_test = [encoding[x] for x in data_test.Emotion]

In [21]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [22]:
y_train

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [23]:
y_test

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

### Import pretrained word vectors
- Importing pretrained word2vec from file and creating embedding matrix
- We will later map each word in our corpus to existing word vector

In [24]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    voc_size = len(word_index)+1 # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((voc_size, embedding_dim))
    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float)[:embedding_dim]
    return embedding_matrix

In [42]:
# import urllib.request
# import zipfile
# import os

# fname = 'wiki-news-300d-1M.vec'

# if not os.path.isfile(fname):
#     print('Downloading word vectors...')
#     urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip',
#                               'wiki-news-300d-1M.vec.zip')
#     print('Unzipping...')
#     with zipfile.ZipFile('wiki-news-300d-1M.vec.zip', 'r') as zip_ref:
#         zip_ref.extractall('embeddings')
#     print('done.')
    
#     os.remove('wiki-news-300d-1M.vec.zip')

In [41]:
embedd_matrix = create_embedding_matrix(fname, index_of_words, embed_num_dims)
embedd_matrix.shape

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 8044: character maps to <undefined>