In [9]:
import pandas as pd
import numpy as np

In [11]:
df = pd.read_csv('Data/cleaned_data.csv')
df.head()

Unnamed: 0,label,sentences,lemmatized,tokenized_sentences
0,0,father dysfunctional selfish drag kid dysfunct...,"['father', 'dysfunctional', 'selfish', 'drag',...",['father dysfunctional selfish drag kid dysfun...
1,0,thanks lyft credit use cause offer wheelchair ...,"['thanks', 'lyft', 'credit', 'use', 'cause', '...",['thanks lyft credit use cause offer wheelchai...
2,0,bihday majesty,"['bihday', 'majesty']",['bihday majesty']
3,0,model love u take u time ur,"['model', 'love', 'u', 'take', 'u', 'time', 'ur']",['model love u take u time ur']
4,0,factsguide society motivation,"['factsguide', 'society', 'motivation']",['factsguide society motivation']


In [13]:
df['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

### Tokenizing with a `BertTokenizer`

Tweets will be tokenized to create two input tensors; input IDs, and attention mask.

Tensors will be contained within two numpy arrays, which will be of dimensions `len(df) * 512` - the `512` is the sequence length of tokenized sequences for BERT, and `len(df)` the number of samples in the dataset.

In [14]:
seq_len = 512
num_samples = len(df)

num_samples, seq_len

(31962, 512)

In [15]:
from transformers import BertTokenizer

In [16]:
# initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [17]:
# tokenize - returning Numpy tensors
tokens = tokenizer(df['tokenized_sentences'].to_list(), max_length=seq_len, truncation=True,
                   padding='max_length', add_special_tokens=True,
                   return_tensors='np')

In [18]:
tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [19]:
tokens['input_ids']

array([[101, 164, 112, ...,   0,   0,   0],
       [101, 164, 112, ...,   0,   0,   0],
       [101, 164, 112, ...,   0,   0,   0],
       ...,
       [101, 164, 112, ...,   0,   0,   0],
       [101, 164, 112, ...,   0,   0,   0],
       [101, 164, 112, ...,   0,   0,   0]])

In [20]:
tokens['attention_mask']

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

### Save keys generated by tokenizer as Numpy binary files:

In [11]:
with open('tweets_xids.npy', 'wb') as f:
    np.save(f, tokens['input_ids'])

with open('tweets_xmask.npy', 'wb') as f:
    np.save(f, tokens['attention_mask'])

In [12]:
# Delete the in-memory arrays to free up memory.
del tokens

In [13]:
arr = df['label'].values
arr, arr.shape

(array([0, 0, 0, ..., 0, 1, 0], dtype=int64), (31962,))

In [14]:
arr.max()+1

2

Extract label values and *one-hot* encode them into another numpy array, which will have the dimensions `len(df) * number of label classes`. 

In [15]:
# first extract label column
arr = df['label'].values

In [16]:
# Then initialize a zero array
labels = np.zeros((num_samples, arr.max()+1)) # arr.max()+1 because there are two unique labels and we need our labels array to have one column for each label
labels.shape

(31962, 2)

In [17]:
# use the current values in our `arr` of *\[0, 1]* to place `1` values in the correct positions of the zeros-only array:
labels[np.arange(num_samples), arr] = 1

labels

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])

Save one-hot encoded array to a Numpy binary file.

In [18]:
with open('twitter-labels.npy', 'wb') as f:
    np.save(f, labels)