In [1]:
import torch
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('Main.csv')

In [3]:
# Assign columns to variables
y = data['class']
y = y.values
x = data['tweet']


In [4]:
# y = np.zeros((Y.shape[0],3))
# for i,output in enumerate(Y):
#     if output==0:
#         y[i][0]=1
#     elif output==1:
#         y[i][1]=1
#     else:
#         y[i][2]=1


In [5]:
y[0:10]

array([2, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Windows\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import re

In [8]:
REPLACE_BY_SPACE_RE = re.compile('[(){}\[\]\|,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z<USR><URL><RT> #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    tokens = []
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
#     text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join([word for word in text.split() if word not in STOPWORDS]) # delete stopwors from text
    
    for token in text.split():
        if token.startswith('@'):
                    token = '<USR>'
        elif token.startswith('http://') or token.lower().startswith('https://'):
                    token = '<URL>'
        elif token=="rt":
            token='<RT>'
        token = BAD_SYMBOLS_RE.sub('',token)
        tokens.append(token)
    return tokens

In [9]:
x = [text_prepare(str(line)) for line in x]

In [10]:
x[0:10]

[['',
  '<RT>',
  '<USR>',
  'woman',
  'complain',
  'cleaning',
  'house',
  'amp',
  'man',
  'always',
  'take',
  'trash',
  'out'],
 ['',
  '<RT>',
  '<USR>',
  'boy',
  'dats',
  'coldtyga',
  'dwn',
  'bad',
  'cuffin',
  'dat',
  'hoe',
  '1st',
  'place'],
 ['',
  '<RT>',
  '<USR>',
  'dawg',
  '<RT>',
  '<USR>',
  'ever',
  'fuck',
  'bitch',
  'start',
  'cry',
  'confused',
  'shit'],
 ['', '<RT>', '<USR>', '<USR>', 'look', 'like', 'tranny'],
 ['',
  '<RT>',
  '<USR>',
  'shit',
  'hear',
  'might',
  'true',
  'might',
  'faker',
  'bitch',
  'told',
  'ya',
  '#57361'],
 ['t_madison_x',
  'shit',
  'blows',
  'meclaim',
  'faithful',
  'somebody',
  'still',
  'fucking',
  'hoes',
  '#128514',
  '#128514',
  '#128514',
  ''],
 ['__brighterdays',
  'sit',
  'hate',
  'another',
  'bitch',
  '',
  'got',
  'much',
  'shit',
  'going',
  'on'],
 ['#8220',
  '<USR>',
  'cause',
  'im',
  'tired',
  'big',
  'bitches',
  'coming',
  'us',
  'skinny',
  'girls#8221'],
 ['', 'a

In [11]:
from sklearn.datasets.samples_generator import make_blobs
from numpy import where
from matplotlib import pyplot
# generate dataset
X, Y = make_blobs(n_samples=1000, centers=3, n_features=2, cluster_std=2, random_state=2)
# select indices of points with each class label
for i in range(3):
	samples_ix = where(Y == i)
	pyplot.scatter(X[samples_ix, 0], X[samples_ix, 1])
pyplot.show()

<Figure size 640x480 with 1 Axes>

In [12]:
from collections import defaultdict

In [13]:
def build_dict(tokens):
    tok2idx = defaultdict(lambda: 0)
    idx2tok = defaultdict(lambda: 0)

    vocab = set([t for singleTweet in tokens for t in singleTweet])

    tok2idx['<UNK>'] = 0
    idx2tok[0] = '<UNK>' 
    tok2idx['<PAD>'] = 1
    idx2tok[1] = '<PAD>'
    
    for i, token in enumerate(vocab, 2):
        tok2idx[token] = i
        idx2tok[i] = token
        
    
    return tok2idx, idx2tok

In [14]:
token_to_index, index_to_token = build_dict(x)

In [15]:
tweet_ints = []
for tweet in x:
    tweet_ints.append([token_to_index[str(word[1:-2])] for word in str(tweet).split()])

In [16]:
def pad_features(tweet_ints, seq_length):
    
    features = np.ones([len(tweet_ints), seq_length], dtype=np.int32) * token_to_index['<PAD>']

    # for each review, I grab that review and 
    for i, row in enumerate(tweet_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

In [17]:
x = pad_features(tweet_ints,20) # Padding token index = 1.

In [18]:
len(x)

24783

In [19]:
split_frac = 0.8
split_idx = int(len(x)*split_frac)

train_x, remaining_x = x[:split_idx], x[split_idx:]
train_y, remaining_y = y[:split_idx], y[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of the resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(19826, 20) 
Validation set: 	(2478, 20) 
Test set: 		(2479, 20)


In [20]:
from torch.utils.data import TensorDataset, DataLoader

In [21]:
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

batch_size = 64

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [22]:
data_iter = iter(train_loader)
sample_x, sample_y = data_iter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([64, 20])
Sample input: 
 tensor([[    1,     1,     1,  ..., 16389,  1026, 18114],
        [    1,     1,     1,  ..., 22661, 22227, 12941],
        [    0, 13944, 12824,  ..., 10517, 24704, 13825],
        ...,
        [    0,   389,  3649,  ..., 21656, 15628, 15628],
        [    1,     1,     1,  ..., 24483,  3112, 19225],
        [    1,     1,     1,  ...,  9637,  1853,  1451]], dtype=torch.int32)

Sample label size:  torch.Size([64])
Sample label: 
 tensor([1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 1, 0, 2, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
        1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1])


In [23]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [24]:
import torch.nn as nn
import torch.nn.functional as F


In [76]:
class HateRNN(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_size, hidden_size, n_layers, drop_prob=0.8):
        
        super(HateRNN, self).__init__()
        
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, dropout = drop_prob, batch_first = True)
        
        self.dropout = nn.Dropout(0.5)
        
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        
        # Create embeddings and pass them through the LSTM cell
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        # Stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_size)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
#         out = F.relu(out)
        out = self.softmax(out)
        
        out = out.view(batch_size,-1)
        out = out[:,-1]
        
        return out, hidden
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_size,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_size).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_size).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_size).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_size).zero_())
        
        return hidden
        




In [77]:
vocab_size = len(token_to_index)
output_size = 3
embedding_size = 512
hidden_size = 256
n_layers = 2
drop_prob = 0.8

net = HateRNN(vocab_size, output_size, embedding_size, hidden_size, n_layers, drop_prob)

print(net)

HateRNN(
  (embedding): Embedding(29074, 512)
  (lstm): LSTM(512, 256, num_layers=2, batch_first=True, dropout=0.8)
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (softmax): Softmax()
)


In [78]:
# loss and optimization functions
lr=0.001
epochs = 4
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


In [79]:
counter = 0
clip = 5
print_every = 100

if(train_on_gpu):
    net.cuda()

net.train()
for e in range(epochs):
    h = net.init_hidden(batch_size)
    
    # Batch Loop
    for inputs, labels in train_loader:
        counter+=1
        
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()
        
        # We create a new variable so we don't backprop through the entire history of h 
        h = tuple([each.data for each in h])
        
        # Zero accumulated grads
        net.zero_grad()
        
        # Get the output from the model
        output, h = net(inputs, h)
        
        # Get the loss and backprop
        print(output.shape)
        print(labels.shape)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # Loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
        




torch.Size([64])
torch.Size([64])


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)