In [34]:
import os
import csv
import time
import spacy
import pickle
import random
import numpy as np
import pandas as pd

import torch
import torch.optim as optim

# Use torchtext to load the train/test datasets and the pre-trained embeddings
from torchtext import data
from torchtext import datasets
from torchtext.data import TabularDataset

In [35]:
SEED = 1234

# Seed every random number generator this notebook imports, not just torch,
# so that results are repeatable after a kernel restart.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### Build a CSV from the raw text files so torchtext can load the training data

In [36]:
# Build the labelled training CSV: read the positive and negative tweet files,
# keep the first N_PER_CLASS lines of each, and write text/label columns.
N_PER_CLASS = 5000  # number of tweets kept from each class file


def load_tweets(path, limit=None):
    """Read a text file and return its lines, optionally truncated.

    Args:
        path: Path of the text file to read (one tweet per line).
        limit: Maximum number of leading lines to keep; None keeps all lines.

    Returns:
        A list of lines exactly as returned by ``readlines()``.
    """
    # The context manager closes the file even if reading raises.
    with open(path, "r") as handle:
        lines = handle.readlines()
    return lines if limit is None else lines[:limit]


print("Loading dataset...")
train_pos = load_tweets("datasets/train_pos_tidy.txt", N_PER_CLASS)
label_pos = ['1'] * len(train_pos)  # positive label: 1
print(len(train_pos), "positive tweets loaded")

train_neg = load_tweets("datasets/train_neg_tidy.txt", N_PER_CLASS)
label_neg = ['0'] * len(train_neg)  # negative label: 0
print(len(train_neg), "negative tweets loaded\n")

# Positives first, then negatives; labels are concatenated in the same order.
train_data = train_pos + train_neg
train_label = label_pos + label_neg

# The dictionary keys become the CSV column names.
dataframe = pd.DataFrame({'text': train_data, 'label': train_label})
# Save the DataFrame as CSV without the index column.
dataframe.to_csv("datasets/train.csv", index=False, sep=',')

Loading dataset...
5000 positive tweets loaded
5000 negative tweets loaded



### Build the training input with torchtext

In [37]:
# spaCy English pipeline; only its tokenizer is used below.
spacy_en = spacy.load('en')


def tokenizer(text):
    """Split `text` into a list of token strings using the spaCy tokenizer."""
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens


# TEXT: tokenized, lower-cased sequence field.
TEXT = data.Field(
    sequential=True,
    tokenize=tokenizer,
    lower=True,
)
# LABEL: single float value per example, used as-is (no vocabulary).
LABEL = data.Field(
    sequential=False,
    use_vocab=False,
    dtype=torch.float,
)

# Pair each CSV column name with the field that processes it.
trn_datafields = [("text", TEXT), ("label", LABEL)]

trn = TabularDataset(
    path="datasets/train.csv",  # training data csv path
    format='csv',
    skip_header=True,           # first row holds the column names
    fields=trn_datafields,
)

In [38]:
# Sanity-check the parsed dataset: attribute names, one full example,
# its token list, and the label of example 5000 (the first row written
# after the 5000 positive tweets).
first_example = trn[0]
print(vars(first_example).keys())
print(vars(trn.examples[0]))
print(first_example.text[:])
print(trn[5000].label[:])

dict_keys(['text', 'label'])
{'text': ['dunno', 'justin', 'read', 'mention', 'not', 'only', 'justin', 'and', 'god', 'know', 'but', 'hope', 'you', 'follow', '#', 'believe'], 'label': '1'}
['dunno', 'justin', 'read', 'mention', 'not', 'only', 'justin', 'and', 'god', 'know', 'but', 'hope', 'you', 'follow', '#', 'believe']
0


In [39]:
# Split data into training and validation sets (70% : 30%).
# random.seed() returns None, so it must not be passed as `random_state`;
# seed first, then hand the resulting generator state to split().
random.seed(SEED)
train_data, valid_data = trn.split(split_ratio=0.7, random_state=random.getstate())
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

Number of training examples: 7000
Number of validation examples: 3000


In [22]:
# Build the vocabulary from the training split (at most 25000 tokens) and
# load the pre-trained embedding vectors for it.
# The `vectors` name must be one torchtext recognises, e.g. 'charngram.100d'
# or 'fasttext.en.300d'.
TEXT.build_vocab(
    train_data,
    vectors="glove.twitter.27B.100d",
    max_size=25000,
)

In [40]:
# Report the vocabulary size; requires the build_vocab cell to have been run first.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
# LABEL is declared with use_vocab=False, so there is no LABEL vocabulary to print.

AttributeError: 'Field' object has no attribute 'vocab'

In [41]:
BATCH_SIZE = 64
# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def _example_length(example):
    """Sort key for the iterators: number of tokens in the example's text."""
    return len(example.text)


# One iterator per split, sharing the same batch size, device and sort key.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=_example_length,
    sort_within_batch=False,
    repeat=False,
)

In [25]:
import torch.nn as nn

class RNN(nn.Module):
    """Embedding -> multi-layer LSTM -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: LSTM hidden state size (per direction).
        output_dim: Number of output units of the final linear layer.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the embeddings, between LSTM
            layers, and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM only applies dropout between layers; passing a non-zero
        # value with a single layer just triggers a warning.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0)
        # The classifier input width follows the number of directions instead
        # of assuming a bidirectional LSTM.
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape [batch size, output dim].

        Args:
            x: Token indices of shape [sent len, batch size].
        """
        #x = [sent len, batch size]

        embedded = self.dropout(self.embedding(x))

        #embedded = [sent len, batch size, emb dim]

        output, (hidden, cell) = self.rnn(embedded)

        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]

        if self.bidirectional:
            #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            #only one direction: use the last layer's final hidden state
            hidden = hidden[-1, :, :]

        hidden = self.dropout(hidden)

        #hidden = [batch size, hid dim * num directions]

        # No squeeze(0) here: it would drop the batch dimension when the batch
        # holds a single example and break callers that squeeze dim 1.
        return self.fc(hidden)

In [26]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [27]:
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)

torch.Size([10198, 100])


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0738,  0.2296,  0.1619,  ..., -0.5719,  0.5669, -0.0974],
        ...,
        [ 0.4244,  0.1037, -1.3166,  ...,  0.4221,  0.4951,  0.2900],
        [-0.2551, -0.4357,  0.0260,  ..., -0.4315, -0.0373, -0.2629],
        [ 0.5168, -0.2330,  0.3913,  ...,  0.5349, -0.5232, -0.0990]])

In [28]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch, as a tensor
    (8 correct out of 10 gives 0.8, not 8).

    `preds` holds raw logits; `y` holds the 0/1 targets.
    """
    # Squash logits into probabilities, then round to hard 0/1 predictions.
    predicted_labels = torch.sigmoid(preds).round()
    # Float mask of matches so it can be summed and divided.
    matches = torch.eq(predicted_labels, y).float()
    return matches.sum() / len(matches)

In [29]:
class Instructor():
    """Bundles a model with its optimizer and loss for train / evaluate / save / load.

    Args:
        model: Module called as ``model(batch.text)``.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(predictions, batch.label)``.
    """

    def __init__(self,model, optimizer, criterion):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion

    def _run_epoch(self, iterator, training):
        """Run one pass over `iterator`; update weights only when `training` is true.

        Returns:
            Tuple (mean loss per batch, mean accuracy per batch).
        """
        epoch_loss = 0
        epoch_acc = 0

        for batch in iterator:
            if training:
                self.optimizer.zero_grad()

            # Use the model owned by this instance, not a notebook global.
            predictions = self.model(batch.text).squeeze(1)
            loss = self.criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            if training:
                loss.backward()
                self.optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def train(self,iterator):
        """Train for one epoch; returns (mean batch loss, mean batch accuracy)."""
        self.model.train()
        return self._run_epoch(iterator, training=True)

    def evaluate(self,iterator):
        """Evaluate without gradient tracking; returns (mean batch loss, mean batch accuracy)."""
        self.model.eval()
        with torch.no_grad():
            return self._run_epoch(iterator, training=False)

    def save(self,state,dir):
        """Write `state` to the file path `dir`, creating its parent directory if needed."""
        # Create the directory the checkpoint actually goes into, rather than
        # a hard-coded folder that may not match the requested path.
        parent = os.path.dirname(dir)
        if parent:
            os.makedirs(parent, exist_ok=True)
        torch.save(state, dir)
        print('--- Save last model state')

    def load(self,dir):
        """Restore model and optimizer state from the checkpoint file `dir`.

        The checkpoint is expected to hold the keys 'net', 'optimizer' and 'epoch'.

        Returns:
            The epoch to resume from (stored epoch + 1).
        """
        # Loading no longer creates a directory as a side effect; a missing
        # checkpoint is reported by torch.load itself.
        checkpoint = torch.load(dir)
        self.model.load_state_dict(checkpoint['net'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch'] + 1
        print('--- Load last model state')
        print('start epoch:',start_epoch)
        return start_epoch

In [30]:

# Move the model and loss to the target device first, then build the optimizer
# from the moved parameters (the order recommended by the torch.optim docs).
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters())
instructor = Instructor(model, optimizer, criterion)  # class Instructor for model processing

# First epoch number of the next training run; advanced by the training cell
# and overwritten when a checkpoint is loaded.
epoch_start = 1



In [31]:
N_EPOCHS = 15

# Epoch numbering continues from epoch_start, so re-running this cell (or
# running it after loading a checkpoint) carries on counting where it stopped.
for epoch in range(epoch_start, epoch_start + N_EPOCHS):
    # One optimisation pass over the training split ...
    train_loss, train_acc = instructor.train(train_iterator)
    # ... followed by a gradient-free pass over the validation split.
    valid_loss, valid_acc = instructor.evaluate(valid_iterator)

    print(f'| Epoch: {epoch:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

# Advance the starting epoch for the next run of this cell.
epoch_start += N_EPOCHS

| Epoch: 01 | Train Loss: 0.562 | Train Acc: 69.23% | Val. Loss: 0.497 | Val. Acc: 74.44% |
| Epoch: 02 | Train Loss: 0.478 | Train Acc: 75.97% | Val. Loss: 0.470 | Val. Acc: 76.11% |
| Epoch: 03 | Train Loss: 0.444 | Train Acc: 78.08% | Val. Loss: 0.477 | Val. Acc: 76.32% |
| Epoch: 04 | Train Loss: 0.417 | Train Acc: 79.57% | Val. Loss: 0.444 | Val. Acc: 78.63% |
| Epoch: 05 | Train Loss: 0.386 | Train Acc: 81.56% | Val. Loss: 0.440 | Val. Acc: 79.37% |
| Epoch: 06 | Train Loss: 0.354 | Train Acc: 83.68% | Val. Loss: 0.445 | Val. Acc: 79.07% |
| Epoch: 07 | Train Loss: 0.326 | Train Acc: 85.45% | Val. Loss: 0.436 | Val. Acc: 79.00% |
| Epoch: 08 | Train Loss: 0.299 | Train Acc: 86.70% | Val. Loss: 0.438 | Val. Acc: 79.89% |
| Epoch: 09 | Train Loss: 0.286 | Train Acc: 87.52% | Val. Loss: 0.470 | Val. Acc: 79.73% |
| Epoch: 10 | Train Loss: 0.264 | Train Acc: 88.72% | Val. Loss: 0.528 | Val. Acc: 77.98% |
| Epoch: 11 | Train Loss: 0.225 | Train Acc: 90.55% | Val. Loss: 0.483 | Val. Ac

In [24]:
# Save last model state
# epoch_start always points one past the last completed epoch (set by the
# training cell and by instructor.load), so derive the saved epoch from it
# instead of relying on the training loop's leftover `epoch` variable.
state = {'net': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch_start - 1}
# Named checkpoint_path rather than `dir` to avoid shadowing the builtin.
checkpoint_path = './model_trained/LSTM_version_2.'  # model path
instructor.save(state, checkpoint_path)

--- Save last model state


In [38]:
# Load last model state
# NOTE(review): this loads 'LSTM_version_1.' while the save cell writes
# 'LSTM_version_2.' — confirm which checkpoint is meant to be resumed.
# Named checkpoint_path rather than `dir` to avoid shadowing the builtin.
checkpoint_path = './model_trained/LSTM_version_1.'  # model path
epoch_start = instructor.load(checkpoint_path)

--- Load last model state
start epoch: 6


In [20]:
# Test prediction: load the raw test tweets (one "<id>,<text>" line each).
print("Loading dataset...")
# The context manager closes the file even if reading raises.
with open("datasets/test_data.txt", "r") as fp:
    test_data = fp.readlines()
print(len(test_data), "test tweets loaded")

def predict_sentiment(sentence):
    """Classify one sentence with the trained model.

    Args:
        sentence: Raw text of a single tweet.

    Returns:
        1 when the predicted probability is above 0.5, otherwise -1.

    Side effects:
        Puts the global `model` into eval mode.
    """
    # Reuse the TEXT field's own preprocessing (its tokenizer and lower-casing)
    # so inference sees tokens in the same form as training, and avoid
    # reloading the spaCy model on every call.
    tokens = TEXT.preprocess(sentence)
    if not tokens:
        # The LSTM cannot consume an empty sequence; fall back to one unknown token.
        tokens = [TEXT.unk_token]
    indexed = [TEXT.vocab.stoi[t] for t in tokens]

    # Shape [sent len, 1]: a batch holding a single sentence.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)

    # Disable dropout and gradient tracking so predictions are deterministic.
    model.eval()
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))

    return 1 if prediction.item() > 0.5 else -1

Loading dataset...
10000 test tweets loaded


In [22]:
out_dir = './out/'
# Make sure the output directory exists before opening a file inside it.
os.makedirs(out_dir, exist_ok=True)

# File name carries the month, day and time of day taken from asctime().
localtime = time.asctime(time.localtime(time.time()))
submission_path = out_dir + "submission " + localtime[4:-5] + ".csv"
fieldnames = ['Id', 'Prediction']

# newline='' is what the csv module expects for files it writes; the context
# manager closes the file even if a prediction raises part-way through.
with open(submission_path, "w", newline='') as fp:
    writer = csv.DictWriter(fp, fieldnames=fieldnames)
    writer.writeheader()

    print("Generating predictions...\n")

    for tweet in test_data:
        i, t = tweet.split(",", maxsplit=1)  # Splitting the index from the tweet text
        prediction = predict_sentiment(t)
        writer.writerow({'Id': str(i), 'Prediction': str(prediction)})

print("Done.")

Generating predictions...

Done.


In [2]:
# Diagnostic: show whether torch can see a CUDA device in this kernel.
torch.cuda.is_available()

True