In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import warnings
import pickle
import time
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_validate, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

warnings.filterwarnings('ignore')

In [4]:
# Load data
train = pd.read_csv('../dsets/nlp_tweet/train.csv', index_col='id')
test = pd.read_csv('../dsets/nlp_tweet/test.csv', index_col='id')

In [5]:
print(train.shape, test.shape)

(7613, 4) (3263, 3)


In [6]:
keyword_train = train.keyword.unique()[1:]
keyword_test = test.keyword.unique()[1:]

In [7]:
keyword_train = list(map(lambda x: x.replace('%20', ' '), keyword_train))
keyword_test = list(map(lambda x: x.replace('%20', ' '), keyword_test))
keywords = pd.DataFrame({'train': keyword_train, 'test': keyword_test})

In [8]:
train.drop(['keyword', 'location'], inplace=True, axis=1)
test.drop(['keyword', 'location'], inplace=True, axis=1)

In [9]:
wnl = WordNetLemmatizer()

def normalize_text(text):
    text = text.lower()
    to_rem = re.compile(r'https?://\S+|www\.\S+|[^A-Za-z0-9]|\s+')
    rem_space = re.compile(r'\s{2,}')
    return rem_space.sub(' ', to_rem.sub(' ', text))

def lemmatize_sentence(sentence):
    words = sentence.split(' ')
    new_sentence_word = list()

    for word in words:
        new_word = wnl.lemmatize(word, wordnet.VERB)
        new_sentence_word.append(new_word)

    new_sentence = ' '.join(new_sentence_word)
    new_sentence = new_sentence.strip()

    return new_sentence

def get_iterator(dataset, batch_size, train=True, shuffle=True, repeat=False):
    dataset = Make_Dataset(dataset)
    dataset_iter = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataset_iter

def prepare_csv(train, test, seed=42, ratio=0.2):
    idx = np.arange(train.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idx)

    val_size = int(len(idx)*ratio)

    train.iloc[idx[val_size:], :][['target', 'text']].to_csv('./prep_train.csv')

    train.iloc[idx[:val_size], :][['target', 'text']].to_csv('./prep_val.csv')

    test.to_csv('./test.csv')

In [10]:
train['text'] = train['text'].apply(lambda x: normalize_text(x))
test['text'] = test['text'].apply(lambda x: normalize_text(x))

train['text'] = train['text'].apply(lambda x: lemmatize_sentence(x))
test['text'] = test['text'].apply(lambda x: lemmatize_sentence(x))

In [11]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable

from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from collections import Counter
    
from torchtext.vocab import Vectors, GloVe, Vocab

In [12]:
class Make_Dataset(Dataset):
    def __init__(self, csv_file, transform=None, target_transform=None):
        self.data = csv_file
        self.text_data = csv_file['text']
        self.target = csv_file['target']
        self.transform = transform
        self.target_transform = target_transform
    
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, item):
        return self.data['text'].iloc[item], self.data['target'].iloc[item]
        

In [13]:
train

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,our deeds be the reason of this earthquake may...,1
4,forest fire near la ronge sask canada,1
5,all residents ask to shelter in place be be no...,1
6,13 000 people receive wildfires evacuation ord...,1
7,just get send this photo from ruby alaska as s...,1
...,...,...
10869,two giant crane hold a bridge collapse into ne...,1
10870,aria ahrary thetawniest the out of control wil...,1
10871,m1 94 01 04 utc 5km s of volcano hawaii,1
10872,police investigate after an e bike collide wit...,1


In [14]:
words = []
stop_word = set(stopwords.words('english'))

for text in train['text']:
    tokenized = word_tokenize(text)
    for i in tokenized:
        if i not in stop_word and i not in ['1','2','3','4','5','6','7','8','9','0']:
            words.append(i)

In [15]:
corpus = Counter(words)
corpus_ = sorted(corpus,key=corpus.get,reverse=True)
ohe = {w:i+1 for i,w in enumerate(corpus_)}

In [16]:
final_list_train = []
final_list_test = []

for sent in train['text']:
    final_list_train.append([ohe[word] for word in sent.split() if word in ohe.keys()])

for sent in test['text']:
    final_list_test.append([ohe[word] for word in sent.split() if word in ohe.keys()])

In [17]:
def padding_(sentences, seq_len):
    for i, text in enumerate(sentences):
        
        features = np.zeros(seq_len,dtype=int)
        for j, t in enumerate(text):
            features[j] = t
        sentences[i] = features
    return sentences

In [18]:
padded_train = padding_(final_list_train, 200)
padded_test = padding_(final_list_test, 200)

In [19]:
Xtrain_tensor = torch.LongTensor(padded_train)
Ytrain_tensor = torch.LongTensor(np.array(train['target']))

val_x = Xtrain_tensor[-13:]
val_y = Ytrain_tensor[-13:]

Xtrain_tensor = Xtrain_tensor[:-13]
Ytrain_tensor = Ytrain_tensor[:-13]

test_tensor = torch.LongTensor(padded_test)

In [20]:
len(Xtrain_tensor)

7600

In [21]:
from torch.utils.data import TensorDataset
train_loader = DataLoader(TensorDataset(Xtrain_tensor, Ytrain_tensor), batch_size=50, shuffle=True)
test_loader = DataLoader(TensorDataset(test_tensor), batch_size=1, shuffle=True)
valid_loader = DataLoader(TensorDataset(Xtrain_tensor, Ytrain_tensor), batch_size=50, shuffle=True)

In [22]:
device = 'cuda'
class SentimentRNN(nn.Module):
    def __init__(self,no_layers,vocab_size,output_dim,hidden_dim,embedding_dim,drop_prob=0.5):
        super(SentimentRNN,self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.no_layers = no_layers
        self.vocab_size = vocab_size

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        #lstm
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                            num_layers=no_layers, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self,x,hidden):
        batch_size = x.size(0)
        # embeddings and lstm_out
        embeds = self.embedding(x)  # shape: B x S x Feature   since batch = True
        #print(embeds.shape)  #[50, 500, 1000]
        lstm_out, hidden = self.lstm(embeds, hidden)

        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.sig(out)

        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)


        sig_out = sig_out[:, -1] # get last batch of labels

        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        hidden = (h0,c0)
        return hidden 

In [23]:
layers = 2
vocab_size = len(ohe)+1
embedding_dim = 256
output_dim = 2

model = SentimentRNN(layers, vocab_size, output_dim, 512, embedding_dim)
model.to(device)
print(model)

SentimentRNN(
  (embedding): Embedding(14894, 256)
  (lstm): LSTM(256, 512, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=2, bias=True)
  (sig): Sigmoid()
)


In [24]:
lr=0.001

criterion = nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [26]:
def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

clip = 5
epochs = 20
valid_loss_min = np.Inf
batch_size = 50
# train for some number of epochs
epoch_tr_loss,epoch_vl_loss = [],[]
epoch_tr_acc,epoch_vl_acc = [],[]

for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0
    model.train()
    # initialize hidden state
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:

        inputs, labels = inputs.to(device), labels.to(device)
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])
        model.zero_grad()
        output,h = model(inputs,h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())
        # calculating accuracy
        accuracy = acc(output,labels)
        train_acc += accuracy
        #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    val_h = model.init_hidden(batch_size)
    val_losses = []
    val_acc = 0.0
    model.eval()
    for inputs, labels in valid_loader:
        val_h = tuple([each.data for each in val_h])

        inputs, labels = inputs.to(device), labels.to(device)

        output, val_h = model(inputs, val_h)
        val_loss = criterion(output.squeeze(), labels.float())

        val_losses.append(val_loss.item())

        accuracy = acc(output,labels)
        val_acc += accuracy

    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_val_acc = val_acc/len(valid_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    print(f'Epoch {epoch+1}')
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')
    if epoch_val_loss <= valid_loss_min:
        torch.save(model.state_dict(), 'state_dict.pt')
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,epoch_val_loss))
        valid_loss_min = epoch_val_loss
    print(25*'==')

Epoch 1
train_loss : 0.6833026577767572 val_loss : 0.6831850876149378
train_accuracy : 57.131578947368425 val_accuracy : 57.131578947368425
Validation loss decreased (inf --> 0.683185).  Saving model ...
Epoch 2
train_loss : 0.6833765871430698 val_loss : 0.6837193111055776
train_accuracy : 57.131578947368425 val_accuracy : 57.131578947368425
Epoch 3
train_loss : 0.6834538790740465 val_loss : 0.6833664719995699
train_accuracy : 57.131578947368425 val_accuracy : 57.131578947368425
Epoch 4
train_loss : 0.6833884025874891 val_loss : 0.6829421837863169
train_accuracy : 57.131578947368425 val_accuracy : 57.131578947368425
Validation loss decreased (0.683185 --> 0.682942).  Saving model ...
Epoch 5
train_loss : 0.6832401823056372 val_loss : 0.6830517759448603
train_accuracy : 57.131578947368425 val_accuracy : 57.131578947368425
Epoch 6
train_loss : 0.6833346352765435 val_loss : 0.6831266648675266
train_accuracy : 57.131578947368425 val_accuracy : 57.131578947368425
Epoch 7
train_loss : 0.6831

In [32]:

def predict(padded):
    inputs = padded.to(device)
    batch_size = 1
    h = model.init_hidden(batch_size)
    h = tuple([each.data for each in h])
    output, h = model(inputs, h)
    print(output.item())

In [35]:
y = predict(test_tensor[6:7])

0.43303418159484863


In [None]:
padded_test[0]