<a href="https://www.kaggle.com/code/citipop/disastertweet?scriptVersionId=134343393" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [2]:
import numpy as np 
import pandas as pd 
from collections import Counter
from sklearn.model_selection import train_test_split
import string

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/co

# **Download**

In [3]:
df_train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
df_test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

X_train = df_train["text"]
X_test = df_test["text"]
y_train = df_train["target"]

# **Preprocess**

In [4]:
lemmatizer = WordNetLemmatizer()
stw = stopwords.words("english")
pd.options.mode.chained_assignment = None

def preprocess(X):
    i = 0
    for sent in X:
        sent = sent.lower()
        sent = word_tokenize(sent)
        sent = [token for token in sent if token not in string.punctuation]
        sent = [lemmatizer.lemmatize(token) for token in sent if token not in stw]
        sent = [token for token in sent if len(token)>2]
        sent = [token for token in sent if token.isalpha()]
        X.loc[i] = sent
        i+=1
    return X

X_train = preprocess(X_train)

In [5]:
#Pick most only common words and convert minor words to "<unk>"
max_vocab = 1000
all_tokens = [token for tokens in X_train for token in tokens]
common_tokens = set(list(zip(*Counter(all_tokens).most_common(max_vocab)))[0])
vocab = set(token for token in common_tokens)
vocab = ["<pad>"] + ["<unk>"] + sorted(list(vocab))
word2id = dict((word, i) for i, word in enumerate(vocab))
id2word = dict((i, word) for i, word in enumerate(vocab))
X_train = [[token if token in vocab else "<unk>" for token in tokens]for tokens in X_train]

#Padding
X_train = [[word2id.get(token) for token in sent] for sent in X_train]
X_train = torch.nn.utils.rnn.pad_sequence([torch.tensor(id_list) for id_list in X_train], batch_first=True)

In [6]:
X_test = preprocess(X_test)
X_test = [[token if token in vocab else "<unk>" for token in tokens]for tokens in X_test]
X_test = [[word2id.get(token) for token in sent] for sent in X_test]
X_test = torch.nn.utils.rnn.pad_sequence([torch.tensor(id_list) for id_list in X_test], batch_first=True)

# **Datasets**

In [7]:
y_train = list(y_train)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [8]:
class Dataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
        
    def __getitem__(self, idx):
        X = self.X[idx]
        y = self.y[idx]
        return X, y

train_set = Dataset(X_train, y_train)
valid_set =  Dataset(X_val, y_val)
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=64, shuffle=True)

# **Model**

In [9]:
class RNNClassifier(nn.Module):
    def __init__(self, output_size, hidden_size, vocab_size, padding_idx,
                 device, dropout_probability=0.3, bidirectional=False, n_layers=1,
                 embedding_dimension=50, batch_size=32):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.n_layers = n_layers
        self.dropout_probability = dropout_probability
        self.device = device
        self.padding_idx = padding_idx
        
        # We need to multiply some layers by two if the model is bidirectional
        self.input_size_factor = 2 if bidirectional else 1
        
        self.embedding = nn.Embedding(vocab_size, embedding_dimension)
        
        self.rnn = nn.LSTM(
            embedding_dimension,
            self.hidden_size,
            self.n_layers,
            bidirectional=bidirectional,
        )

        self.fc1 = nn.Linear(
            self.hidden_size * self.input_size_factor,
            16,
        )
        self.fc2 = nn.Linear(
            16,
            self.output_size,
        )


    def init_hidden(self):
        h0 = torch.randn(
            self.n_layers * self.input_size_factor,
            self.batch_size,
            self.hidden_size,
        )
        c0 = torch.randn(
            self.n_layers * self.input_size_factor,
            self.batch_size,
            self.hidden_size,
        )
        
        h0 = h0.to(self.device)
        c0 = c0.to(self.device)

        return h0, c0
    
    def apply_rnn(self, embedding_out, lengths):
        packed = pack_padded_sequence(
            embedding_out,
            lengths,
            batch_first=True,
        )
        activations, _ = self.rnn(packed, self.init_hidden())
        activations, _ = pad_packed_sequence(activations, batch_first=True)
        
        indices = (lengths - 1).view(-1, 1).expand(
            activations.size(0), activations.size(2),
        ).unsqueeze(1)
        indices = indices.to(self.device)
        
        activations = activations.gather(1, indices).squeeze(1)
        return activations

    def forward(self, X, return_activations=False):
        batch_size = len(X)
        
        if batch_size != self.batch_size:
            self.batch_size = batch_size

        lengths = torch.LongTensor([len(tokens) for tokens in X])
        lengths, permutation_indices = lengths.sort(0, descending=True)
        
        X = torch.LongTensor(X)
        X = X[permutation_indices].to(self.device)
        
        embedding_out = self.embedding(X)
        
        activations = self.apply_rnn(embedding_out, lengths)

        x = F.dropout(torch.relu(self.fc1(activations)), 0.05)
        x = self.fc2(x)
        out = torch.sigmoid(x)


        permutation_index_pairs = list(zip(
            permutation_indices.tolist(),
            list(range(len(permutation_indices))),
        ))
        reordered_indices = [
            pair[1] for pair
            in sorted(permutation_index_pairs, key=lambda pair: pair[0])
        ]

        if return_activations:
            return out[reordered_indices], x[reordered_indices]

        return out[reordered_indices]

In [10]:
for i, j in train_loader:
    samp_X = i
    break
model(samp_X)

NameError: name 'model' is not defined

In [11]:
dropout_probability = 0.2  
n_rnn_layers = 1  
embedding_dimension = 128  
hidden_size = 64  
is_bidirectional = True 
max_epochs = 10  
learning_rate = 0.001 
batch_size = 64

model = RNNClassifier(
    output_size=2,  
    hidden_size=hidden_size,
    embedding_dimension=embedding_dimension,
    vocab_size=len(vocab),
    padding_idx=word2id['<pad>'],
    dropout_probability=dropout_probability,
    bidirectional=is_bidirectional,
    n_layers=n_rnn_layers,
    device=device,
    batch_size=batch_size,
)

# **Train and Evaluation**

In [12]:
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=learning_rate,
)
scheduler = CosineAnnealingLR(optimizer, 1)

In [13]:
def train_epoch(model, optimizer, scheduler, train_loader):
    model.train()
    total_loss = total = 0
    progress_bar = tqdm_notebook(train_loader, desc='Training', leave=False)
    for inputs, target in progress_bar:
        target = target.to(device)

        # Clean old gradients
        optimizer.zero_grad()

        # Forwards pass
        output = model(inputs)

        # Calculate how wrong the model is
        loss = criterion(output, target)

        # Perform gradient descent, backwards pass
        loss.backward()

        # Take a step in the right direction
        optimizer.step()
        scheduler.step()

        # Record metrics
        total_loss += loss.item()
        total += len(target)

    return total_loss / total


def validate_epoch(model, valid_loader):
    model.eval()
    total_loss = total = 0
    with torch.no_grad():
        progress_bar = tqdm_notebook(valid_loader, desc='Validating', leave=False)
        for inputs, target in progress_bar:
            target = target.to(device)

            # Forwards pass
            output = model(inputs)

            # Calculate how wrong the model is
            loss = criterion(output, target)

            # Record metrics
            total_loss += loss.item()
            total += len(target)

    return total_loss / total

In [14]:
from tqdm import tqdm, tqdm_notebook

n_epochs = 0
train_losses, valid_losses = [], []
for _ in range(max_epochs):
    train_loss = train_epoch(model, optimizer, scheduler, train_loader)
    valid_loss = validate_epoch(model, valid_loader)

    tqdm.write(
        f'epoch #{n_epochs + 1:3d}\ttrain_loss: {train_loss:.2e}'
        f'\tvalid_loss: {valid_loss:.2e}\n',
    )

    # Early stopping if the current valid_loss is greater than the last three valid losses
    if len(valid_losses) > 2 and all(valid_loss >= loss
                                     for loss in valid_losses[-3:]):
        print('Stopping early')
        break

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    n_epochs += 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(train_loader, desc='Training', leave=False)


Training:   0%|          | 0/108 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(valid_loader, desc='Validating', leave=False)


Validating:   0%|          | 0/12 [00:00<?, ?it/s]

epoch #  1	train_loss: 1.08e-02	valid_loss: 1.08e-02



Training:   0%|          | 0/108 [00:00<?, ?it/s]

Validating:   0%|          | 0/12 [00:00<?, ?it/s]

epoch #  2	train_loss: 1.06e-02	valid_loss: 1.03e-02



Training:   0%|          | 0/108 [00:00<?, ?it/s]

Validating:   0%|          | 0/12 [00:00<?, ?it/s]

epoch #  3	train_loss: 1.00e-02	valid_loss: 9.82e-03



Training:   0%|          | 0/108 [00:00<?, ?it/s]

Validating:   0%|          | 0/12 [00:00<?, ?it/s]

epoch #  4	train_loss: 9.42e-03	valid_loss: 9.20e-03



Training:   0%|          | 0/108 [00:00<?, ?it/s]

Validating:   0%|          | 0/12 [00:00<?, ?it/s]

epoch #  5	train_loss: 8.94e-03	valid_loss: 9.16e-03



Training:   0%|          | 0/108 [00:00<?, ?it/s]

Validating:   0%|          | 0/12 [00:00<?, ?it/s]

epoch #  6	train_loss: 8.43e-03	valid_loss: 8.76e-03



Training:   0%|          | 0/108 [00:00<?, ?it/s]

Validating:   0%|          | 0/12 [00:00<?, ?it/s]

epoch #  7	train_loss: 8.38e-03	valid_loss: 8.89e-03



Training:   0%|          | 0/108 [00:00<?, ?it/s]

Validating:   0%|          | 0/12 [00:00<?, ?it/s]

epoch #  8	train_loss: 8.04e-03	valid_loss: 8.96e-03



Training:   0%|          | 0/108 [00:00<?, ?it/s]

Validating:   0%|          | 0/12 [00:00<?, ?it/s]

epoch #  9	train_loss: 7.87e-03	valid_loss: 8.81e-03



Training:   0%|          | 0/108 [00:00<?, ?it/s]

Validating:   0%|          | 0/12 [00:00<?, ?it/s]

epoch # 10	train_loss: 7.68e-03	valid_loss: 8.78e-03



In [None]:
for X, y in valid_loader:
    pred = torch.argmax(model(X), dim=1)
    print(torch.eq(pred, y).sum()/torch.numel(pred)*100)
    break

In [None]:
import matplotlib.pyplot as plt
epoch_ticks = range(1, n_epochs + 1)
plt.plot(epoch_ticks, train_losses)
plt.plot(epoch_ticks, valid_losses)
plt.legend(['Train Loss', 'Valid Loss'])
plt.title('Losses')
plt.xlabel('Epoch #')
plt.ylabel('Loss')
plt.xticks(epoch_ticks)
plt.show()

# **Test and Submission**

In [None]:
class Dataset(Dataset):
    def __init__(self, X):
        self.X = X
        
    def __len__(self):
        return len(self.X)
        
    def __getitem__(self, idx):
        X = self.X[idx]
        return X

test_set = Dataset(X_test)
test_loader = DataLoader(test_set, batch_size=64, shuffle=False)

In [None]:
def test(model, test_loader, preds_list):
    with torch.no_grad():
        for X in test_loader:
            preds = model(X)
            preds = preds.argmax(dim=-1)
            preds_list = preds_list + list(preds)
        return preds_list
            
preds_list = []
preds_list = test(model, test_loader, preds_list)
preds_list = list(map(lambda x: int(x), preds_list))

In [None]:
df_sub = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
df_sub['target'] = preds_list
df_sub.to_csv('submission.csv', index=False)