In [20]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import GloVe
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS
import pandas as pd
from torchtext.data.utils import get_tokenizer
import re
import tensorflow as tf
from nltk.tokenize import word_tokenize



In [3]:
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

print("Number of data points in training data:", df_train.shape[0])
print("Number of data points in test data:", df_test.shape[0])

Number of data points in training data: 1306122
Number of data points in test data: 375806


In [4]:
# Replacing math equations and url addresses with tags.
def clean_tag(x):
  if '[math]' in x:
    x = re.sub('\[math\].*?math\]', 'MATH EQUATION', x) #replacing with [MATH EQUATION]
    
  if 'http' in x or 'www' in x:
    x = re.sub('(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+', 'URL', x) #replacing with [url]
  return x

In [5]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', 
        '•', '~', '@', '£', '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 
        '█', '…', '“', '★', '”', '–', '●', '►', '−', '¢', '¬', '░', '¡', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', 
        '—', '‹', '─', '▒', '：', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', '¯', '♦', '¤', '▲', '¸', '⋅', '‘', '∞', 
        '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '・', '╦', '╣', '╔', '╗', '▬', '❤', '≤', '‡', '√', '◄', '━', 
        '⇒', '▶', '≥', '╝', '♡', '◊', '。', '✈', '≡', '☺', '✔', '↵', '≈', '✓', '♣', '☎', '℃', '◦', '└', '‟', '～', '！', '○', 
        '◆', '№', '♠', '▌', '✿', '▸', '⁄', '□', '❖', '✦', '．', '÷', '｜', '┃', '／', '￥', '╠', '↩', '✭', '▐', '☼', '☻', '┐', 
        '├', '«', '∼', '┌', '℉', '☮', '฿', '≦', '♬', '✧', '〉', '－', '⌂', '✖', '･', '◕', '※', '‖', '◀', '‰', '\x97', '↺', 
        '∆', '┘', '┬', '╬', '،', '⌘', '⊂', '＞', '〈', '⎙', '？', '☠', '⇐', '▫', '∗', '∈', '≠', '♀', '♔', '˚', '℗', '┗', '＊', 
        '┼', '❀', '＆', '∩', '♂', '‿', '∑', '‣', '➜', '┛', '⇓', '☯', '⊖', '☀', '┳', '；', '∇', '⇑', '✰', '◇', '♯', '☞', '´', 
        '↔', '┏', '｡', '◘', '∂', '✌', '♭', '┣', '┴', '┓', '✨', '\xa0', '˜', '❥', '┫', '℠', '✒', '［', '∫', '\x93', '≧', '］', 
        '\x94', '∀', '♛', '\x96', '∨', '◎', '↻', '⇩', '＜', '≫', '✩', '✪', '♕', '؟', '₤', '☛', '╮', '␊', '＋', '┈', '％', 
        '╋', '▽', '⇨', '┻', '⊗', '￡', '।', '▂', '✯', '▇', '＿', '➤', '✞', '＝', '▷', '△', '◙', '▅', '✝', '∧', '␉', '☭', 
        '┊', '╯', '☾', '➔', '∴', '\x92', '▃', '↳', '＾', '׳', '➢', '╭', '➡', '＠', '⊙', '☢', '˝', '∏', '„', '∥', '❝', '☐', 
        '▆', '╱', '⋙', '๏', '☁', '⇔', '▔', '\x91', '➚', '◡', '╰', '\x85', '♢', '˙', '۞', '✘', '✮', '☑', '⋆', 'ⓘ', '❒', 
        '☣', '✉', '⌊', '➠', '∣', '❑', '◢', 'ⓒ', '\x80', '〒', '∕', '▮', '⦿', '✫', '✚', '⋯', '♩', '☂', '❞', '‗', '܂', '☜', 
        '‾', '✜', '╲', '∘', '⟩', '＼', '⟨', '·', '✗', '♚', '∅', 'ⓔ', '◣', '͡', '‛', '❦', '◠', '✄', '❄', '∃', '␣', '≪', '｢', 
        '≅', '◯', '☽', '∎', '｣', '❧', '̅', 'ⓐ', '↘', '⚓', '▣', '˘', '∪', '⇢', '✍', '⊥', '＃', '⎯', '↠', '۩', '☰', '◥', 
        '⊆', '✽', '⚡', '↪', '❁', '☹', '◼', '☃', '◤', '❏', 'ⓢ', '⊱', '➝', '̣', '✡', '∠', '｀', '▴', '┤', '∝', '♏', 'ⓐ', 
        '✎', ';', '␤', '＇', '❣', '✂', '✤', 'ⓞ', '☪', '✴', '⌒', '˛', '♒', '＄', '✶', '▻', 'ⓔ', '◌', '◈', '❚', '❂', '￦', 
        '◉', '╜', '̃', '✱', '╖', '❉', 'ⓡ', '↗', 'ⓣ', '♻', '➽', '׀', '✲', '✬', '☉', '▉', '≒', '☥', '⌐', '♨', '✕', 'ⓝ', 
        '⊰', '❘', '＂', '⇧', '̵', '➪', '▁', '▏', '⊃', 'ⓛ', '‚', '♰', '́', '✏', '⏑', '̶', 'ⓢ', '⩾', '￠', '❍', '≃', '⋰', '♋', 
        '､', '̂', '❋', '✳', 'ⓤ', '╤', '▕', '⌣', '✸', '℮', '⁺', '▨', '╨', 'ⓥ', '♈', '❃', '☝', '✻', '⊇', '≻', '♘', '♞', 
        '◂', '✟', '⌠', '✠', '☚', '✥', '❊', 'ⓒ', '⌈', '❅', 'ⓡ', '♧', 'ⓞ', '▭', '❱', 'ⓣ', '∟', '☕', '♺', '∵', '⍝', 'ⓑ', 
        '✵', '✣', '٭', '♆', 'ⓘ', '∶', '⚜', '◞', '்', '✹', '➥', '↕', '̳', '∷', '✋', '➧', '∋', '̿', 'ͧ', '┅', '⥤', '⬆', '⋱', 
        '☄', '↖', '⋮', '۔', '♌', 'ⓛ', '╕', '♓', '❯', '♍', '▋', '✺', '⭐', '✾', '♊', '➣', '▿', 'ⓑ', '♉', '⏠', '◾', '▹', 
        '⩽', '↦', '╥', '⍵', '⌋', '։', '➨', '∮', '⇥', 'ⓗ', 'ⓓ', '⁻', '⎝', '⌥', '⌉', '◔', '◑', '✼', '♎', '♐', '╪', '⊚', 
        '☒', '⇤', 'ⓜ', '⎠', '◐', '⚠', '╞', '◗', '⎕', 'ⓨ', '☟', 'ⓟ', '♟', '❈', '↬', 'ⓓ', '◻', '♮', '❙', '♤', '∉', '؛', 
        '⁂', 'ⓝ', '־', '♑', '╫', '╓', '╳', '⬅', '☔', '☸', '┄', '╧', '׃', '⎢', '❆', '⋄', '⚫', '̏', '☏', '➞', '͂', '␙', 
        'ⓤ', '◟', '̊', '⚐', '✙', '↙', '̾', '℘', '✷', '⍺', '❌', '⊢', '▵', '✅', 'ⓖ', '☨', '▰', '╡', 'ⓜ', '☤', '∽', '╘', 
        '˹', '↨', '♙', '⬇', '♱', '⌡', '⠀', '╛', '❕', '┉', 'ⓟ', '̀', '♖', 'ⓚ', '┆', '⎜', '◜', '⚾', '⤴', '✇', '╟', '⎛', 
        '☩', '➲', '➟', 'ⓥ', 'ⓗ', '⏝', '◃', '╢', '↯', '✆', '˃', '⍴', '❇', '⚽', '╒', '̸', '♜', '☓', '➳', '⇄', '☬', '⚑', 
        '✐', '⌃', '◅', '▢', '❐', '∊', '☈', '॥', '⎮', '▩', 'ு', '⊹', '‵', '␔', '☊', '➸', '̌', '☿', '⇉', '⊳', '╙', 'ⓦ', 
        '⇣', '｛', '̄', '↝', '⎟', '▍', '❗', '״', '΄', '▞', '◁', '⛄', '⇝', '⎪', '♁', '⇠', '☇', '✊', 'ி', '｝', '⭕', '➘', 
        '⁀', '☙', '❛', '❓', '⟲', '⇀', '≲', 'ⓕ', '⎥', '\u06dd', 'ͤ', '₋', '̱', '̎', '♝', '≳', '▙', '➭', '܀', 'ⓖ', '⇛', '▊', 
        '⇗', '̷', '⇱', '℅', 'ⓧ', '⚛', '̐', '̕', '⇌', '␀', '≌', 'ⓦ', '⊤', '̓', '☦', 'ⓕ', '▜', '➙', 'ⓨ', '⌨', '◮', '☷', 
        '◍', 'ⓚ', '≔', '⏩', '⍳', '℞', '┋', '˻', '▚', '≺', 'ْ', '▟', '➻', '̪', '⏪', '̉', '⎞', '┇', '⍟', '⇪', '▎', '⇦', '␝', 
        '⤷', '≖', '⟶', '♗', '̴', '♄', 'ͨ', '̈', '❜', '̡', '▛', '✁', '➩', 'ா', '˂', '↥', '⏎', '⎷', '̲', '➖', '↲', '⩵', '̗', '❢', 
        '≎', '⚔', '⇇', '̑', '⊿', '̖', '☍', '➹', '⥊', '⁁', '✢']

def clean_punct(x):
  x = str(x)
  for punct in puncts:
    if punct in x:
      x = x.replace(punct, ' ')
    return x

In [6]:
# remove stopwords
def remove_stopwords(x):
  x = [word for word in x.split() if word not in STOPWORDS]
  x = ' '.join(x)

  return x

# word lemmatizing
lemmatizer = WordNetLemmatizer()
def lemma_text(x):
  x = x.split()
  x = [lemmatizer.lemmatize(word) for word in x]
  x = ' '.join(x)

  return x

In [7]:
def data_cleaning(x):
  x = clean_tag(x)
  x = clean_punct(x)
  x = remove_stopwords(x)
  x = lemma_text(x)
  return x

In [8]:
# preprocessing given train and test data
df_train['preprocessed_question_text'] = df_train['question_text'].map(lambda x: data_cleaning(x))
df_test['preprocessed_question_text'] = df_test['question_text'].map(lambda x: data_cleaning(x))

In [9]:
df_test

Unnamed: 0,qid,question_text,preprocessed_question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...,Why many woman become rude arrogant little bit...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...,When I apply RV college engineering BMS colleg...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...,What really nurse practitioner?
3,000086e4b7e1c7146103,Who are entrepreneurs?,Who entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?,Is education really making good people nowadays?
...,...,...,...
375801,ffff7fa746bd6d6197a9,How many countries listed in gold import in in...,How many country listed gold import indua?
375802,ffffa1be31c43046ab6b,Is there an alternative to dresses on formal p...,Is alternative dress formal parties?
375803,ffffae173b6ca6bfa563,Where I can find best friendship quotes in Tel...,Where I find best friendship quote Telugu?
375804,ffffb1f7f1a008620287,What are the causes of refraction of light?,What cause refraction light?


In [10]:
# Split the DataFrame into features (text) and labels
X = df_train['preprocessed_question_text']
y = df_train['target']

In [11]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [44]:
tokens = [word for sentence in X for word in sentence.split()]

# Count unique tokens
vocabulary_size = len(set(tokens))

In [45]:
vocabulary_size

461951

In [26]:

from tensorflow.keras.preprocessing.sequence import pad_sequences
# Tokenization function
def vectorize(text):
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    input_sequence = pad_sequences(sequences,maxlen = 20, padding = 'pre', truncating = 'post')
    return input_sequence
# in this case scemantic meaning is not captured, the words are just represented by their index in the vocabulary list (scalar)
# To capture the scemantic meaning we need to convert into word embeddings which can be trained for a particular corpus or can be derived using pretrained 
# word embeddings  
# Tokenize the text data
X_train_padded = vectorize(X_train)
X_val_padded = vectorize(X_val)
X_test_padded = vectorize(X_test)

In [31]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import numpy as np

# Convert Pandas Series to NumPy arrays
y_train_np = y_train.to_numpy()
y_val_np = y_val.to_numpy()
y_test_np = y_test.to_numpy()

# Convert NumPy arrays to PyTorch tensors
y_train_tensor = torch.LongTensor(y_train_np)
y_val_tensor = torch.LongTensor(y_val_np)
y_test_tensor = torch.LongTensor(y_test_np)
X_train_tensor = torch.LongTensor(X_train_padded)
X_val_tensor = torch.LongTensor(X_val_padded)
X_test_tensor = torch.LongTensor(X_test_padded)
# Define a custom dataset class
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]



In [32]:
# Create datasets
train_dataset = TextDataset(X_train_tensor, y_train_tensor)
val_dataset = TextDataset(X_val_tensor, y_val_tensor)
test_dataset = TextDataset(X_test_tensor, y_test_tensor)

# Define batch size
batch_size = 64

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Test the data loaders
for batch in train_loader:
    inputs, labels = batch
    print("Inputs shape:", inputs.shape)
    print("Labels shape:", labels.shape)
    break  # Print the first batch only

Inputs shape: torch.Size([64, 20])
Labels shape: torch.Size([64])


In [46]:
import torch
import torch.nn as nn
import torch.optim as optim

import torch
import torch.nn as nn

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        # Take the hidden state from the last time step
        hidden = hidden.squeeze(0)
        output = self.fc(hidden)
        return output

# Define the input size, hidden size, and output size
input_size = vocabulary_size 
hidden_size = 128
output_size = 2  # Assuming binary classification

# Create the model instance
model = SimpleRNN(input_size, hidden_size, output_size)

# Print the model architecture
print(model)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

SimpleRNN(
  (embedding): Embedding(461951, 128)
  (rnn): RNN(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)


In [49]:
import torch.optim as optim
# Define hyperparameters
learning_rate = 0.001
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
model.to(device)

# Training loop
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    
    # Iterate over the training data
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()  # Zero the gradients
        
        # Forward pass
        outputs = model(inputs)
        
        # Calculate loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    # Calculate average loss for the epoch
    train_loss = running_loss / len(train_loader)
    
    # Print training loss for the epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}')

print('Training finished')

Epoch [1/10], Loss: 0.1341
Epoch [2/10], Loss: 0.1147
Epoch [3/10], Loss: 0.1084
Epoch [4/10], Loss: 0.1046
Epoch [5/10], Loss: 0.1010
Epoch [6/10], Loss: 0.0979
Epoch [7/10], Loss: 0.0949
Epoch [8/10], Loss: 0.0915
Epoch [9/10], Loss: 0.0886
Epoch [10/10], Loss: 0.0860
Training finished


In [None]:
from sklearn.metrics import f1_score

# Define function for testing
def test_model(model, test_loader, criterion):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    correct = 0
    total = 0
    predicted_labels = []
    true_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # Move data to device
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            predicted_labels.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = correct / total
    f1 = f1_score(true_labels, predicted_labels, average='macro')
    return accuracy, f1

# Test the model
test_accuracy, test_f1 = test_model(model, test_loader, criterion)
print(f"Test Accuracy: {test_accuracy:.2%}")
print(f"Test F1 Score: {test_f1:.4f}")

In [57]:
def test_one_epoch():

    model.train(False)
    running_loss = 0.0
    running_accuracy = 0.0
    predictions = []
    true_labels = []

    for i, data in enumerate(test_loader):
        input, labels = data[0].to(device), data[1].to(device)

        with torch.no_grad():
            outputs = model(input) # shape: [batch_size, 10]
            _,predicted = torch.max(outputs,1)
            correct = torch.sum(labels == torch.argmax(outputs, dim=1)).item()
            running_accuracy += correct / batch_size
            loss = criterion(outputs, labels) # One number, the average batch loss
            running_loss += loss.item()
            predictions.extend(predicted.cpu().numpy())  # Move predictions back to CPU
            true_labels.extend(labels.cpu().numpy())  # Move true labels back to CPU

    avg_loss_across_batches = running_loss / len(test_loader)
    avg_acc_across_batches = (running_accuracy / len(test_loader)) * 100
    f1 = f1_score(true_labels, predictions, average='weighted')

    print('test Loss: {0:.5f}, test Accuracy: {1:.3f}%'.format(avg_loss_across_batches,
                                                            avg_acc_across_batches))
    print('***************************************************')
    print()
    print(f"F1 Score: {f1:.4f}")

    return avg_loss_across_batches,avg_acc_across_batches
        

In [58]:
loss, accuracy = test_one_epoch()

test Loss: 0.33263, test Accuracy: 92.592%
***************************************************

F1 Score: 0.9079


In [59]:
import pickle

# Define the file path to save the model
model_path = 'model.pkl'

# Move the model to CPU
model_cpu = model.to('cpu')

# Serialize and save the model
with open(model_path, 'wb') as f:
    pickle.dump(model_cpu, f)

# # Load the model from file
# with open(model_path, 'rb') as f:
#     loaded_model = pickle.load(f)

# # Ensure the model is on the correct device (e.g., GPU)
# loaded_model = loaded_model.to(device)

# # Set the model to evaluation mode
# loaded_model.eval()