In [1]:
# Importing required libraries


import pandas as pd
import text_utils
import model_utils
import configs
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Importing data

train=pd.read_csv('./Corona_NLP_train.csv',encoding='latin1')
test=pd.read_csv('./Corona_NLP_test.csv')
print('Number of training shape',train.shape)
print('Number of testing records',test.shape)
print('Number of unique labels in train',train['Sentiment'].nunique())
print('Number of unique labels in test',test['Sentiment'].nunique())

In [None]:
# Seeding everything for reproducability
text_utils.seed_everything()

In [None]:
# Text classification pipeline

processor=text_utils.text_processor()
train['cleaned_tweet']=train['OriginalTweet'].apply(lambda x:processor.process_text(x))
test['cleaned_tweet']=test['OriginalTweet'].apply(lambda x:processor.process_text(x))
train.head()       

In [None]:
# Check word count distribution to identify the length of the sequence (max_len)
lengths = train['cleaned_tweet'].apply(lambda s : len(s))
lengths.plot.hist(bins=100)

In [None]:
## Tokenize the sentences and pad sequences

tokenizer = Tokenizer(num_words=configs.max_features)
tokenizer.fit_on_texts(list(train['cleaned_tweet']))
train_X = tokenizer.texts_to_sequences(train['cleaned_tweet'])
test_X = tokenizer.texts_to_sequences(test['cleaned_tweet'])

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=configs.maxlen)
test_X = pad_sequences(test_X, maxlen=configs.maxlen)

In [None]:
# Encoding output variable
le = LabelEncoder()
train_y = le.fit_transform(list(train['Sentiment']))
test_y = le.transform(list(test['Sentiment']))

# Savign label encoder for inference pipeline
output = open('label_encoder.pkl', 'wb')
pickle.dump(le, output)
output.close()

In [None]:
# Loading the embedding matrix from glove pretrained embedding

embedding_matrix = text_utils.load_glove(tokenizer.word_index,configs.max_features)

In [None]:
# Loading model architecture

model=model_utils.BiLSTM(configs.hidden_size,configs.num_classes,configs.dropout,configs.max_features,configs.embed_size,
                         embedding_matrix)

In [None]:
# GPU check

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

In [None]:
# Loading data and model configurations

loss_fn = nn.CrossEntropyLoss(reduction='sum')
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)


# Convert train and test numpy arrays as tensors
x_train = torch.tensor(train_X, dtype=torch.long)
y_train = torch.tensor(train_y, dtype=torch.long)
x_cv = torch.tensor(test_X, dtype=torch.long)
y_cv = torch.tensor(test_y, dtype=torch.long)

# Packaging features and labels together
train = torch.utils.data.TensorDataset(x_train, y_train)
valid = torch.utils.data.TensorDataset(x_cv, y_cv)

# Loading into data loader for batch operation
train_loader = torch.utils.data.DataLoader(train, batch_size=configs.batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=configs.batch_size, shuffle=False)


In [None]:
# Training the model

train_loss = []
valid_loss = []

for epoch in range(configs.n_epochs):
    start_time = time.time()
    model.train()
    avg_loss = 0  
    for i, (x_batch, y_batch) in enumerate(train_loader):
        # Forward Pass
        y_pred = model(x_batch)
        # Compute loss
        loss = loss_fn(y_pred, y_batch)
        # Back prop
        optimizer.zero_grad()
        loss.backward()
        # Optimizer step
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)
    
    # Set model to validation configuration
    model.eval()        
    avg_val_loss = 0.
    val_preds = np.zeros((len(x_cv),len(le.classes_)))
    
    for i, (x_batch, y_batch) in enumerate(valid_loader):
        y_pred = model(x_batch).detach()
        avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
        # keep/store predictions
        val_preds[i * batch_size:(i+1) * batch_size] =F.softmax(y_pred).cpu().numpy()
    
    # Check Accuracy
    val_accuracy = sum(val_preds.argmax(axis=1)==test_y)/len(test_y)
    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    elapsed_time = time.time() - start_time 
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f}  \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, val_accuracy, elapsed_time))

In [None]:
# Saving model weights

torch.save(model.state_dict(), './bilstm.pt')

In [None]:
# Debugging

In [3]:
from fastapi import FastAPI
import torch
import model_utils
import configs
from pydantic import BaseModel
import numpy as np

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

app = FastAPI(title="NLP tester", description="API for NLP use cases", version="1.0")

model = model_utils.BiLSTM(configs.hidden_size,configs.num_classes,configs.dropout,
configs.max_features,configs.embed_size,np.zeros((configs.max_features, configs.embed_size)))
model.load_state_dict(torch.load('./model_files/bilstm.pt'))

ModuleNotFoundError: No module named 'model_utils'

In [1]:
import requests,json
payload = json.dumps({
  "text": 'Hello'
})
response = requests.put("http://127.0.0.1:8000/predict",data = payload)
data_dict = response.json()

In [2]:
data_dict

{'detail': 'Method Not Allowed'}