In [1]:
import nltk
from nltk.stem.porter import *
from torch.nn import *
from torch.optim import *
import numpy as np
import pandas as pd
import torch,torchvision
import random
from tqdm import *
from torch.utils.data import Dataset,DataLoader
stemmer = PorterStemmer()

In [2]:
def tokenize(sentence):
    return nltk.word_tokenize(sentence)

In [3]:
tokenize('$1000')

['$', '1000']

In [4]:
def stem(word):
    return stemmer.stem(word.lower())

In [5]:
stem('organic')

'organ'

In [6]:
def bag_of_words(tokenized_words,all_words):
    tokenized_words = [stem(w) for w in tokenized_words]
    bag = np.zeros(len(all_words))
    for idx,w in enumerate(all_words):
        if w in tokenized_words:
            bag[idx] = 1.0
    return bag

In [7]:
bag_of_words(['hi'],['hi','hey','hi'])

array([1., 0., 1.])

In [8]:
data = pd.read_csv('./data.csv')[:5005]

In [9]:
data.dropna(inplace=True)

In [10]:
data.sample(frac=1)

Unnamed: 0.1,Unnamed: 0,product,narrative
4763,4763,credit_reporting,receiving current copy credit report discovere...
4962,4962,credit_reporting,receiving current copy credit report discovere...
4377,4377,mortgages_and_loans,locked rate loan agent guided web side noticed...
1837,1837,debt_collection,month back contacted company called revco solu...
1980,1980,credit_reporting,hard inquiry submitted never authorized called...
...,...,...,...
1352,1352,credit_reporting,account appear credit without understanding ev...
4985,4985,credit_reporting,receiving notice furloughed employer called co...
480,480,credit_reporting,tx tx co co xxxxxxxx fraudulent application su...
3914,3914,credit_reporting,address name account mine sen ftc filed day ag...


In [11]:
data.drop(data.index[data['narrative'] == 'name'],inplace=True)

In [12]:
X = data['narrative']

In [13]:
y = data['product']

In [14]:
words = []
data = []
labels = {}
idx = 0

In [15]:
for X_batch,y_batch in zip(X,y):
    if y_batch not in list(labels.keys()):
        idx += 1
        labels[y_batch] = idx

In [16]:
labels

{'credit_card': 1,
 'retail_banking': 2,
 'credit_reporting': 3,
 'mortgages_and_loans': 4,
 'debt_collection': 5}

In [17]:
for X_batch,y_batch in tqdm(zip(X,y)):
    X_batch = tokenize(X_batch)
    new_X = []
    for Xb in X_batch:
        new_X.append(stem(Xb))
    words.extend(new_X)
    data.append([X_batch,np.eye(labels[y_batch] + 1,len(labels))[labels[y_batch]]])    

5002it [00:12, 389.84it/s] 


In [18]:
words = sorted(set(words))

In [19]:
np.random.shuffle(data)

In [20]:
X = []
y = []

In [21]:
for sentence,tag in tqdm(data):
    X.append(bag_of_words(sentence,words))
    y.append(tag)

100%|██████████████████████████████████████| 5002/5002 [00:42<00:00, 118.70it/s]


In [22]:
from sklearn.model_selection import *

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,shuffle=False)

In [25]:
device = 'cuda'

In [26]:
X_train = torch.from_numpy(np.array(X_train)).to(device).float()
y_train = torch.from_numpy(np.array(y_train)).to(device).float()
X_test = torch.from_numpy(np.array(X_test)).to(device).float()
y_test = torch.from_numpy(np.array(y_test)).to(device).float()

In [27]:
def get_loss(model,X,y,criterion):
    preds = model(X)
    loss = criterion(preds,y)
    return loss.item()

In [28]:
def get_accuracy(model,X,y):
    preds = model(X)
    correct = 0
    total = 0
    for pred,y_batch in zip(preds,y):
        pred = int(torch.argmax(pred))
        y_batch = int(torch.argmax(y_batch))
        if pred == y_batch:
            correct += 1
        total += 1
    acc = round(correct/total,3)*100
    return acc

In [29]:
class Model(Module):
    def __init__(self):
        super().__init__()
        self.iters = 12
        self.activation = ReLU()
        self.linear1 = Linear(len(words),512)
        self.linear2 = Linear(512,512)
        self.output = Linear(512,len(labels))
    
    def forward(self,X):
        preds = self.linear1(X)
        for _ in range(self.iters):
            preds = self.activation(self.linear2(preds))
        perds = self.output(preds)
        return preds

In [30]:
model = Model().to(device)

In [31]:
criterion = MSELoss()

In [32]:
optimizer = Adam(model.parameters(),lr=0.001)

In [33]:
batch_size = 8

In [34]:
epochs = 100

In [35]:
import wandb

In [36]:
PROJECT_NAME = 'Consumer-Complaints-NLP'

In [37]:
torch.cuda.empty_cache()
wandb.init(project=PROJECT_NAME,name='baseline')
for _ in tqdm(range(epochs)):
    torch.cuda.empty_cache()
    for i in range(0,len(X_train),batch_size):
        torch.cuda.empty_cache()
        X_batch = X_train[i:i+batch_size].to(device).float()
        y_batch = y_train[i:i+batch_size].to(device).float()
        preds = model(X_batch)
        loss = criterion(preds,y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()
    model.eval()
    torch.cuda.empty_cache()
    wandb.log({'Loss':(get_loss(model,X_train,y_train,criterion)+get_loss(model,X_batch,y_batch,criterion)/2)})
    torch.cuda.empty_cache()
    wandb.log({'Val Loss':get_loss(model,X_test,y_test,criterion)})
    torch.cuda.empty_cache()
    wandb.log({'Acc':(get_accuracy(model,X_train,y_train)+get_accuracy(model,X_batch,y_batch))/2})
    torch.cuda.empty_cache()
    wandb.log({'Val Acc':get_accuracy(model,X_test,y_test)})
    torch.cuda.empty_cache()
    model.train()
wandb.finish()
torch.cuda.empty_cache()

[34m[1mwandb[0m: Currently logged in as: [33mranuga-d[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  return F.mse_loss(input, target, reduction=self.reduction)
  0%|                                                   | 0/100 [00:00<?, ?it/s]


RuntimeError: The size of tensor a (512) must match the size of tensor b (5) at non-singleton dimension 1