In [1]:
import nltk
from nltk.stem.porter import *
from torch.nn import *
from torch.optim import *
import numpy as np
import pandas as pd
import torch,torchvision
import random
from tqdm import *
from torch.utils.data import Dataset,DataLoader
# Single Porter stemmer instance; the `stem` helper defined later calls it.
stemmer = PorterStemmer()

In [2]:
def tokenize(sentence):
    """Split ``sentence`` into tokens with NLTK's ``word_tokenize``."""
    tokens = nltk.word_tokenize(sentence)
    return tokens

In [3]:
# Quick check: the currency symbol comes out as its own token.
tokenize('$1000')

['$', '1000']

In [4]:
def stem(word):
    """Lower-case ``word`` and return its stem from the shared ``stemmer``."""
    lowered = word.lower()
    return stemmer.stem(lowered)

In [5]:
# Quick check of the stemmer on a single word.
stem('organic')

'organ'

In [6]:
def bag_of_words(tokenized_words,all_words):
    """Encode a tokenized sentence as a binary bag-of-words vector.

    Args:
        tokenized_words: iterable of raw tokens; each one is passed through
            ``stem`` before it is matched against the vocabulary.
        all_words: sequence of vocabulary entries to match against. Duplicate
            entries are allowed; every matching position is set.

    Returns:
        A float ``np.ndarray`` of length ``len(all_words)`` holding 1.0 at each
        position whose vocabulary entry occurs in the sentence, 0.0 elsewhere.
    """
    # A set gives constant-time membership tests. The previous list lookup made
    # the loop cost grow with len(all_words) * len(tokenized_words).
    stemmed = {stem(w) for w in tokenized_words}
    bag = np.zeros(len(all_words))
    for idx,w in enumerate(all_words):
        if w in stemmed:
            bag[idx] = 1.0
    return bag

In [7]:
# Quick check: a word that appears twice in the vocabulary is marked at both positions.
bag_of_words(['hi'],['hi','hey','hi'])

array([1., 0., 1.])

In [8]:
# Load the CSV from the working directory and keep only its first 5005 rows.
data = pd.read_csv('./data.csv')[:5005]

In [9]:
# Drop every row that contains a missing value (modifies `data` in place).
data.dropna(inplace=True)

In [10]:
# Display the rows in random order.
# NOTE(review): the result is not assigned, so `data` itself keeps its original order.
data.sample(frac=1)

      Unnamed: 0           product  \
2358        2358  credit_reporting   
726          726    retail_banking   
1779        1779  credit_reporting   
92            92       credit_card   
1467        1467    retail_banking   
...          ...               ...   
4085        4085  credit_reporting   
426          426  credit_reporting   
4073        4073  credit_reporting   
1941        1941  credit_reporting   
2909        2909  credit_reporting   

                                              narrative  
2358  back lost wallet personal information inside t...  
726   transferred held fund two week utilized float ...  
1779  delinquent debt paid since credit bureau updat...  
92    regarding assertion informed record promotion ...  
1467  chase transaction erroneously sent amount expl...  
...                                                 ...  
4085  careful review credit report identified inaccu...  
426   except otherwise provided section consumer rep...  
4073  careful review 

In [11]:
# Remove, in place, the rows whose narrative is exactly the string 'name'.
data.drop(data.index[data['narrative'] == 'name'],inplace=True)

In [12]:
# Input text: the `narrative` column.
X = data['narrative']

In [13]:
# Target: the `product` column.
y = data['product']

In [14]:
# Accumulators filled by the cells that follow:
#   words  - stemmed tokens collected from every narrative
#   data   - [tokens, one-hot label] pairs (rebinds the name that held the DataFrame)
#   labels - product name -> integer id
#   idx    - most recently assigned id
words, data = [], []
labels = dict()
idx = 0

In [15]:
# Give every distinct product the next integer id, in order of first appearance.
# The counter is incremented before it is stored, so ids start at 1.
for X_batch,y_batch in zip(X,y):
    if y_batch in labels:
        continue
    idx += 1
    labels[y_batch] = idx

In [16]:
# Inspect the product -> id mapping built above.
labels

{'credit_card': 1,
 'retail_banking': 2,
 'credit_reporting': 3,
 'mortgages_and_loans': 4,
 'debt_collection': 5}

In [17]:
# Column of each product in the one-hot target, taken from its position in
# `labels` (0 .. len(labels)-1) rather than from the stored id. The stored ids
# start at 1, so the previous np.eye(id + 1, len(labels))[id] lookup never used
# column 0 and produced an all-zero target for the product with the highest id.
label_positions = {name: pos for pos, name in enumerate(labels)}
one_hot_rows = np.eye(len(labels))

for X_batch,y_batch in tqdm(zip(X,y),total=len(X)):
    # Keep the raw tokens for bag_of_words (which stems them itself) and collect
    # the stemmed forms to build the vocabulary.
    X_batch = tokenize(X_batch)
    words.extend(stem(Xb) for Xb in X_batch)
    # .copy() so every sample owns its target instead of a view into one matrix.
    data.append([X_batch,one_hot_rows[label_positions[y_batch]].copy()])

In [18]:
# Deduplicate the collected stems and sort them so the vocabulary order is stable.
words = sorted(set(words))

In [19]:
# Shuffle the samples in place.
# NOTE(review): no random seed is set in the visible cells, so the order (and the
# unshuffled train/test split taken from it later) will differ between runs.
np.random.shuffle(data)

In [20]:
# Fresh containers for the model inputs and targets; this rebinds X and y, which
# until now held the narrative and product columns.
X, y = [], []

In [21]:
# Vectorise every sample: X collects the bag-of-words vector of each token list,
# y collects the target that was stored next to it in `data`.
for sentence,tag in tqdm(data):
    X.append(bag_of_words(sentence,words))
    y.append(tag)

In [22]:
from sklearn.model_selection import *

In [23]:
# Hold out 25% of the samples for testing. shuffle=False keeps the current order,
# so the split relies on the earlier in-place shuffle of `data`.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,shuffle=False)

In [24]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing on the first .to(device).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [25]:
# Convert each split from a list of NumPy vectors to a float32 tensor on `device`.
X_train, y_train, X_test, y_test = (
    torch.from_numpy(np.array(split)).to(device).float()
    for split in (X_train, y_train, X_test, y_test)
)

In [26]:
def get_loss(model,X,y,criterion):
    """Return the loss of ``model`` on ``(X, y)`` as a Python float.

    Args:
        model: callable that maps ``X`` to predictions.
        X: model inputs.
        y: targets, passed to ``criterion`` together with the predictions.
        criterion: callable ``criterion(preds, y)`` returning a scalar tensor.

    Returns:
        ``criterion(model(X), y).item()``.
    """
    # Evaluation only: disable autograd so scoring a whole split does not build
    # and hold on to a computation graph.
    with torch.no_grad():
        preds = model(X)
        loss = criterion(preds,y)
    return loss.item()

In [27]:
def get_accuracy(model,X,y):
    """Return the percentage of samples whose predicted class matches the target.

    The predicted class of a sample is the argmax of its model output; the true
    class is the argmax of its row in ``y``.

    Args:
        model: callable that maps ``X`` to a tensor with one row per sample.
        X: model inputs.
        y: tensor of targets with one row per sample (e.g. one-hot rows).

    Returns:
        ``round(correct / total, 3) * 100`` as a float, or ``0.0`` when there
        are no samples (previously this raised ``ZeroDivisionError``).
    """
    # Evaluation only: no computation graph is needed.
    with torch.no_grad():
        preds = model(X)
    # Mirror zip(): only compare as many rows as both tensors provide.
    total = min(len(preds), len(y))
    if total == 0:
        return 0.0
    # One batched argmax per tensor instead of a Python loop that converted
    # every sample to an int separately.
    pred_classes = preds[:total].reshape(total, -1).argmax(dim=1)
    true_classes = y[:total].reshape(total, -1).argmax(dim=1)
    correct = int((pred_classes == true_classes).sum())
    acc = round(correct/total,3)*100
    return acc

In [28]:
class Model(Module):
    """Feed-forward classifier with one hidden layer that is applied repeatedly.

    The input is projected to ``hidden_size`` features by ``linear1``; the single
    shared ``linear2`` layer followed by ReLU is then applied ``iters`` times, and
    ``output`` maps the result to one score per class.
    """

    def __init__(self,input_size=None,num_classes=None,hidden_size=512,iters=12):
        """Build the layers.

        Args:
            input_size: size of each input vector; defaults to ``len(words)``.
            num_classes: number of output scores; defaults to ``len(labels)``.
            hidden_size: width of the hidden representation.
            iters: how many times the shared hidden layer is applied.
        """
        super().__init__()
        if input_size is None:
            input_size = len(words)
        if num_classes is None:
            num_classes = len(labels)
        self.iters = iters
        self.activation = ReLU()
        self.linear1 = Linear(input_size,hidden_size)
        self.linear2 = Linear(hidden_size,hidden_size)
        self.output = Linear(hidden_size,num_classes)
    
    def forward(self,X):
        """Return the class scores for ``X`` (last dimension: ``num_classes``)."""
        preds = self.linear1(X)
        for _ in range(self.iters):
            preds = self.activation(self.linear2(preds))
        # Fixed: the result used to be bound to a misspelled name (`perds`), so
        # the hidden activations were returned instead of the class scores.
        preds = self.output(preds)
        return preds

In [29]:
# Instantiate the network and move its parameters to the target device.
model = Model().to(device)

In [30]:
# Mean-squared error between the model outputs and the one-hot targets.
# NOTE(review): a cross-entropy loss is the more usual choice for single-label
# classification; confirm MSE is intentional.
criterion = MSELoss()

In [31]:
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = Adam(model.parameters(),lr=0.001)

In [32]:
# Number of samples per optimisation step.
batch_size = 8

In [33]:
# Number of full passes over the training split.
epochs = 100

In [34]:
import wandb

In [35]:
# Weights & Biases project that the runs are logged to.
PROJECT_NAME = 'Consumer-Complaints-NLP'

In [36]:
# Train `model` with mini-batch gradient descent and log metrics to Weights &
# Biases once per epoch.
torch.cuda.empty_cache()
wandb.init(project=PROJECT_NAME,name='baseline')
for _ in tqdm(range(epochs)):
    # The allocator cache is no longer emptied around every batch: doing so does
    # not give PyTorch more usable memory and only forces re-allocation.
    for i in range(0,len(X_train),batch_size):
        X_batch = X_train[i:i+batch_size].to(device).float()
        y_batch = y_train[i:i+batch_size].to(device).float()
        preds = model(X_batch)
        loss = criterion(preds,y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    model.eval()
    # Metrics need no gradients; no_grad keeps the full-split forward passes
    # from building a computation graph.
    with torch.no_grad():
        # 'Loss' and 'Acc' average the full training split with the last
        # mini-batch of the epoch. Fixed: the '/2' used to bind only to the
        # second loss term, so 'Loss' was not an average like 'Acc'.
        # All four metrics are sent in a single call so that they share one
        # wandb step per epoch.
        wandb.log({
            'Loss':(get_loss(model,X_train,y_train,criterion)+get_loss(model,X_batch,y_batch,criterion))/2,
            'Val Loss':get_loss(model,X_test,y_test,criterion),
            'Acc':(get_accuracy(model,X_train,y_train)+get_accuracy(model,X_batch,y_batch))/2,
            'Val Acc':get_accuracy(model,X_test,y_test),
        })
    model.train()
wandb.finish()
torch.cuda.empty_cache()

In [37]:
class Model(Module):
    """Feed-forward classifier whose repeated hidden layer uses batch normalisation.

    The input is projected to ``hidden_size`` features by ``linear1``; the shared
    ``linear2`` -> ``bn`` -> ReLU stack is then applied ``iters`` times, and
    ``output`` maps the result to one score per class.

    NOTE: this rebinds the name ``Model``; any earlier class of that name is
    shadowed from here on.
    """

    def __init__(self,input_size=None,num_classes=None,hidden_size=512,iters=10):
        """Build the layers.

        Args:
            input_size: size of each input vector; defaults to ``len(words)``.
            num_classes: number of output scores; defaults to ``len(labels)``.
            hidden_size: width of the hidden representation.
            iters: how many times the shared hidden stack is applied.
        """
        super().__init__()
        if input_size is None:
            input_size = len(words)
        if num_classes is None:
            num_classes = len(labels)
        self.activation = ReLU()
        self.iters = iters
        self.linear1 = Linear(input_size,hidden_size)
        self.linear2 = Linear(hidden_size,hidden_size)
        # One BatchNorm1d instance is reused on every pass through the loop.
        self.bn = BatchNorm1d(hidden_size)
        self.output = Linear(hidden_size,num_classes)
    
    def forward(self,X):
        """Return the class scores for ``X`` (last dimension: ``num_classes``)."""
        preds = self.linear1(X)
        for _ in range(self.iters):
            preds = self.activation(self.bn(self.linear2(preds)))
        preds = self.output(preds)
        return preds

In [38]:
# Instantiate the most recently defined Model and move it to the target device.
model = Model().to(device)

In [39]:
# Mean-squared error between the model outputs and the targets.
criterion = MSELoss()

In [40]:
# New Adam optimizer bound to the parameters of the model created just above.
optimizer = Adam(model.parameters(),lr=0.001)

In [41]:
# Number of samples per optimisation step.
batch_size = 8

In [42]:
# Number of full passes over the training split.
epochs = 100

In [43]:
import wandb

In [44]:
# Weights & Biases project that the runs are logged to.
PROJECT_NAME = 'Consumer-Complaints-NLP'

In [45]:
# Train the current `model` with mini-batch gradient descent and log metrics to
# Weights & Biases once per epoch.
# NOTE(review): the run name is still 'baseline' although a new model is being
# trained here; confirm whether a distinct run name was intended.
# NOTE(review): if the model normalises over the batch (BatchNorm1d), a final
# batch holding a single sample fails in training mode; that happens when
# len(X_train) % batch_size == 1. Verify against the actual split size.
torch.cuda.empty_cache()
wandb.init(project=PROJECT_NAME,name='baseline')
for _ in tqdm(range(epochs)):
    # The allocator cache is no longer emptied around every batch: doing so does
    # not give PyTorch more usable memory and only forces re-allocation.
    for i in range(0,len(X_train),batch_size):
        X_batch = X_train[i:i+batch_size].to(device).float()
        y_batch = y_train[i:i+batch_size].to(device).float()
        preds = model(X_batch)
        loss = criterion(preds,y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    model.eval()
    # Metrics need no gradients; no_grad keeps the full-split forward passes
    # from building a computation graph.
    with torch.no_grad():
        # 'Loss' and 'Acc' average the full training split with the last
        # mini-batch of the epoch. Fixed: the '/2' used to bind only to the
        # second loss term, so 'Loss' was not an average like 'Acc'.
        # All four metrics are sent in a single call so that they share one
        # wandb step per epoch.
        wandb.log({
            'Loss':(get_loss(model,X_train,y_train,criterion)+get_loss(model,X_batch,y_batch,criterion))/2,
            'Val Loss':get_loss(model,X_test,y_test,criterion),
            'Acc':(get_accuracy(model,X_train,y_train)+get_accuracy(model,X_batch,y_batch))/2,
            'Val Acc':get_accuracy(model,X_test,y_test),
        })
    model.train()
wandb.finish()
torch.cuda.empty_cache()