In [21]:
from collections import defaultdict
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data
import torchtext 
import random
import re

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if __name__=='__main__':
    print('Using device:', device)

Using device: cuda


### Import the Data

*   To access the list of textual tokens for the *i*th example, use `train_data[i][1]`
*   To access the label for the *i*th example, use `train_data[i][0]`


In [23]:
import io
spath = "/content/drive/MyDrive/new_emails.csv"
train_data = pd.read_csv(spath)

In [24]:
train_data.head(2)

Unnamed: 0,sentiment,text
0,neg,XXXX has claimed I owe them {$27.00} for XXXX ...
1,neu,Due to inconsistencies in the amount owed that...


In [25]:
def preprocess(review):
    '''
    Simple preprocessing function.
    '''
    res = []

    review = re.sub('\W+',' ', review)
    res = review.split(' ')
    res = res[:-1]

    return res

In [26]:
train_data, test_data = np.split(train_data.sample(frac=1, random_state=42),[int(.7*len(train_data))])

In [27]:
test_data = test_data.values.tolist()
train_data = train_data.values.tolist()

train_data = [(x[0], preprocess(x[1])) for x in train_data]
test_data = [(x[0], preprocess(x[1])) for x in test_data]


In [28]:
for x in random.sample(train_data, 2):
    print('Sample text:', x[1])
    print('Sample label:', x[0], '\n')

Sample text: ['We', 'have', 'been', 'Bank', 'of', 'America', 'customers', 'for', 'well', 'over', '20', 'years', 'and', 'on', 'XXXX', 'XXXX', '2016', 'my', 'entire', 'family', 'received', 'letters', 'individually', 'that', 'each', 'of', 'our', 'accounts', 'was', 'going', 'to', 'be', 'closed', 'in', '30', 'days', 'with', 'no', 'explanation', 'It', 'advised', 'our', 'account', 'would', 'be', 'restricted', 'in', '21', 'days', 'not', 'stating', 'what', 'those', 'restrictions', 'would', 'be', 'and', 'we', 'had', '30', 'days', 'from', 'the', 'date', 'of', 'the', 'letter', 'which', 'was', 'XXXX', 'XXXX', '2016', 'to', 'make', 'other', 'banking', 'arrangements', 'We', 'have', 'never', 'had', 'any', 'inappropriate', 'banking', 'practices', 'and', 'have', 'had', 'several', 'accounts', 'throughout', 'the', 'years', 'with', 'no', 'issues', 'So', 'this', 'comes', 'as', 'a', 'complete', 'shock', 'to', 'say', 'the', 'least', 'My', 'husband', 'and', 'I', 'have', 'had', 'our', 'checking', 'account', 'fo

### Data Loader

In [29]:
PAD = '<PAD>'
END = '<END>'
UNK = '<UNK>'

class TextDataset(data.Dataset):
    def __init__(self, examples, split, threshold, max_len, idx2word=None, word2idx=None):

        self.examples = examples
        assert split in {'train', 'val', 'test'}
        self.split = split
        self.threshold = threshold
        self.max_len = max_len

        # Dictionaries
        self.idx2word = idx2word
        self.word2idx = word2idx
        if split == 'train':
            self.build_dictionary()
        self.vocab_size = len(self.word2idx)
        
        # Convert text to indices
        self.textual_ids = []
        self.convert_text()

    
    def build_dictionary(self): 
        '''
        Build the dictionaries idx2word and word2idx. This is only called when split='train'
        Returns nothing.
        '''
        assert self.split == 'train'
        
        self.idx2word = {0:PAD, 1:END, 2: UNK}
        self.word2idx = {PAD:0, END:1, UNK: 2}

        pre_dict = {PAD:0, END:1, UNK: 2}

        for sentence in self.examples:
          for word in sentence[1]:
            word = word.lower()
            if word not in self.word2idx:
              self.word2idx[word] = self.word2idx.get(word, 0) + 1
            else:
              self.word2idx[word] += 1

        for (k,v) in self.word2idx.items():
          if v >= self.threshold:
            pre_dict[k] = v
          

        self.word2idx = pre_dict

        list_keys = [* self.word2idx.keys()]
        for item in list_keys:
          self.idx2word[list_keys.index(item)]= item
          self.word2idx[item] = list_keys.index(item)

        pass
    
    def convert_text(self):
        '''
        Converts each email in the dataset to a list of indices, given by self.word2idx.
        Store this in self.textual_ids; returns nothing.
        '''

        for sentence in self.examples:
         temp_l = []
         for word in sentence[1]:
           word = word.lower()
           if word not in self.word2idx:
              word = UNK
           index = self.word2idx.get(word)
           temp_l.append(index)
         word = END
         index = self.word2idx.get(word)
         temp_l.append(index)
         self.textual_ids.append(temp_l)

        pass

    def get_text(self, idx):
        '''
        Returns the email at idx as a long tensor (torch.LongTensor) of integers corresponding to the words in the email.
        Padded as necessary (see above).
        '''
        ##### TODO #####
        review = self.textual_ids[idx]
        if len(review) < self.max_len:
          padindex = list(self.word2idx.keys()).index(PAD)
          review += [padindex] * (self.max_len - len(review))

        review = review[:self.max_len]
        return torch.tensor(review, dtype=torch.long, device = None)
        #return review
    
    def get_label(self, idx):
        '''
        This function returns 0 for 'negative' sentiment, 1 if the sentiment for idx in the dataset is 'neutral', 
        and 2 if it is 'positive'. The return type is torch.LongTensor.
        '''
        result = self.examples[idx][0]
        retVal = 1
        if result == 'neg':
          retVal = 0
        elif result == 'pos':
          retVal = 2
        else: retVal = 1

        return torch.as_tensor(retVal, dtype=torch.long, device = None)
        

    def __len__(self):
        '''
        Returns the number of emails (int value) in the dataset
        '''
        return int(len(self.examples))
    
    def __getitem__(self, idx):
        '''
        Returns the email, and label/sentiment of the email specified by idx.
        '''
        rtext = self.get_text(idx)
        rlab = self.get_label(idx)

        return rtext, rlab

if __name__=='__main__':
    # Sample item
    Ds = TextDataset(train_data, 'train', threshold=10, max_len=150)
    print('Vocab size:', Ds.vocab_size)

    text, label = Ds[random.randint(0, len(Ds))]
    print('Example text:', text)
    print('Example label:', label)

Vocab size: 10091
Example text: tensor([ 285, 2834,   20,  283,   15,  285,   45,   85,  101,   27,  250,  289,
         879,    3,   81,   15,  603,  311,    7,   15,  380,    3,   81,   15,
         220,  919,    7,   49,  380,  721,    3,   51, 2591,   49,  782,    3,
          81,  123,  876,   15,  311,   19,    3,  857,    3,   35,   21,  736,
           3,   36,  159,    3, 9318,   19,    3,   68,  881,  767,  182, 4295,
         100,   49,  380, 2310,  195,   44,  859,  187, 1318,   21,   49,  380,
         182,  285, 2834,  299,    5,  112,   33,  623,   33,   86,  231, 6823,
          56,   49,  793,  311,  153,    3, 5383,  159,   49,  380,  134,   49,
         311,   33,  285, 2834,  198,  534,    3,   51,  139,   75,  103,   63,
        2033, 1856,  302,  217,  285, 2834,  134,   86,   15,  135, 2915,   49,
         220,   54,  370,    3,  265,    3,  130,   75,    3,   51, 2591,   49,
         782,   10,   19,    3,  215,   75,   33,  955,   86,   15,  956,   56,
        

### Train a Convolutional Neural Network (CNN)

In [31]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embed_size, out_channels, filter_heights, stride, dropout, num_classes, pad_idx):
        super(CNN, self).__init__()
        
        ##### TODO #####
        # Creates an embedding layer (https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html)
        #   to represent the words in your vocabulary.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx= pad_idx)         

        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = out_channels,
                                              stride = stride, 
                                              kernel_size = (fs, embed_size)) 
                                    for fs in filter_heights
                                    ])         

        # Creates a dropout layer (nn.Dropout) using dropout
        self.dropout = nn.Dropout(dropout)        

        # Defines a linear layer (nn.Linear) that consists of num_classes units 
        self.fc = nn.Linear(len(filter_heights) * out_channels, num_classes)        

    def forward(self, texts):
      """
      texts: LongTensor [batch_size, max_len]
          
      Returns output: Tensor [batch_size, num_classes]
      """
      embedded = self.embedding(texts)
      embedded = embedded.unsqueeze(1)
      conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
      pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
      cat = self.dropout(torch.cat(pooled, dim = 1))

      return self.fc(cat)

### Train CNN Model

In [32]:
if __name__=='__main__':
    THRESHOLD = 5 
    MAX_LEN = 100 
    BATCH_SIZE = 32 

    train_Ds = TextDataset(train_data, 'train', THRESHOLD, MAX_LEN)
    train_loader = torch.utils.data.DataLoader(train_Ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, drop_last=True)

    test_Ds = TextDataset(test_data, 'test', THRESHOLD, MAX_LEN, train_Ds.idx2word, train_Ds.word2idx)
    test_loader = torch.utils.data.DataLoader(test_Ds, batch_size=1, shuffle=False, num_workers=1, drop_last=False)


In [33]:
from tqdm.notebook import tqdm

In [34]:
def train_model(model, num_epochs, data_loader, optimizer, criterion):
    print('Training Model...')
    model.train()
    for epoch in tqdm(range(num_epochs)):
        epoch_loss = 0
        epoch_acc = 0
        for texts, labels in data_loader:
            texts = texts.to(device) # shape: [batch_size, MAX_LEN]
            labels = labels.to(device) # shape: [batch_size]

            optimizer.zero_grad()

            output = model(texts)
            acc = accuracy(output, labels)
            
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        print('[TRAIN]\t Epoch: {:2d}\t Loss: {:.4f}\t Train Accuracy: {:.2f}%'.format(epoch+1, epoch_loss/len(data_loader), 100*epoch_acc/len(data_loader)))
    print('Model Trained!\n')

In [35]:
def count_parameters(model):
    """
    Count number of trainable parameters in the model
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def accuracy(output, labels):
    """
    Returns accuracy per batch
    output: Tensor [batch_size, n_classes]
    labels: LongTensor [batch_size]
    """
    preds = output.argmax(dim=1) # find predicted class
    correct = (preds == labels).sum().float() # convert into float for division 
    acc = correct / len(labels)

    return acc

In [36]:
if __name__=='__main__':
    cnn_model = CNN(vocab_size = train_Ds.vocab_size, # Don't change this
                embed_size = 128, 
                out_channels = 64, 
                filter_heights = [2, 3, 4], 
                stride = 1, 
                dropout = 0.5, 
                num_classes = 3, # neg,neu,pos
                pad_idx = train_Ds.word2idx[PAD]) 

    # Put your model on the device (cuda or cpu)
    cnn_model = cnn_model.to(device)
    
    print('The model has {:,d} trainable parameters'.format(count_parameters(cnn_model)))


The model has 1,841,027 trainable parameters


In [37]:
if __name__=='__main__':    
    LEARNING_RATE = 5e-4 

    # Define the loss function
    criterion = nn.CrossEntropyLoss().to(device)

    # Define the optimizer
    optimizer = optim.Adam(cnn_model.parameters(), lr=LEARNING_RATE)

In [38]:
if __name__=='__main__':    
    N_EPOCHS = 25 
    
    # train model for N_EPOCHS epochs
    train_model(cnn_model, N_EPOCHS, train_loader, optimizer, criterion)


Training Model...


  0%|          | 0/25 [00:00<?, ?it/s]

[TRAIN]	 Epoch:  1	 Loss: 0.7675	 Train Accuracy: 66.51%
[TRAIN]	 Epoch:  2	 Loss: 0.6798	 Train Accuracy: 70.93%
[TRAIN]	 Epoch:  3	 Loss: 0.6326	 Train Accuracy: 73.18%
[TRAIN]	 Epoch:  4	 Loss: 0.5979	 Train Accuracy: 74.81%
[TRAIN]	 Epoch:  5	 Loss: 0.5660	 Train Accuracy: 76.34%
[TRAIN]	 Epoch:  6	 Loss: 0.5405	 Train Accuracy: 77.53%
[TRAIN]	 Epoch:  7	 Loss: 0.5086	 Train Accuracy: 79.04%
[TRAIN]	 Epoch:  8	 Loss: 0.4825	 Train Accuracy: 80.38%
[TRAIN]	 Epoch:  9	 Loss: 0.4569	 Train Accuracy: 81.61%
[TRAIN]	 Epoch: 10	 Loss: 0.4324	 Train Accuracy: 82.75%
[TRAIN]	 Epoch: 11	 Loss: 0.4107	 Train Accuracy: 83.86%
[TRAIN]	 Epoch: 12	 Loss: 0.3853	 Train Accuracy: 84.91%
[TRAIN]	 Epoch: 13	 Loss: 0.3665	 Train Accuracy: 85.61%
[TRAIN]	 Epoch: 14	 Loss: 0.3460	 Train Accuracy: 86.63%
[TRAIN]	 Epoch: 15	 Loss: 0.3256	 Train Accuracy: 87.46%
[TRAIN]	 Epoch: 16	 Loss: 0.3087	 Train Accuracy: 88.12%
[TRAIN]	 Epoch: 17	 Loss: 0.2925	 Train Accuracy: 88.69%
[TRAIN]	 Epoch: 18	 Loss: 0.279

### Evaluate CNN Model

In [39]:
import random

def evaluate(model, data_loader, criterion):
    print('Evaluating performance on the test dataset...')
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    all_predictions = []
    print("\nSOME PREDICTIONS FROM THE MODEL:")
    for texts, labels in tqdm(data_loader):
        texts = texts.to(device)
        labels = labels.to(device)
        
        output = model(texts)
        acc = accuracy(output, labels)
        pred = output.argmax(dim=1)
        all_predictions.append(pred)
        
        loss = criterion(output, labels)
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

        if random.random() < 0.0015:
            print("Input: "+' '.join([data_loader.dataset.idx2word[idx] for idx in texts[0].tolist() if idx not in {data_loader.dataset.word2idx[PAD], data_loader.dataset.word2idx[END]}]))
            print("Prediction:", pred.item(), '\tCorrect Output:', labels.item(), '\n')

    full_acc = 100*epoch_acc/len(data_loader)
    full_loss = epoch_loss/len(data_loader)
    print('[TEST]\t Loss: {:.4f}\t Accuracy: {:.2f}%'.format(full_loss, full_acc))
    predictions = torch.cat(all_predictions)
    return predictions, full_acc, full_loss

if __name__=='__main__':
    evaluate(cnn_model, test_loader, criterion) # Compute test data accuracy


Evaluating performance on the test dataset...

SOME PREDICTIONS FROM THE MODEL:


  0%|          | 0/20042 [00:00<?, ?it/s]

Input: several months ago i made a purchase at a xxxx sears store at that time i changed my billing address giving the store my new address i thought the bill came in and was paid months ago yesterday i received xxxx sears bills in my mail box they were placed there by the person that purchased our previous home all with late charges today i called the xxxx number and spoke to xxxx citi bank reps i paid the original purchase however they will not waive the remainder late fee xxxx of the xxxx reps disconnected our phone conversation i
Prediction: 1 	Correct Output: 1 

Input: i have an account with synchrony bank banana republic the password to access my account does not work instead every time i log in i have to resubmit all my information account ssn security questions etc to take any action with my account this has been going on for months and they have failed to resolve it i suspect this error is a ploy to make it more difficult to pay my card so that they can get interest and fees 

### Export Model

In [40]:
if __name__=='__main__':
    from google.colab import drive
    drive.mount('/content/drive')
    print()

    try:
        cnn_model is None
        cnn_exists = True
    except:
        cnn_exists = False

   
    if cnn_exists:
        print("Saving CNN model....") 
        torch.save(cnn_model, "drive/My Drive/cnn.pt")
    
    print("Done!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Saving CNN model....
Done!
