In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data
import torchtext 
import random
import re

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if __name__=='__main__':
    print('Using device:', device)

Using device: cpu


### Import the Data

*   To access the list of textual tokens for the *i*th example, use `train_data[i][1]`
*   To access the label for the *i*th example, use `train_data[i][0]`


In [6]:
import io
spath = "/content/drive/MyDrive/new_emails.csv"
train_data = pd.read_csv(spath)

In [None]:
#train_data = pd.read_csv("./Cleaned_email/new_emails.csv")

In [7]:
train_data.head(2)

Unnamed: 0,sentiment,text
0,-1,XXXX has claimed I owe them {$27.00} for XXXX ...
1,0,Due to inconsistencies in the amount owed that...


In [8]:
def preprocess(review):
    '''
    Simple preprocessing function.
    '''
    res = []

    review = re.sub('\W+',' ', review)
    res = review.split(' ')
    res = res[:-1]

    return res

In [9]:
train_data, test_data = np.split(train_data.sample(frac=1, random_state=42),[int(.7*len(train_data))])

In [10]:
test_data = test_data.values.tolist()
train_data = train_data.values.tolist()

train_data = [(x[0], preprocess(x[1])) for x in train_data]
test_data = [(x[0], preprocess(x[1])) for x in test_data]

# msg = preprocess(train_data[0])
# print(msg)
#len(train_data)

In [11]:
for x in random.sample(train_data, 2):
    print('Sample text:', x[1])
    print('Sample label:', x[0], '\n')

Sample text: ['The', 'doctors', 'office', 'set', 'up', 'a', 'payment', 'plan', 'They', 'fixed', 'the', 'billing', 'error', 'But', 'trying', 'to', 'reach', 'the', 'debt', 'collector', 'to', 'verify', 'this', 'collection', 'notice', 'seems', 'impossible', 'They', 'will', 'not', 'answer', 'the', 'phone', 'and', 'to', 'find', 'out', 'if', 'they', 'closed', 'the', 'collection', 'on', 'their', 'end', 'to', 'prevent', 'a', 'judgement', 'The', 'automated', 'attendant', 'answers', 'then', 'tells', 'you', 'to', 'call', 'back', 'later', 'I', 'have', 'called', 'multiple', 'times']
Sample label: 0 

Sample text: ['In', 'XXXX', 'of', '2015', 'I', 'applied', 'for', 'a', 'mortgage', 'refinance', 'loan', 'with', 'Pentagon', 'Federal', 'Credit', 'Union', 'the', 'same', 'way', 'as', 'which', 'I', 'had', 'done', 'with', 'the', 'same', 'company', 'for', 'the', 'same', 'loan', 'account', 'in', 'the', 'past', 'Within', 'a', 'couple', 'days', 'I', 'was', 'contacted', 'by', 'a', 'representative', 'who', 'gathe

### Data Loader

In [24]:
PAD = '<PAD>'
END = '<END>'
UNK = '<UNK>'

class TextDataset(data.Dataset):
    def __init__(self, examples, split, threshold, max_len, idx2word=None, word2idx=None):

        self.examples = examples
        assert split in {'train', 'val', 'test'}
        self.split = split
        self.threshold = threshold
        self.max_len = max_len

        # Dictionaries
        self.idx2word = idx2word
        self.word2idx = word2idx
        if split == 'train':
            self.build_dictionary()
        self.vocab_size = len(self.word2idx)
        
        # Convert text to indices
        self.textual_ids = []
        self.convert_text()

    
    def build_dictionary(self): 
        '''
        Build the dictionaries idx2word and word2idx. This is only called when split='train'
        Returns nothing.
        '''
        assert self.split == 'train'
        
        self.idx2word = {0:PAD, 1:END, 2: UNK}
        self.word2idx = {PAD:0, END:1, UNK: 2}

        pre_dict = {PAD:0, END:1, UNK: 2}

        for sentence in self.examples:
          for word in sentence[1]:
            word = word.lower()
            if word not in self.word2idx:
              self.word2idx[word] = self.word2idx.get(word, 0) + 1
            else:
              self.word2idx[word] += 1

        for (k,v) in self.word2idx.items():
          if v >= self.threshold:
            pre_dict[k] = v
          

        self.word2idx = pre_dict

        list_keys = [* self.word2idx.keys()]
        for item in list_keys:
          self.idx2word[list_keys.index(item)]= item
          self.word2idx[item] = list_keys.index(item)


        #print(list(self.word2idx.items())[:10])
        #print(list(self.idx2word.items())[:10])

        pass
    
    def convert_text(self):
        '''
        Convert each email in the dataset to a list of indices, given by self.word2idx.
        Store this in self.textual_ids; returns nothing.
        '''

        for sentence in self.examples:
         temp_l = []
         for word in sentence[1]:
           word = word.lower()
           if word not in self.word2idx:
              word = UNK
           index = self.word2idx.get(word)
           temp_l.append(index)
         word = END
         index = self.word2idx.get(word)
         temp_l.append(index)
         self.textual_ids.append(temp_l)

        pass

    def get_text(self, idx):
        '''
        Return the email at idx as a long tensor (torch.LongTensor) of integers corresponding to the words in the email.
        May need to pad.
        '''
        review = self.textual_ids[idx]
        if len(review) < self.max_len:
          padindex = list(self.word2idx.keys()).index(PAD)
          review += [padindex] * (self.max_len - len(review))

        review = review[:self.max_len]
        return torch.tensor(review, dtype=torch.long, device = None)
        #return review
    
    def get_label(self, idx):
        '''
        This function should return the value 1 if the label/sentiment for idx in the dataset is 'positive', 
        and 0 if it is 'negative'. The return type should be torch.LongTensor.
        '''
        result = self.examples[idx][0]
        retVal = 0
        if result == 'pos':
          retVal = 1

        return torch.as_tensor(retVal, dtype=torch.long, device = None)
        #return torch.as_tensor(result, dtype=torch.long, device = None)

    def __len__(self):
        '''
        Return the number of emails (int value) in the dataset
        '''
        return int(len(self.examples))
    
    def __getitem__(self, idx):
        '''
        Return the email, and label/sentiment of the email specified by idx.
        '''
        rtext = self.get_text(idx)
        rlab = self.get_label(idx)

        return rtext, rlab

if __name__=='__main__':
    # Sample item
    Ds = TextDataset(train_data, 'train', threshold=10, max_len=150)
    print('Vocab size:', Ds.vocab_size)

    text, label = Ds[random.randint(0, len(Ds))]
    print('Example text:', text)
    print('Example label:', label)

Vocab size: 10091
Example text: tensor([   3,   40,  379,   49, 4611,   56,   49,  533,   56, 2065,   54,   27,
          33,  973,  881, 1047, 1672,  434,   15, 1039, 3113,   56, 3221,  141,
           3, 5469,   44,  822,  260,   13,   49, 1982,   45,   26,  272,    3,
         137,   49, 3113,   33,  229, 1239,  384,   49, 3223,   51,   54,  356,
        1239,   26,   15, 1428,   45,   41,   15,  521,   56,  159,  892,   27,
         126,   33,  640,  343,  788,    7, 2498,  935,    1,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        

### Train a Convolutional Neural Network (CNN)

In [25]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embed_size, out_channels, filter_heights, stride, dropout, num_classes, pad_idx):
        super(CNN, self).__init__()
        
        ##### TODO #####
        # Create an embedding layer to represent the words in vocabulary. 
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx= pad_idx)         

        # Define multiple Convolution layers (nn.Conv2d)  
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = out_channels,
                                              stride = stride, 
                                              kernel_size = (fs, embed_size)) 
                                    for fs in filter_heights
                                    ])         

        # Create a dropout layer (nn.Dropout) using dropout
        self.dropout = nn.Dropout(dropout)        

        # Define a linear layer (nn.Linear) that consists of num_classes units 
        #   and takes as input the concatenated output for all cnn layers (out_channels * num_of_cnn_layers units)
        self.fc = nn.Linear(len(filter_heights) * out_channels, num_classes)        


    def forward(self, texts):
        """
        texts: LongTensor [batch_size, max_len]
        
        Returns output: Tensor [batch_size, num_classes]
        """
        # Pass texts through your embedding layer to convert from word ids to word embeddings
        embedded_texts = self.embedding(texts)        

        # Input to conv should have 1 channel
        embedded_texts = embedded_texts.unsqueeze(1)        
        
        conved = [F.relu(conv(embedded_texts)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]        

        # Apply dropout
        cat = self.dropout(torch.cat(pooled, dim = 1))        

        return self.fc(cat)

### Train CNN Model

In [26]:
if __name__=='__main__':
    THRESHOLD = 5 
    MAX_LEN = 100 
    BATCH_SIZE = 32 

    train_Ds = TextDataset(train_data, 'train', THRESHOLD, MAX_LEN)
    train_loader = torch.utils.data.DataLoader(train_Ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, drop_last=True)

    test_Ds = TextDataset(test_data, 'test', THRESHOLD, MAX_LEN, train_Ds.idx2word, train_Ds.word2idx)
    test_loader = torch.utils.data.DataLoader(test_Ds, batch_size=1, shuffle=False, num_workers=1, drop_last=False)


In [27]:
from tqdm.notebook import tqdm

In [28]:
def train_model(model, num_epochs, data_loader, optimizer, criterion):
    print('Training Model...')
    model.train()
    for epoch in tqdm(range(num_epochs)):
        epoch_loss = 0
        epoch_acc = 0
        for texts, labels in data_loader:
            texts = texts.to(device) # shape: [batch_size, MAX_LEN]
            labels = labels.to(device) # shape: [batch_size]

            optimizer.zero_grad()

            output = model(texts)
            acc = accuracy(output, labels)
            
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        print('[TRAIN]\t Epoch: {:2d}\t Loss: {:.4f}\t Train Accuracy: {:.2f}%'.format(epoch+1, epoch_loss/len(data_loader), 100*epoch_acc/len(data_loader)))
    print('Model Trained!\n')

In [29]:
def count_parameters(model):
    """
    Count number of trainable parameters in the model
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def accuracy(output, labels):
    """
    Returns accuracy per batch
    output: Tensor [batch_size, n_classes]
    labels: LongTensor [batch_size]
    """
    preds = output.argmax(dim=1) # find predicted class
    correct = (preds == labels).sum().float() # convert into float for division 
    acc = correct / len(labels)
    return acc

In [30]:
if __name__=='__main__':
    cnn_model = CNN(vocab_size = train_Ds.vocab_size, 
                embed_size = 128, 
                out_channels = 64, 
                filter_heights = [2, 3, 4], 
                stride = 1, 
                dropout = 0.5, 
                num_classes = 2, # change this to 3, later
                pad_idx = train_Ds.word2idx[PAD])

    # Put model on the device (cuda or cpu)
    cnn_model = cnn_model.to(device)
    
    print('The model has {:,d} trainable parameters'.format(count_parameters(cnn_model)))


The model has 1,840,834 trainable parameters


In [31]:
if __name__=='__main__':    
    LEARNING_RATE = 5e-4 # Feel free to try other learning rates

    # Define the loss function
    criterion = nn.CrossEntropyLoss().to(device)

    # Define the optimizer
    optimizer = optim.Adam(cnn_model.parameters(), lr=LEARNING_RATE)

In [32]:
if __name__=='__main__':    
    N_EPOCHS = 7 #20 
    
    # train model for N_EPOCHS epochs
    train_model(cnn_model, N_EPOCHS, train_loader, optimizer, criterion)


Training Model...


  0%|          | 0/7 [00:00<?, ?it/s]

[TRAIN]	 Epoch:  1	 Loss: 0.0012	 Train Accuracy: 99.98%
[TRAIN]	 Epoch:  2	 Loss: 0.0000	 Train Accuracy: 100.00%
[TRAIN]	 Epoch:  3	 Loss: 0.0000	 Train Accuracy: 100.00%
[TRAIN]	 Epoch:  4	 Loss: 0.0000	 Train Accuracy: 100.00%
[TRAIN]	 Epoch:  5	 Loss: 0.0000	 Train Accuracy: 100.00%
[TRAIN]	 Epoch:  6	 Loss: 0.0000	 Train Accuracy: 100.00%
[TRAIN]	 Epoch:  7	 Loss: 0.0000	 Train Accuracy: 100.00%
Model Trained!



### Evaluate CNN Model

In [33]:
import random

def evaluate(model, data_loader, criterion):
    print('Evaluating performance on the test dataset...')
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    all_predictions = []
    print("\nSOME PREDICTIONS FROM THE MODEL:")
    for texts, labels in tqdm(data_loader):
        texts = texts.to(device)
        labels = labels.to(device)
        
        output = model(texts)
        acc = accuracy(output, labels)
        pred = output.argmax(dim=1)
        all_predictions.append(pred)
        
        loss = criterion(output, labels)
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

        if random.random() < 0.0015:
            print("Input: "+' '.join([data_loader.dataset.idx2word[idx] for idx in texts[0].tolist() if idx not in {data_loader.dataset.word2idx[PAD], data_loader.dataset.word2idx[END]}]))
            print("Prediction:", pred.item(), '\tCorrect Output:', labels.item(), '\n')

    full_acc = 100*epoch_acc/len(data_loader)
    full_loss = epoch_loss/len(data_loader)
    print('[TEST]\t Loss: {:.4f}\t Accuracy: {:.2f}%'.format(full_loss, full_acc))
    predictions = torch.cat(all_predictions)
    return predictions, full_acc, full_loss

if __name__=='__main__':
    evaluate(cnn_model, test_loader, criterion) # Compute test data accuracy


Evaluating performance on the test dataset...

SOME PREDICTIONS FROM THE MODEL:


  0%|          | 0/20042 [00:00<?, ?it/s]

Input: i have a mortgage through wells fargo after several months i saw an offer from their website saying i could obtain significant savings on interest and cut years off my mortgage by changing from monthly to biweekly payments i elected to do so and only months later did i learn that the bank merely holds the first payment and does not apply it until the xxxx payment arrives i was led to believe that payments had to be applied when received i sent them an email saying i believed their practice is deceptive at best and predatory at worst the
Prediction: 0 	Correct Output: 0 

Input: i did manage to read between the lines today xxxx xxxx 2015 on the experian website and it would appear that by asking for an initial 90 day alert within 30 days of expiration of the initial fraud alert then the corporation gives an additional 90 day extension and alerts the other xxxx so i did that and experian assigned the number xxxx to my request for initial 90 day alert which is actually my request f

### Export Model

In [34]:
if __name__=='__main__':
    from google.colab import drive
    drive.mount('/content/drive')
    print()

    try:
        cnn_model is None
        cnn_exists = True
    except:
        cnn_exists = False

   
    if cnn_exists:
        print("Saving CNN model....") 
        torch.save(cnn_model, "drive/My Drive/cnn.pt")
    
    print("Done!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Saving CNN model....
Done!
