In [1]:
import os
import re

import pandas as pd
from numpy.random import RandomState

import torchtext
from torchtext.legacy.data import Field
from torchtext.legacy.data import TabularDataset
from torchtext.vocab import GloVe
from torchtext.legacy.data import Iterator, BucketIterator
import torchtext.datasets

import spacy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef
from sklearn.model_selection import train_test_split

# Load data

In [2]:
# Read the combined labeled dataset and slice out one frame per category.
df = pd.read_csv("../data/labeled/combined.csv")
by_category = df.groupby(df.category)  # built once, reused for every lookup
electronics = by_category.get_group("Electronics")
pet = by_category.get_group("Pet supplies")
baby = by_category.get_group("Baby")
sports = by_category.get_group("Sport outdoors")

In [3]:
#%% Prepare the dataset via torchtext
# Load the English spaCy model with the listed pipeline components disabled.
# Every component name must be its own list element: adjacent string literals
# without a comma are silently concatenated into a single (bogus) name.
spacy_en = spacy.load(
    'en_core_web_sm',
    disable=['tagger', 'parser', 'ner', 'textcat',
             'entity_ruler', 'sentencizer',
             'merge_noun_chunks', 'merge_entities',
             'merge_subtokens'])

def tokenizer(text):
    """Run ``text`` through the spaCy tokenizer and return the token strings as a list."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens
  
# set up fields
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character outside ``A-Za-z0-9(),!?'`` and the backtick becomes a space;
      2. the clitics ``'s 've n't 're 'd 'll`` are split off with a leading space;
      3. ``, ! ( ) ?`` are padded with a space on both sides;
      4. runs of whitespace are collapsed and the result is stripped.

    Unlike the referenced original, the padded ``( ) ?`` are emitted as plain
    characters: the old replacement strings (" \\( ", " \\) ", " \\? ") inserted
    a literal backslash in front of each of them.

    Parameters
    ----------
    string : str
        Raw text (or a single token) to clean.

    Returns
    -------
    str
        The cleaned, whitespace-normalised string (may be empty).
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Order matters only in that it mirrors the reference implementation.
    clitic_rules = (
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
    )
    for pattern, replacement in clitic_rules:
        string = re.sub(pattern, replacement, string)

    # Pad punctuation in one pass; the padded characters are distinct, so this
    # is equivalent to substituting them one after another.
    string = re.sub(r"([,!()?])", r" \1 ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

In [4]:
def generate_dataformat(dataframe, TEXT, LABEL):
    """Turn a DataFrame into a torchtext dataset plus a batch iterator.

    The frame is dumped to ``.tmp/file.csv`` and read back through
    ``TabularDataset`` with the fields ``('text', TEXT)`` and
    ``('label', LABEL)``.
    NOTE(review): this presumably binds the first two CSV columns to text and
    label, in that order -- confirm against the column layout of the input.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to convert.
    TEXT, LABEL : torchtext.legacy.data.Field
        Fields used to process the text and label columns.

    Returns
    -------
    (Iterator, TabularDataset)
        An iterator yielding batches of 64 examples and the underlying dataset.
    """
    # The scratch directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(".tmp", exist_ok=True)
    dataframe.to_csv(".tmp/file.csv", index=False)

    datafield = [('text', TEXT), ('label', LABEL)]
    dataset = TabularDataset(path='.tmp/file.csv',
                             format='csv',
                             skip_header=True,
                             fields=datafield)

    # Fall back to the CPU instead of failing when no GPU is present.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    iterator = Iterator(
        dataset,
        batch_size=64,
        device=device,
        sort_within_batch=False,
        repeat=False)

    return iterator, dataset
    

In [5]:
#%% Text CNN model
class textCNN(nn.Module):
    """Convolutional text classifier: embedding -> parallel convs -> max-over-time -> linear.

    Parameters
    ----------
    vocab_built : object supporting ``len()`` with an optional ``vectors`` tensor
        Vocabulary; ``len(vocab_built)`` sets the embedding table size and
        ``vocab_built.vectors`` (if present and not ``None``) initialises it.
    emb_dim : int
        Embedding dimension (must match the width of ``vocab_built.vectors``).
    dim_channel : int
        Number of output channels of every convolution.
    kernel_wins : sequence of int
        Window size (in tokens) of each convolution; one conv per entry.
    num_class : int
        Number of output classes.
    """

    def __init__(self, vocab_built, emb_dim, dim_channel, kernel_wins, num_class):
        super(textCNN, self).__init__()
        # Embedding layer, initialised from the pretrained vectors when available.
        self.embed = nn.Embedding(len(vocab_built), emb_dim)
        pretrained = getattr(vocab_built, "vectors", None)
        if pretrained is not None:
            self.embed.weight.data.copy_(pretrained)

        # Convolutional layers with different window sizes; each kernel spans
        # the full embedding width.
        self.convs = nn.ModuleList([nn.Conv2d(1, dim_channel, (w, emb_dim)) for w in kernel_wins])
        #Dropout layer
        #self.dropout = nn.Dropout(0.6)

        # Fully connected layer over the concatenated pooled features.
        self.fc = nn.Linear(len(kernel_wins)*dim_channel, num_class)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        ``x`` is a 2-D integer tensor indexed as (batch, sequence); the result
        has one row per batch element and ``num_class`` columns.
        """
        # (batch, seq) -> (batch, seq, emb) -> (batch, 1, seq, emb)
        emb_x = self.embed(x).unsqueeze(1)

        # A convolution cannot slide over a sequence shorter than its window,
        # so zero-pad the sequence axis up to the widest window when needed.
        widest = max((conv.kernel_size[0] for conv in self.convs), default=0)
        missing = widest - emb_x.size(2)
        if missing > 0:
            emb_x = F.pad(emb_x, (0, 0, 0, missing))

        # Each conv yields (batch, channel, positions, 1); take the max over
        # all positions (max-over-time pooling) -> (batch, channel).
        pooled = [conv(emb_x).squeeze(-1).max(dim=2)[0] for conv in self.convs]

        fc_x = torch.cat(pooled, dim=1)

        #fc_x = self.dropout(fc_x)
        logit = self.fc(fc_x)
        return logit
        

#%% Training the Model
def train(model, device, train_itr, optimizer, epoch, max_epoch):
    """Run one training epoch and return ``(mean_loss, accuracy_percent)``.

    Parameters
    ----------
    model : torch.nn.Module
        Classifier mapping a (batch, sequence) tensor to class logits.
    device : torch.device
        Device the batch tensors are moved to.
    train_itr : iterable
        Yields batches exposing ``.text`` and ``.label``; must also expose
        ``.dataset`` supporting ``len()``.
    optimizer : torch.optim.Optimizer
        Optimizer stepping the model parameters.
    epoch, max_epoch : int
        Accepted for interface compatibility; not used.

    Returns
    -------
    (float, float)
        Cross-entropy loss averaged per example, and accuracy in percent, both
        normalised by ``len(train_itr.dataset)``. Both are ``0.0`` for an empty
        dataset.
    """
    model.train()
    n_correct, loss_sum = 0, 0.0
    for batch in train_itr:
        # Swap the first two axes so the batch axis comes first for the model.
        text = torch.transpose(batch.text, 0, 1).to(device)
        # Shift labels down by one without mutating the batch in place.
        # NOTE(review): presumably index 0 of the label vocab is reserved, so
        # real classes start at 1 -- confirm against the LABEL field.
        target = (batch.label - 1).to(device)

        optimizer.zero_grad()
        logit = model(text)

        loss = F.cross_entropy(logit, target)
        loss.backward()
        optimizer.step()

        # cross_entropy returns the batch mean; weight it by the batch size so
        # that dividing by the dataset size yields a true per-example mean.
        loss_sum += loss.item() * target.size(0)
        n_correct += (logit.argmax(dim=1) == target).sum().item()

    size = len(train_itr.dataset)
    if size == 0:
        return 0.0, 0.0
    train_loss = loss_sum / size
    accuracy = 100.0 * n_correct / size

    return train_loss, accuracy
    
def valid(model, device, test_itr):
    """Predict every batch of ``test_itr`` and return ``(pred, targets)``.

    Parameters
    ----------
    model : torch.nn.Module
        Classifier mapping a (batch, sequence) tensor to class logits.
    device : torch.device
        Device the batch tensors are moved to.
    test_itr : iterable
        Yields batches exposing ``.text`` and ``.label``.

    Returns
    -------
    (list[int], list[int])
        Predicted class indices and the matching targets (labels shifted down
        by one), in iteration order.
    """
    model.eval()
    pred, targets = [], []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for batch in test_itr:
            # Swap the first two axes so the batch axis comes first for the model.
            text = torch.transpose(batch.text, 0, 1).to(device)
            # Shift labels down by one without mutating the batch in place.
            target = (batch.label - 1).to(device)

            result = model(text).argmax(dim=1)

            targets.extend(target.tolist())
            pred.extend(result.tolist())

    return pred, targets

In [6]:
def evaluation(dataframe_train, dataframe_train_test, verbose=False, n_epochs=10, lr=0.001):
    """Train a fresh ``textCNN`` on one frame and score it on another.

    Parameters
    ----------
    dataframe_train : pandas.DataFrame
        Training data.
    dataframe_train_test : pandas.DataFrame
        Held-out data the model is scored on after every epoch.
    verbose : bool, default False
        Print the model and per-epoch train/validation figures.
    n_epochs : int, default 10
        Number of training epochs.
    lr : float, default 0.001
        Adam learning rate.

    Returns
    -------
    (f1, acc, mcc)
        Macro F1, accuracy and Matthews correlation on the held-out frame as
        measured after the final epoch (all ``0`` if ``n_epochs`` is 0).
    """
    # Fields for text and label; clean_str is applied as the text preprocessing step.
    TEXT = Field(sequential=True, tokenize=tokenizer, lower=True)
    LABEL = Field(sequential=False)
    TEXT.preprocessing = torchtext.legacy.data.Pipeline(clean_str)

    train_iter, train_dataset = generate_dataformat(dataframe_train, TEXT, LABEL)
    test_iter, _ = generate_dataformat(dataframe_train_test, TEXT, LABEL)

    # Vocabularies are built from the training split only.
    TEXT.build_vocab(train_dataset, vectors='glove.6B.300d')
    LABEL.build_vocab(train_dataset)

    vocab = TEXT.vocab

    # Use the GPU if it is available; decide this before placing the model so
    # that the model and the batches end up on the same device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = textCNN(vocab, 300, 100, [3, 3, 3, 4, 4, 4, 5, 5, 5], 2).to(device)
    if verbose:
        print(model)

    f1 = 0
    acc = 0
    mcc = 0
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(1, n_epochs + 1):
        tr_loss, tr_acc = train(model, device, train_iter, optimizer, epoch, n_epochs)
        if verbose:
            print('Train Epoch: {} \t Loss: {} \t Accuracy: {}%'.format(epoch, tr_loss, tr_acc))

        # Metrics are overwritten every epoch; only the last epoch's are returned.
        pred, targets = valid(model, device, test_iter)
        f1 = f1_score(targets, pred, average="macro")
        acc = accuracy_score(targets, pred)
        mcc = matthews_corrcoef(targets, pred)

        if verbose:
            print('Valid Epoch: {} \t f1: {}% \t acc: {}'.format(epoch, f1, acc))
    return f1, acc, mcc

In [7]:
from sklearn.model_selection import KFold

In [8]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 2)
data = []

# (result label, frame) pairs, listed in the order the categories are evaluated.
category_frames = [
    ("baby", baby),
    ("pet", pet),
    ("sports", sports),
    ("electronics", electronics),
]

# 5-fold cross-validation per category; one result row per fold.
for category_name, data_df in category_frames:
    for train_index, test_index in kf.split(data_df):
        train_df = data_df.iloc[train_index]
        test_df = data_df.iloc[test_index]
        f1, acc, mcc = evaluation(train_df, test_df)
        data.append([category_name, f1, acc, mcc])

df_result = pd.DataFrame(data, columns = ['category', 'f1-score', 'accuracy', 'matthews-corr'])

In [9]:
# Average the fold metrics within each category.
df_result.groupby("category").mean()

Unnamed: 0_level_0,f1-score,accuracy,matthews-corr
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baby,0.785389,0.792,0.575294
electronics,0.753665,0.77139,0.517761
pet,0.79355,0.8265,0.591958
sports,0.749745,0.766875,0.501422


In [10]:
# Write the per-fold metrics to disk (without the index column).
results_path = '../results/cnn.csv'
df_result.to_csv(results_path, index=False)