In [1]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip

--2021-12-02 08:31:22--  https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 163567 (160K) [application/x-httpd-php]
Saving to: ‘YouTube-Spam-Collection-v1.zip’


2021-12-02 08:31:23 (412 KB/s) - ‘YouTube-Spam-Collection-v1.zip’ saved [163567/163567]



In [2]:
import os

import time
import datetime

import torch
import torch.optim as O
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchtext.utils import extract_archive
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import spacy
import re
from sklearn.model_selection import train_test_split
import logging
from argparse import ArgumentParser

from pdb import set_trace

import pandas as pd

In [3]:
def get_device(gpu_no):
  """Pick the torch device to run on.

  Args:
    gpu_no: index of the CUDA device to use when CUDA is available.

  Returns:
    torch.device('cuda', gpu_no) if CUDA is available, otherwise
    torch.device('cpu').
  """
  # Guard clause: fall back to the CPU first, so the CUDA path reads linearly.
  if not torch.cuda.is_available():
    return torch.device('cpu')
  return torch.device('cuda', gpu_no)

In [4]:
# Resolve the compute device once (CUDA device 0 when available, else CPU);
# later cells read this module-level `device`.
device = get_device(0)
print(device)

cuda:0


In [5]:
# Load the English pipeline by its package name: the 'en' shortcut link only
# exists in spaCy v2 and was removed in v3, while 'en_core_web_sm' resolves in both.
spacy_en = spacy.load('en_core_web_sm')

In [6]:
#https://pytorch.org/tutorials/beginner/torchtext_translation_tutorial.html

data_file = "YouTube-Spam-Collection-v1.zip"

# Unpack the archive into ./data and keep only the real CSV members.
# Entries whose path matches '__MACOS' are filtered out with a comprehension:
# removing items from a list while iterating over it skips the element that
# follows each removal, so consecutive matching entries would survive.
paths = [
  path for path in extract_archive(data_file, "data")
  if not re.search('__MACOS', path)
]
paths

['data/Youtube01-Psy.csv',
 'data/Youtube02-KatyPerry.csv',
 'data/Youtube03-LMFAO.csv',
 'data/Youtube04-Eminem.csv',
 'data/Youtube05-Shakira.csv']

# Dataset 

In [7]:
class Dataset():
  """Loads the CSV files, splits them, and exposes torchtext batch iterators.

  Construction runs the whole pipeline: read and concatenate the CSVs, split
  them 80/20 into train/validation frames, round-trip both through TSV files
  into torchtext datasets, build the vocabularies (with GloVe twitter 100d
  vectors for CONTENT), and create the batch iterators.

  Attributes set by __init__:
    df, train_df, val_df: full / training / validation pandas frames.
    CONTENT, CLASS: torchtext Fields for the text and the label column.
    train_ds, valid_ds: torchtext TabularDataset splits.
    train_iter, val_iter: BucketIterator objects over those splits.
  """

  def __init__(self, paths, batch_size=32, device=None):
    """
    Args:
      paths: iterable of CSV file paths; it is read in sorted order and is
        NOT modified (a sorted copy is used).
      batch_size: number of examples per batch (default 32, the previous
        hard-coded value).
      device: device the iterators place batches on; defaults to
        get_device(0), the previous hard-coded choice.
    """
    self.batch_size = batch_size
    self.device = get_device(0) if device is None else device

    # sorted() instead of paths.sort(): do not reorder the caller's list.
    self.df, self.train_df, self.val_df = self.create_df(sorted(paths))

    self.CONTENT = Field(sequential=True, tokenize=self.tokenizer, lower=True)
    # NOTE(review): CLASS keeps Field's default vocabulary handling, so batches
    # presumably carry vocabulary indices rather than the raw 0/1 labels -
    # confirm; normalize_gold() is what maps them back to class ids.
    self.CLASS = Field(dtype=torch.float, batch_first=True)

    self.load_dataset()

    # Create embeddings
    self.create_embeddings()

    self.divide_batch()

  def tokenizer(self, sent):
    """Split `sent` into token strings with the module-level spaCy tokenizer."""
    return [token.text for token in spacy_en.tokenizer(sent)]

  def create_embeddings(self):
    """Build both vocabularies from the training split only.

    CONTENT additionally loads the 'glove.twitter.27B.100d' vectors.
    """
    self.CONTENT.build_vocab(self.train_ds, vectors='glove.twitter.27B.100d')
    self.CLASS.build_vocab(self.train_ds)

  def create_df(self, paths):
    """Read every CSV, concatenate them and split into train/validation.

    Args:
      paths: sequence of CSV file paths (each with a header row).

    Returns:
      (df, train_df, val_df): the concatenated frame without its first three
      columns, and its shuffled 80/20 split (random_state=42).
    """
    frames = [pd.read_csv(path, header=0) for path in paths]
    df = pd.concat(frames)
    # Drop the first three columns; only the remaining ones are kept
    # (load_dataset maps them to the CONTENT and CLASS fields, in that order).
    df = df.drop(columns=list(df.columns[:3]))
    train_df, val_df = train_test_split(df, shuffle=True, test_size=0.2, random_state=42)
    return df, train_df, val_df

  def load_dataset(self):
    """Write the two splits to TSV files and load them as torchtext datasets.

    Side effect: creates 'train_df.tsv' and 'val_df.tsv' in the current
    working directory.
    """
    self.train_df.to_csv('train_df.tsv', sep='\t', index=False)
    self.val_df.to_csv('val_df.tsv', sep='\t', index=False)
    fields = [
      ('CONTENT', self.CONTENT),
      ('CLASS', self.CLASS)
    ]
    self.train_ds, self.valid_ds = TabularDataset.splits(
      path = '',
      train = 'train_df.tsv',
      validation = 'val_df.tsv',
      format = 'tsv',
      fields = fields,
      skip_header = True
    )

  def divide_batch(self):
    """Create the train/validation iterators with the configured batch size and device."""
    self.train_iter, self.val_iter = BucketIterator.splits(
      (self.train_ds, self.valid_ds),
      sort = False,
      batch_size = self.batch_size,
      device = self.device
    )

# Model Architecture

![model_architecture.jpg](https://drive.google.com/uc?id=1pG1nHTkWGueGSg6YMvNDjl_RphUqHPpm)

In [8]:
# https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html
class RNN(nn.Module):
  """One step of a simple recurrent cell built from two linear layers.

  Both layers read the concatenation of the current input and the previous
  hidden state; `i2h` produces the next hidden state and `i2o` the step output.
  """

  def __init__(self, input_size, hidden_size, output_size):
    """
    Args:
      input_size: width of the per-step input vector.
      hidden_size: width of the hidden state.
      output_size: width of the per-step output vector.
    """
    super().__init__()
    self.hidden_size = hidden_size
    joint_size = input_size + hidden_size
    self.i2h = nn.Linear(joint_size, hidden_size)
    self.i2o = nn.Linear(joint_size, output_size)

  def forward(self, input, hidden):
    """Run one step.

    Args:
      input: tensor whose last dimension is `input_size`.
      hidden: tensor whose last dimension is `hidden_size`; its leading
        dimensions must match those of `input` so the two can be concatenated.

    Returns:
      (output, hidden): the step output and the next hidden state. No
      activation is applied to either of them.
    """
    joint = torch.cat([input, hidden], dim=-1)
    return self.i2o(joint), self.i2h(joint)
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
class CNN(nn.Module):
  """1-D convolutional classifier over a single-channel feature vector.

  Two (Conv1d(kernel 5) -> ReLU -> MaxPool1d(2)) stages are followed by three
  linear layers and a softmax over the class dimension.

  The first linear layer is sized from `input_size` instead of a hard-coded
  132, which was only correct for inputs of length 100 (100 still yields 132).
  """

  def __init__(self,input_size,output_size):
    """
    Args:
      input_size: length of the input signal (last dimension of the tensor
        passed to forward()).
      output_size: number of classes.

    Raises:
      ValueError: if `input_size` is too short to survive both conv/pool stages.
    """
    super(CNN,self).__init__()
    self.conv1 = nn.Conv1d(1, 6, 5)
    self.pool = nn.MaxPool1d(2)
    self.conv2 = nn.Conv1d(6, 6, 5)

    # Length left after two conv+pool stages, times conv2's 6 output channels.
    flat_features = 6 * self._stage_length(self._stage_length(input_size))
    if flat_features <= 0:
      raise ValueError(
        'input_size={} is too small for two conv(5)+pool(2) stages'.format(input_size))

    self.fc1 = nn.Linear(flat_features, 64)
    self.fc2 = nn.Linear(64, 16)
    self.fc3 = nn.Linear(16, output_size)
    self.softmax = nn.Softmax(dim=1)

  @staticmethod
  def _stage_length(length, kernel_size=5, pool_size=2):
    """Output length of an unpadded, stride-1 conv followed by floor-mode max pooling."""
    return (length - kernel_size + 1) // pool_size

  def forward(self, x):
    """Classify a batch.

    Args:
      x: tensor of shape (batch, 1, input_size).

    Returns:
      Tensor of shape (batch, output_size) whose rows are softmax
      probabilities.
    """
    x = self.pool(F.relu(self.conv1(x)))
    x = self.pool(F.relu(self.conv2(x)))
    x = torch.flatten(x, 1) # flatten all dimensions except batch
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = self.fc3(x)
    # NOTE(review): the output is already normalized; nn.CrossEntropyLoss
    # applies log-softmax itself, so pairing the two double-normalizes.
    # Kept as-is to preserve this module's output contract.
    x = self.softmax(x)
    return x

In [9]:
# Build the corpus: data frames, vocabularies, pretrained vectors and iterators.
dataset = Dataset(paths)

# Hyper-parameters and output location read by Train.
params = dict(
    target_size=2,              # number of output classes
    input_size=100,             # used as the embedding / RNN / CNN input width
    epochs=40,
    lr=1e-4,
    results_dir='results_dir',  # root folder for checkpoints and logs
)

# Vectors of the CONTENT vocabulary, moved to the compute device.
embeddings = dict(
    CONTENT_emb=dataset.CONTENT.vocab.vectors.to(device),
)


.vector_cache/glove.twitter.27B.zip: 1.52GB [06:58, 3.63MB/s]                            
100%|█████████▉| 1193513/1193514 [01:03<00:00, 18699.94it/s]


In [10]:
#Utilities
def labels_gold(gold):
  """Find the raw label values present in a batch of gold labels.

  Args:
    gold: tensor of raw label values (any shape).

  Returns:
    dict where key 0 is always the value 2 and key 1 is the first value in
    `gold` that differs from 2. Key 1 is absent when every entry equals 2
    (or `gold` is empty); the previous implementation raised IndexError in
    that case and also never looked at index 0.
  """
  # NOTE(review): 2 is presumably the first label index after the vocabulary's
  # special tokens - confirm against how the CLASS vocabulary is built.
  labels = {0: 2}
  flat = gold.reshape(-1)
  others = flat[flat != labels[0]]
  if others.numel() > 0:
    labels[1] = others[0]
  return labels

# https://stackoverflow.com/questions/45384684/replace-all-nonzero-values-by-zero-and-all-zero-values-by-a-specific-value
def normalize_gold(gold):
  """Map raw gold label values to class ids 0 and 1.

  Entries equal to 2 become 0; entries equal to the other value found by
  labels_gold() become 1. `gold` itself is not modified.

  Args:
    gold: tensor of raw label values, e.g. of shape (batch, 1).

  Returns:
    Long tensor with singleton dimensions squeezed out. A batch of one
    example is returned with shape (1,) rather than as a 0-d tensor, so it
    remains a valid target batch.
  """
  labels = labels_gold(gold)
  res = gold.clone()
  # Both masks are computed on the untouched `gold`, so the two assignments
  # cannot interfere with each other.
  if 1 in labels:
    res[gold == labels[1]] = 1.
  res[gold == labels[0]] = 0.
  res = res.long().squeeze()
  if res.dim() == 0:
    res = res.reshape(1)
  return res

def makedirs(name):
  """Create directory `name`, including missing parents.

  Does nothing if the directory already exists. Still raises OSError when the
  path cannot be created, e.g. because it exists but is not a directory.

  Args:
    name: path of the directory to create.
  """
  # exist_ok=True is the standard-library form of "ignore EEXIST if it is a directory".
  os.makedirs(name, exist_ok=True)
def get_logger(params, phase):
  """Return a logger named `phase` that writes to the run's log file.

  The log file is <results_dir>/rnn+cnn/dataset/<phase>.log. The directory is
  created if needed, and the file handler is attached to the named logger
  itself instead of going through logging.basicConfig(), which silently does
  nothing when the root logger already has handlers. Calling this again for
  the same file does not add a second handler.

  Args:
    params: dict providing 'results_dir'.
    phase: logger name, also used as the log file's base name.

  Returns:
    logging.Logger at INFO level with one FileHandler for the log file.
  """
  log_dir = os.path.join(params['results_dir'], 'rnn+cnn', 'dataset')
  os.makedirs(log_dir, exist_ok=True)
  log_path = os.path.abspath(os.path.join(log_dir, '{}.log'.format(phase)))

  logger = logging.getLogger(phase)
  logger.setLevel(logging.INFO)

  already_attached = any(
    isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path
    for handler in logger.handlers
  )
  if not already_attached:
    handler = logging.FileHandler(log_path)
    handler.setFormatter(logging.Formatter(
      fmt='%(asctime)s - %(message)s',
      datefmt='%d-%b-%y %H:%M:%S'))
    logger.addHandler(handler)
  return logger


In [11]:
#Training
class Train():
  """Trains and evaluates the RNN+CNN classifier and checkpoints the best model.

  Per batch, the RNN is stepped over the token embeddings, its per-step
  outputs are summed, and the sum is classified by the CNN. Both networks are
  optimized jointly with Adam; the learning rate is reduced when the training
  loss plateaus.
  """

  def __init__(self, params, embeddings, data=None):
    """
    Args:
      params: dict with keys 'input_size', 'target_size', 'epochs', 'lr' and
        'results_dir'.
      embeddings: dict whose 'CONTENT_emb' entry is a tensor indexed with the
        token-id batches.
      data: object exposing `train_iter` and `val_iter`. Defaults to the
        notebook-level `dataset`, which is what was always used before.
    """
    emb_size = params['input_size']
    target_size = params['target_size']
    self.epochs = params['epochs']
    self.results_dir = params['results_dir']
    self.embeddings = embeddings

    self.rnn = RNN(input_size=emb_size,hidden_size=emb_size,output_size=emb_size)
    self.cnn = CNN(emb_size,target_size)
    self.rnn.to(device)
    self.cnn.to(device)
    self.dataset = dataset if data is None else data
    self.opt = O.Adam((list(self.rnn.parameters())+list(self.cnn.parameters())), lr = params['lr'])
    self.scheduler = ReduceLROnPlateau(optimizer=self.opt, mode='min',factor=1e-1,patience=5)
    self.best_val_acc = None
    self.logger = get_logger(params,'train')

  def _forward(self, batch):
    """Run one batch through RNN and CNN.

    Returns:
      (output, gold): CNN output of shape (batch, classes) and the gold class
      ids as a 1-D long tensor.
    """
    # .to() returns the moved tensor rather than moving in place, so keep the result.
    tokens = batch.CONTENT.to(device)
    # reshape(-1) keeps a batch of a single example 1-D.
    gold = normalize_gold(batch.CLASS.to(device)).reshape(-1)

    # Indexing with (seq_len, batch) token ids gives (seq_len, batch, emb).
    batch_lines = self.embeddings['CONTENT_emb'][tokens]
    hidden = torch.zeros(batch_lines.shape[1:]).to(device)
    outputs = torch.zeros(batch_lines.shape[1:]).to(device)
    for batch_word in batch_lines:
      out, hidden = self.rnn(batch_word,hidden)
      outputs = outputs + out

    # Add the single input channel the CNN expects: (batch, 1, emb).
    outputs = outputs.reshape(outputs.shape[0], 1, -1)
    return self.cnn(outputs), gold

  def _run_epoch(self, iterator, training):
    """Iterate once over `iterator`; update the weights only if `training`.

    Returns:
      (loss, acc): summed loss divided by the number of examples, and
      accuracy in percent.
    """
    iterator.init_epoch()
    n_correct, n_total, n_loss = 0, 0, 0
    criterion = nn.CrossEntropyLoss(reduction = 'sum')
    for batch in iterator:
      if training:
        self.opt.zero_grad()
      output, gold = self._forward(batch)
      loss = criterion(output, gold)
      n_correct += (torch.max(output, 1)[1].view(gold.size()) == gold).sum().item()
      n_total += batch.batch_size
      n_loss += loss.item()
      if training:
        loss.backward(); self.opt.step()
    return n_loss/n_total, 100. * n_correct/n_total

  def train(self):
    """Train for one epoch; returns (train_loss, train_acc)."""
    # validate() leaves both networks in eval mode, so switch both back.
    self.rnn.train(); self.cnn.train()
    return self._run_epoch(self.dataset.train_iter, training=True)

  def validate(self):
    """Evaluate on the validation split without gradients; returns (val_loss, val_acc)."""
    self.rnn.eval(); self.cnn.eval()
    with torch.no_grad():
      return self._run_epoch(self.dataset.val_iter, training=False)

  def execute(self):
    """Run the full training loop, printing and logging one line per epoch."""
    print(" [*] Training starts!")
    print('-' * 99)
    for epoch in range(1, self.epochs+1):
      start = time.time()
      train_loss, train_acc = self.train()
      val_loss, val_acc = self.validate()
      # The plateau scheduler is driven by the training loss.
      self.scheduler.step(train_loss)
      took = time.time()-start

      self.result_checkpoint(epoch, train_loss, val_loss, train_acc, val_acc, took)
      print('| Epoch {:3d} | train loss {:5.2f} | train acc {:5.2f} | val loss {:5.2f} | val acc {:5.2f} | time: {:5.2f}s |'.format(
        epoch, train_loss, train_acc, val_loss, val_acc, took))
    self.finish()

  def result_checkpoint(self, epoch, train_loss, val_loss, train_acc, val_acc, took):
    """Save both networks when validation accuracy improves, and log the epoch.

    The RNN and CNN checkpoints are written together, so the two 'best' files
    always come from the same epoch. Previously the CNN file was rewritten on
    every epoch, independent of the accuracy check.
    """
    if self.best_val_acc is None or val_acc > self.best_val_acc:
      self.best_val_acc = val_acc
      makedirs('{}/{}/{}'.format(self.results_dir, 'model', 'dataset'))
      for tag, network in (('rnn', self.rnn), ('cnn', self.cnn)):
        torch.save({
          'accuracy': self.best_val_acc,
          'model_dict': network.state_dict(),
        }, '{}/{}/{}/best-{}-{}-params.pth'.format(self.results_dir, 'model', 'dataset', tag, 'dataset'))
    self.logger.info('| Epoch {:3d} | train loss {:5.2f} | train acc {:5.2f} | val loss {:5.2f} | val acc {:5.2f} | time: {:5.2f}s |'
        .format(epoch, train_loss, train_acc, val_loss, val_acc, took))

  def finish(self):
    """Log and print the end-of-training messages."""
    self.logger.info("[*] Training finished!\n\n")
    print('-' * 99)
    print(" [*] Training finished!")
    print(" [*] Please find the saved model and training log in results_dir")

In [12]:
# Build the trainer from the hyper-parameters and embedding table defined
# above, then run the full training loop (prints one line per epoch).
train = Train(params,embeddings)
train.execute()

 [*] Training starts!
---------------------------------------------------------------------------------------------------
| Epoch   1 | train loss  0.68 | train acc 58.44 | val loss  0.67 | val acc 57.14 | time:  2.68s |
| Epoch   2 | train loss  0.65 | train acc 63.75 | val loss  0.65 | val acc 57.65 | time:  2.23s |
| Epoch   3 | train loss  0.62 | train acc 71.42 | val loss  0.63 | val acc 61.99 | time:  2.30s |
| Epoch   4 | train loss  0.56 | train acc 81.52 | val loss  0.52 | val acc 86.22 | time:  2.35s |
| Epoch   5 | train loss  0.48 | train acc 85.55 | val loss  0.46 | val acc 87.24 | time:  2.34s |
| Epoch   6 | train loss  0.45 | train acc 86.57 | val loss  0.46 | val acc 85.71 | time:  2.33s |
| Epoch   7 | train loss  0.44 | train acc 86.76 | val loss  0.47 | val acc 84.95 | time:  2.33s |
| Epoch   8 | train loss  0.43 | train acc 87.92 | val loss  0.44 | val acc 86.99 | time:  2.37s |
| Epoch   9 | train loss  0.44 | train acc 87.15 | val loss  0.45 | val acc 86.22 | ti

In [13]:
#Zipping the model that gave best val accuracy. 
!zip -r results_dir.zip results_dir

  adding: results_dir/ (stored 0%)
  adding: results_dir/model/ (stored 0%)
  adding: results_dir/model/dataset/ (stored 0%)
  adding: results_dir/model/dataset/best-cnn-dataset-params.pth (deflated 11%)
  adding: results_dir/model/dataset/best-rnn-dataset-params.pth (deflated 8%)
