# Project Description

### TODO
#### We are using a CNN, max pooling, and n-grams (a sequence of n words in a sentence) to construct this project

## Important Documentation

### PyTorch Resources:
#####    - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
#####    - https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
#####    - https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
#####    - https://pytorch.org/docs/stable/generated/torch.nn.MaxPool1d.html
#####    - https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
#####    - https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058#:~:text=An%20N%2Dgram%20means%20a,3%2Dgram%20(trigram).

### Step 1: Tokenize data and build vocab

In [13]:
import re
import nltk
import torch
import numpy as np
import pandas as pd

In [10]:
class Preprocessing:
    """Turn the clickbait headline CSV into fixed-length index sequences.

    The methods are designed to be called in this order: ``load_dataset``,
    ``clean_data``, ``tokenization``, ``build_vocab``, ``word_to_idx``,
    ``padding_sentences`` and ``split_data``. Each step stores its result
    on the instance for the next one to read.
    """

    # Index used to fill short sentences; vocabulary indices start at 1.
    PAD_IDX = 0
    # Share of the samples assigned to the training split.
    TRAIN_FRACTION = .8
    # Runs of characters that are neither ASCII letters nor digits.
    _NON_ALNUM = re.compile(r'[^A-Za-z0-9]+')

    def __init__(self, vocab_size, seq_len):
        """Store the configuration and reset every intermediate result.

        Args:
            vocab_size: number of most frequent words kept in the vocabulary.
            seq_len: length every indexed headline is padded/truncated to.
        """
        self.file_name = 'clickbait_data.csv'
        self.vocab_size = vocab_size
        self.seq_len = seq_len
        self.vocab = None
        self.x_tokenized = None
        self.x_padded = None
        self.x = None
        self.y = None

        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None

    def load_dataset(self):
        """Read ``self.file_name`` and keep the headline and label columns."""
        df = pd.read_csv(self.file_name)

        self.x = df['headline'].values
        self.y = df['clickbait'].values

    def clean_data(self):
        """Lowercase every headline and strip special characters.

        Apostrophes are removed outright (so "don't" becomes "dont"); every
        other run of non-alphanumeric characters collapses to one space.
        """
        self.x = [
            self._NON_ALNUM.sub(' ', headline.replace("'", '').lower())
            for headline in self.x
        ]

    def tokenization(self):
        """Split every cleaned headline into a list of word tokens."""
        self.x = [nltk.tokenize.wordpunct_tokenize(headline) for headline in self.x]

    def build_vocab(self):
        """Map the ``vocab_size`` most common words to indices 1..vocab_size.

        Index 0 is left free so it can be used for padding.
        """
        fdist = nltk.FreqDist(word for headline in self.x for word in headline)

        self.vocab = {
            word: idx
            for idx, (word, _) in enumerate(fdist.most_common(self.vocab_size), start=1)
        }

    def word_to_idx(self):
        """Replace every token by its vocabulary index.

        Tokens that are not in the vocabulary are dropped, so the resulting
        sequences can be shorter than the tokenized headlines.
        """
        vocab = self.vocab
        self.x_tokenized = [
            [vocab[word] for word in sentence if word in vocab]
            for sentence in self.x
        ]

    def padding_sentences(self):
        """Build ``x_padded``: one row of exactly ``seq_len`` indices per headline.

        Short sentences are right-padded with ``PAD_IDX``; sentences longer
        than ``seq_len`` are truncated so that the rows form a rectangular
        array. New lists are built, leaving ``x_tokenized`` untouched.
        """
        seq_len = self.seq_len
        self.x_padded = np.array([
            # A negative repeat count yields an empty list, so long rows
            # are only truncated and never padded.
            sentence[:seq_len] + [self.PAD_IDX] * (seq_len - len(sentence))
            for sentence in self.x_tokenized
        ])

    def split_data(self):
        """Split features and labels into training and testing sets.

        The first ``TRAIN_FRACTION`` of the samples becomes the training
        set and the remainder the test set. The padded index matrix is used
        as the feature set; if ``padding_sentences`` has not been run yet
        the tokenized headlines in ``self.x`` are split instead.

        NOTE(review): the split is positional, without shuffling. If the CSV
        is ordered by label the two splits will have very different class
        balance -- confirm against the data file.
        """
        features = self.x if self.x_padded is None else self.x_padded
        split_at = int(len(features) * self.TRAIN_FRACTION)

        self.x_train = features[:split_at]
        self.y_train = self.y[:split_at]
        self.x_test = features[split_at:]
        self.y_test = self.y[split_at:]


### Step 2: Build the Model

In [None]:
class HeadlineClassifier(torch.nn.Module):
    """CNN layers for headline classification.

    The constructor builds an embedding table, four parallel
    ``Conv1d``/``MaxPool1d`` pairs (kernel sizes 2, 3, 4 and 5), a dropout
    layer and a single-output linear layer.

    NOTE(review): no ``forward`` method is defined in this class, so the
    model cannot be called yet. ``in_features_fc`` assumes the forward pass
    will feed the embedding output, shaped ``(batch, seq_len,
    embedding_size)``, to every convolution, then pool each result and
    concatenate the four branches -- confirm once ``forward`` is written.
    """

    def __init__(self, params):
        """Create all layers from the hyper-parameters in ``params``.

        Args:
            params: object exposing ``seq_len``, ``num_words``,
                ``embedding_size``, ``out_size`` and ``stride`` attributes.

        Raises:
            ValueError: if ``embedding_size`` is too small for one of the
                kernels (see ``in_features_fc``).
        """
        super().__init__()

        self.seq_len = params.seq_len
        self.num_words = params.num_words
        self.embedding_size = params.embedding_size

        # Dropout (used to reduce chance of overfitting by "dropping" units in neural net). Probability (p) set to 0.25
        self.dropout = torch.nn.Dropout(0.25)

        # Kernel sizes for CNN
        self.k1 = 2
        self.k2 = 3
        self.k3 = 4
        self.k4 = 5

        # Output size for convolutions
        self.out_size = params.out_size
        # Number of strides for convolutions
        self.stride = params.stride

        # Embedding layer (lookup table that stores embeddings of a fixed dictionary and size).
        # One extra row because index 0 is reserved for padding.
        self.embedding = torch.nn.Embedding(self.num_words + 1, self.embedding_size, padding_idx=0)

        # Convolution layers (each is a 1D convolution over an input).
        # seq_len is the channel dimension, so the kernels slide along the embedding axis.
        self.c1 = torch.nn.Conv1d(self.seq_len, self.out_size, self.k1, self.stride)
        self.c2 = torch.nn.Conv1d(self.seq_len, self.out_size, self.k2, self.stride)
        self.c3 = torch.nn.Conv1d(self.seq_len, self.out_size, self.k3, self.stride)
        self.c4 = torch.nn.Conv1d(self.seq_len, self.out_size, self.k4, self.stride)

        # Max pooling layers (each applies 1D max pooling to input)
        self.pool_1 = torch.nn.MaxPool1d(self.k1, self.stride)
        self.pool_2 = torch.nn.MaxPool1d(self.k2, self.stride)
        self.pool_3 = torch.nn.MaxPool1d(self.k3, self.stride)
        self.pool_4 = torch.nn.MaxPool1d(self.k4, self.stride)

        # Fully connected layer (applies linear transformation to data)
        self.fc = torch.nn.Linear(self.in_features_fc(), 1)

    @staticmethod
    def _out_length(length, kernel, stride):
        """Return the output length of a Conv1d/MaxPool1d without padding or dilation."""
        if length < kernel:
            raise ValueError(
                f"input length {length} is smaller than kernel size {kernel}"
            )
        # floor((L - (kernel - 1) - 1) / stride + 1) from the PyTorch docs.
        return (length - kernel) // stride + 1

    def in_features_fc(self):
        """Return the number of input features of the fully connected layer.

        Each branch applies a convolution and then a max pooling with the
        same kernel size and stride along the embedding axis; the pooled
        lengths of the four branches are summed and multiplied by the
        number of convolution output channels.

        Raises:
            ValueError: if the embedding axis becomes shorter than a kernel
                before the convolution or before the pooling.
        """
        pooled_total = 0
        for kernel in (self.k1, self.k2, self.k3, self.k4):
            conv_len = self._out_length(self.embedding_size, kernel, self.stride)
            pooled_total += self._out_length(conv_len, kernel, self.stride)
        return pooled_total * self.out_size

In [16]:
# Preprocessing pipeline keeping the 20 most common words and
# padding every headline to 30 indices
dataset = Preprocessing(vocab_size=20, seq_len=30)

# Run the stages in order: load -> clean -> tokenize -> build vocab ->
# index words -> pad -> split into training and testing sets
for step in (
    dataset.load_dataset,
    dataset.clean_data,
    dataset.tokenization,
    dataset.build_vocab,
    dataset.word_to_idx,
    dataset.padding_sentences,
    dataset.split_data,
):
    step()

# Summary of the processed data
print("Dataset size: ", len(dataset.x))
print("Dataset vocab: ", dataset.vocab)
print("Dataset training size: ", len(dataset.y_train))
print("Dataset testing size: ", len(dataset.x_test))


Dataset size:  32000
Dataset vocab:  {'to': 1, 'in': 2, 'the': 3, 'of': 4, 'you': 5, 'a': 6, 'for': 7, 'and': 8, 'on': 9, 'your': 10, 'is': 11, 'are': 12, 'that': 13, 'this': 14, 'with': 15, 'at': 16, 'will': 17, 'from': 18, 'new': 19, 'about': 20}
Dataset training size:  25600
Dataset testing size:  6400
