# Time for some big brain

## Step 1: Tokenize data and build vocab

In [27]:
import re
from collections import Counter

import nltk
import pandas as pd

In [28]:
"""
    Load dataset from local directory
"""
def load_dataset(filename):
    """
    Read the CSV at `filename` and split it into headlines and labels.

    Returns a pair: the values of the 'headline' column and the values of
    the 'clickbait' column, in that order.
    """
    frame = pd.read_csv(filename)
    texts = frame['headline'].values
    labels = frame['clickbait'].values
    return texts, labels

In [29]:
"""
    Clean data by removing all special characters. Convert words to lowercase
"""
def clean_data(headlines):
    """
    Lowercase every headline and replace special characters with spaces.

    Apostrophes are deleted outright so contractions stay one word
    ("Here's" -> "heres"). Typographic apostrophes (U+2018 / U+2019) are
    treated the same as the ASCII one, so "Don’t" and "Don't" both become
    "dont" instead of the former splitting into "don t". Every other run of
    characters outside a-z / 0-9 collapses into a single space.

    Args:
        headlines: iterable of headline strings.

    Returns:
        A list of cleaned strings in input order. Spaces left at the start
        or end by punctuation are not stripped.
    """
    # Compile once per call instead of re-resolving the pattern per headline.
    apostrophes = re.compile(r"['\u2018\u2019]")
    # Text is already lowercased when this runs, so A-Z need not be listed.
    non_alnum = re.compile(r'[^a-z0-9]+')

    return [
        non_alnum.sub(' ', apostrophes.sub('', headline).lower())
        for headline in headlines
    ]

In [30]:
"""
    Tokenize all headlines
"""
def tokenization(headlines):
    """
    Split each headline into tokens with NLTK's `wordpunct_tokenize`.

    Returns a list holding one token list per headline, in input order.
    """
    split_headline = nltk.tokenize.wordpunct_tokenize
    return list(map(split_headline, headlines))

In [31]:
"""
    Build vocab and return 'vocabSize' most common words
"""
def build_vocab(headlines, vocabSize):
    """
    Map the `vocabSize` most frequent tokens to 1-based frequency ranks.

    Args:
        headlines: iterable of tokenized headlines (each an iterable of
            tokens).
        vocabSize: maximum number of tokens to keep.

    Returns:
        A dict mapping token -> rank, where the most frequent token gets 1,
        the next gets 2, and so on. Index 0 is never assigned. Tokens with
        equal counts keep the order in which they were first seen.
    """
    counts = Counter(word for headline in headlines for word in headline)

    # most_common() yields (token, count) pairs; only the token is needed.
    return {
        word: rank
        for rank, (word, _) in enumerate(counts.most_common(vocabSize), start=1)
    }

In [32]:
"""
    Convert each token into index based representation 
"""
def word_to_idx(headlines, vocab):
    """
    Replace every token with its vocabulary index.

    Args:
        headlines: iterable of tokenized headlines (each an iterable of
            tokens).
        vocab: dict mapping token -> index.

    Returns:
        A list with one list of indices per headline, in input order.
        Tokens missing from `vocab` are dropped, so an output list can be
        shorter than its headline (or empty).
    """
    return [
        [vocab[word] for word in sentence if word in vocab]
        for sentence in headlines
    ]

In [34]:
# Load data: `headlines` holds the 'headline' column, `clickbait` the
# 'clickbait' column of the CSV.
headlines, clickbait = load_dataset("clickbait_data.csv")

# Clean data and build vocab.
# NOTE: `headlines` is rebound at each step -- raw strings, then cleaned
# lowercase strings, then lists of tokens.
headlines = clean_data(headlines)
headlines = tokenization(headlines)
# Keep only the 100 most common tokens; indices start at 1.
vocab = build_vocab(headlines,100)
# Index-encode each headline; tokens outside `vocab` are dropped.
headlines_tokenized = word_to_idx(headlines, vocab)
#print(headlines_tokenized)
print(vocab)



{'to': 1, 'in': 2, 'the': 3, 'of': 4, 'you': 5, 'a': 6, 'for': 7, 'and': 8, 'on': 9, 'your': 10, 'is': 11, 'are': 12, 'that': 13, 'this': 14, 'with': 15, 'at': 16, 'will': 17, 'from': 18, 'new': 19, 'about': 20, 'what': 21, 'who': 22, 'people': 23, 'things': 24, 'how': 25, 'which': 26, 'us': 27, 'as': 28, 'can': 29, 'by': 30, 'make': 31, 'we': 32, 'know': 33, 'be': 34, 'after': 35, '17': 36, 'do': 37, '21': 38, 's': 39, 'u': 40, 'should': 41, 'have': 42, 'these': 43, 'based': 44, 'all': 45, '19': 46, 'actually': 47, 'up': 48, 'it': 49, 'over': 50, 'their': 51, 'times': 52, 'an': 53, 'out': 54, 'first': 55, 'was': 56, 'its': 57, 'like': 58, 'if': 59, 'one': 60, '2015': 61, 'most': 62, 'or': 63, 'best': 64, 'more': 65, 'life': 66, 'when': 67, 'need': 68, 'heres': 69, 'has': 70, 'time': 71, 'world': 72, 'his': 73, 'just': 74, 'i': 75, '15': 76, 'dead': 77, '23': 78, '18': 79, 'year': 80, 'day': 81, 'her': 82, 'get': 83, 'killed': 84, 'dies': 85, 'ever': 86, 'every': 87, 'two': 88, 'presid