In [28]:
import sys
import os

from sklearn.naive_bayes import BernoulliNB


import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt

module_path = os.path.abspath(os.path.join('../../src'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Custom modules 
from modules import preprocessing as pp
from modules import graph


In [12]:
from bs4 import BeautifulSoup
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text

In [13]:
#import datat
df = pd.read_csv("../../src/data/df_one_plus_three.csv")

#inspect the first few rows
df.head(3)

Unnamed: 0,title,dataset,target
0,Bill Changing Credit Card Rules Is Sent to Oba...,1,0
1,"In Hollywood, the Easy-Money Generation Toughe...",1,0
2,1700 runners still unaccounted for in UK's Lak...,1,0


In [14]:
df['title'] = df['title'].apply(cleanText)

In [16]:
df.title = df.title.apply(pp.remove_stopwords)

In [20]:
df.title = df.title.apply(pp.remove_contractions)

In [21]:
train, test = train_test_split(df, test_size=0.3, random_state=42)
import nltk
from nltk.corpus import stopwords
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens


train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['title']), tags=[r.target]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['title']), tags=[r.target]), axis=1)

In [22]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [23]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 36076/36076 [00:00<00:00, 1047131.97it/s]


In [24]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 36076/36076 [00:00<00:00, 942600.11it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1860056.19it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1970051.05it/s]
100%|██████████| 36076/36076 [00:00<00:00, 2000895.38it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1792965.19it/s]
100%|██████████| 36076/36076 [00:00<00:00, 2072506.66it/s]
100%|██████████| 36076/36076 [00:00<00:00, 2127493.37it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1933647.41it/s]
100%|██████████| 36076/36076 [00:00<00:00, 2075633.90it/s]
100%|██████████| 36076/36076 [00:00<00:00, 2000551.47it/s]
100%|██████████| 36076/36076 [00:00<00:00, 2078199.58it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1902862.35it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1188153.39it/s]
100%|██████████| 36076/36076 [00:00<00:00, 2105908.13it/s]
100%|██████████| 36076/36076 [00:00<00:00, 2010252.44it/s]
100%|██████████| 36076/36076 [00:00<00:00, 2088554.86it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1902958.07it/s

CPU times: user 1min 7s, sys: 10.1 s, total: 1min 17s
Wall time: 1min 1s


In [26]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [29]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)
logreg = BernoulliNB(alpha=0.01)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.7122623205277454
Testing F1 score: 0.7103591986198382


In [30]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 36076/36076 [00:00<00:00, 1102628.51it/s]


In [31]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 36076/36076 [00:00<00:00, 1595866.85it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1108192.49it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1693626.93it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1783728.76it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1863148.12it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1541909.14it/s]
100%|██████████| 36076/36076 [00:00<00:00, 2076061.07it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1708794.03it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1742382.36it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1758521.15it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1940815.13it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1080534.37it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1271041.78it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1904562.87it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1749918.60it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1059493.70it/s]
100%|██████████| 36076/36076 [00:00<00:00, 1716333.88it/

CPU times: user 2min 3s, sys: 14 s, total: 2min 17s
Wall time: 2min 10s


In [32]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.42387789419221317
Testing F1 score: 0.31118262999365315


In [33]:
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [36]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [37]:
def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [38]:
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.4411460354417281
Testing F1 score: 0.33472492448619645


# Make a validation dataset:


This dataframe is the combination of both `dataset 1` and `dataset 3` outlined in the README and created in `./notebooks/EDA/datasetCreation`. Currently, with minimal preprocessing and a simple Naive Bayes Classifier I was able to achieve an accuracy of 0.88 and an f1 of 0.87. My goal here is to improve that score by implementing some data cleaning steps prior to tokenization. Then, once I am convinced that my data prep steps are working, I will move onto implementing word imbeddings.

In [None]:
# Remove non_ascii characters
df['title_cleaned'] = df.title.apply(pp.remove_non_ascii_chars)
# Lowercase the words
df.title_cleaned = df.title_cleaned.apply(pp.lower_case)
# Remove contractions
df.title_cleaned = df.title_cleaned.apply(pp.remove_contractions)
# Remove stopwords
df.title_cleaned = df.title_cleaned.apply(remove_stopwords)
# Remove spelling mistakes 
df.title_cleaned = df.title_cleaned.apply(pp.fix_spelling_mistakes)

# Remove all punctuation
df.title_cleaned = df.title_cleaned.apply(pp.remove_punctuation)

In [None]:
with open('spellcorrected.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)

In [None]:
title_cleaned = graph.get_vocab_length(df.title_cleaned)

In [None]:
title_dict= graph.get_vocab_length(df.title)

In [None]:
# Remove stopwords 
df.title_cleaned = df.title_cleaned.apply(pp.remove_stopwords)
df.title_cleaned

In [None]:
def remove_stopwords(title):
    return " ".join([word.lower() for word in tokenizer.tokenize(title) if word.lower() not in stop_words])
df.title_cleaned = df.title_cleaned.apply(remove_stopwords)

In [None]:
title_cleaned = graph.get_vocab_length(df.title_cleaned)

In [None]:
df['title_cleaned_lem'] = df.title_cleaned.apply(pp.lemmetise_series)


In [None]:
title_cleaned_lem = graph.get_vocab_length(df.title_cleaned_lem)

In [None]:
graph.show_wordcloud(title_cleaned_lem, title="Lemmetised Word Cloud")

In [None]:
graph.show_wordcloud(title_cleaned, title="Non Lemmetised Word Cloud")

It looks like Lemmetiser will reduce "US" down to u. Which for our purposes is absolutely fine. we reduced the number of words in our corpus from 42k down to 31k. I think that we are now in a position to train test split and run a simple model on the data

In [None]:

X = df.title_cleaned
y = df.target
X_train_lem, X_test_lem, y_train, y_test = train_test_split(X, y,
                                                            random_state=42, test_size=0.2,
                                                            stratify = df[['target', 'dataset']])

X_train_lem.shape, X_test_lem.shape, y_train.shape, y_test.shape

In [None]:
tfidf = TfidfVectorizer(stop_words=stop_words, ngram_range=(1,2))
X_train_lem_tfidf = tfidf.fit_transform(X_train_lem)
X_test_lem_tfidf = tfidf.transform(X_test_lem)

In [None]:
bayes_clf = BernoulliNB(alpha = 0.4)

bayes_clf.fit(X_train_lem_tfidf, y_train)
y_hat_lem_train = bayes_clf.predict(X_train_lem_tfidf)
y_hat_lem_test = bayes_clf.predict(X_test_lem_tfidf)

accuracy_score(y_train, y_hat_lem_train), accuracy_score(y_test, y_hat_lem_test)

# Ok, so spell checking everyword is probably not worth it. What if we do doc2vec?