## Experiment 1 ##
### Evaluate using eta ###

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Import Gensim for corpus and model
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Import NLTK for stopwords
import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reading the BBC news dataset
data_folder=".../bbc"

folders=["business","entertainment","politics","sport","tech"]
x=[]
y=[]


for i in folders:
    files=os.listdir(data_folder+'/'+i)
    for text_file in files:
        file_path=data_folder + '/'+i+'/'+text_file
        with open(file_path,'rb') as f:
            data=f.read()
        x.append(data)
        y.append(i)
        
data={'text':x,'type':y}
df = pd.DataFrame(data)

In [3]:
# Use gensim's simple preprocess to tokenize words from sentences
def tokenize(sentences, deacc=True):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence)))

In [4]:
# Get stopwords from NLTK's model of english stopwords
stop_words = stopwords.words('english')

# Def function for removing stopwords in each tokenized text body
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [5]:
# Separate data and tokenize body texts
data = df['text'].values.tolist()
data_words = list(tokenize(data))

# Filter stopwords from tokenized texts
data_no_stopwords = remove_stopwords(data_words)

# Build dictionary from POS tagged data
dictionary = corpora.Dictionary(data_no_stopwords)

# Build corpus using lemmatized texts
texts = data_no_stopwords
corpus = [dictionary.doc2bow(text) for text in texts]

# Gather labels as a list for passing through train-test split
labels = df['type'].values.tolist()

In [6]:
# Get numeric values for labels
num_lab = labels[:]
num_lab = [0 if x == "business" else x for x in num_lab]
num_lab = [1 if x == "entertainment" else x for x in num_lab]
num_lab = [2 if x == "politics" else x for x in num_lab]
num_lab = [3 if x == "sport" else x for x in num_lab]
num_lab = [4 if x == "tech" else x for x in num_lab]

In [7]:
# Function for making eta hyperparameter
# returns an topics*terms matrix 
def get_eta(topics, terms, prior):
    matrix = np.zeros((5,len(terms)), dtype=int)
    matrix = matrix.tolist()
    
    for n in range(len(prior)):
        for m in range(len(prior[n])):
            matrix[topics[n]][prior[n][m][0]] += prior[n][m][1]
    
    return matrix

In [8]:
eta = get_eta(num_lab, dictionary, corpus)

In [9]:
# Run train test split
X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2, random_state=42)

In [10]:
# Train model using train-test split
LDA_model = gensim.models.ldamodel.LdaModel(corpus=X_train,
                                            id2word=dictionary,
                                            num_topics=5,
                                            random_state=42,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha="auto",
                                            eta=eta)

In [11]:
LDA_model.print_topics()

[(0,
  '0.016*"said" + 0.008*"us" + 0.008*"bn" + 0.007*"year" + 0.005*"nthe" + 0.005*"mr" + 0.005*"xc" + 0.005*"xa" + 0.005*"would" + 0.005*"company"'),
 (1,
  '0.011*"film" + 0.011*"said" + 0.009*"best" + 0.006*"year" + 0.006*"music" + 0.005*"also" + 0.005*"one" + 0.005*"us" + 0.005*"show" + 0.005*"nthe"'),
 (2,
  '0.021*"said" + 0.014*"mr" + 0.010*"would" + 0.007*"government" + 0.007*"labour" + 0.006*"people" + 0.006*"election" + 0.006*"party" + 0.005*"blair" + 0.004*"also"'),
 (3,
  '0.010*"said" + 0.005*"first" + 0.005*"game" + 0.005*"year" + 0.005*"england" + 0.005*"win" + 0.004*"time" + 0.004*"two" + 0.004*"one" + 0.004*"last"'),
 (4,
  '0.013*"said" + 0.008*"people" + 0.005*"also" + 0.004*"one" + 0.004*"technology" + 0.004*"new" + 0.004*"mr" + 0.004*"mobile" + 0.004*"would" + 0.004*"nthe"')]

In [12]:
# Function for adding each possible unique pair of testing texts or labels
def nCk_pairs(array):
    paired = []
    
    for n in range(len(array)):
        for k in range(len(array[n:])):
            if k != 0:
                paired.append([array[n],array[n+k]])
    return paired

In [13]:
# Get 2d arrays for pairs of x_test and y_test
testing_labels = nCk_pairs(y_test)
testing_texts = nCk_pairs(X_test)

In [14]:
# Check that both are the same size array (445 choose 2)
print(len(testing_texts))
print(len(testing_labels))

98790
98790


In [15]:
# Function for getting highest predicted cluster of text
def predict_topic(array):
    pred_topic = array[0][0]
    best_similarity = array[0][1]
    
    for n in range(len(array)):
        if array[n][1] > best_similarity:
            pred_topic = array[n][0]
            best_similarity = array[n][1]
    return pred_topic

In [16]:
# Get paired list of predicted clusters, this cell takes a couple of minutes to get all predictions
testing_results = []

for text in range(len(testing_texts)):
    a = predict_topic(LDA_model.get_document_topics(testing_texts[text][0]))
    b = predict_topic(LDA_model.get_document_topics(testing_texts[text][1]))
    testing_results.append([a, b])

In [17]:
# Compare ground truth and prediction 
def get_results(texts, labels):
    # same GT, same pred
    GS_PS = 0
    # same GT, different pred
    GS_PD = 0
    # differnt GT, same pred
    GD_PS = 0
    # different GT, different pred
    GD_PD = 0
    
    for n in range(len(texts)):
        if texts[n][0] == texts[n][1] and labels[n][0] == labels[n][1]:
            GS_PS += 1
        elif texts[n][0] != texts[n][1] and labels[n][0] == labels[n][1]:
            GS_PD += 1
        elif texts[n][0] == texts[n][1] and labels[n][0] != labels[n][1]:
            GD_PS += 1
        elif texts[n][0] != texts[n][1] and labels[n][0] != labels[n][1]:
            GD_PD += 1
   
    precision = GS_PS/(GS_PS+GD_PS)
    recall = GS_PS/(GS_PS+GS_PD)
    F1 = (2*precision*recall)/(precision+recall)
    
    print("--- Results -----------------")
    print("GT same, PRED same: {}".format(GS_PS))
    print("GT same, PRED different: {}".format(GS_PD))
    print("GT different, PRED same: {}".format(GD_PS))
    print("GT different, PRED different: {}".format(GD_PD))
    print("\nPrecision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1 score: {}".format(F1))

In [18]:
get_results(testing_results, testing_labels)

--- Results -----------------
GT same, PRED same: 19720
GT same, PRED different: 552
GT different, PRED same: 493
GT different, PRED different: 78025

Precision: 0.975609756097561
Recall: 0.9727703235990529
F1 score: 0.9741879708534025


The prediction is a bit better than the prediction result in the experiment 1 description.