## Evaluation of base model ##

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Import Gensim for corpus and model
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Import NLTK for stopwords
import nltk
from nltk.corpus import stopwords

In [2]:
# Reading the BBC news dataset
data_folder=".../bbc"

folders=["business","entertainment","politics","sport","tech"]
x=[]
y=[]


for i in folders:
    files=os.listdir(data_folder+'/'+i)
    for text_file in files:
        file_path=data_folder + '/'+i+'/'+text_file
        with open(file_path,'rb') as f:
            data=f.read()
        x.append(data)
        y.append(i)
        
data={'text':x,'type':y}
df = pd.DataFrame(data)

In [3]:
# Use gensim's simple preprocess to tokenize words from sentences
def tokenize(sentences, deacc=True):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence)))

In [4]:
# Get stopwords from NLTK's model of english stopwords
stop_words = stopwords.words('english')

# Def function for removing stopwords in each tokenized text body
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [5]:
# Separate data and tokenize body texts
data = df['text'].values.tolist()
data_words = list(tokenize(data))

# Filter stopwords from tokenized texts
data_no_stopwords = remove_stopwords(data_words)

# Build dictionary from POS tagged data
dictionary = corpora.Dictionary(data_no_stopwords)

# Build corpus using lemmatized texts
texts = data_no_stopwords
corpus = [dictionary.doc2bow(text) for text in texts]

# Gather labels as a list for passing through train-test split
labels = df['type'].values.tolist()

In [6]:
# Run train test split
X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2, random_state=42)

In [7]:
# Train model using train-test split
LDA_model = gensim.models.ldamodel.LdaModel(corpus=X_train,
                                            id2word=dictionary,
                                            num_topics=5,
                                            random_state=42,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha="auto")
                                            #per_word_topics=True)

In [8]:
LDA_model.print_topics()

[(0,
  '0.014*"said" + 0.010*"us" + 0.010*"bn" + 0.010*"year" + 0.009*"xc" + 0.009*"xa" + 0.006*"company" + 0.006*"market" + 0.005*"nthe" + 0.005*"growth"'),
 (1,
  '0.022*"said" + 0.019*"mr" + 0.013*"would" + 0.010*"government" + 0.009*"labour" + 0.007*"election" + 0.007*"party" + 0.006*"blair" + 0.005*"people" + 0.005*"minister"'),
 (2,
  '0.015*"said" + 0.009*"people" + 0.005*"would" + 0.005*"new" + 0.005*"mr" + 0.005*"also" + 0.005*"could" + 0.004*"one" + 0.004*"nthe" + 0.004*"technology"'),
 (3,
  '0.009*"said" + 0.006*"game" + 0.006*"england" + 0.006*"first" + 0.005*"year" + 0.005*"win" + 0.005*"time" + 0.005*"two" + 0.004*"last" + 0.004*"one"'),
 (4,
  '0.012*"film" + 0.010*"best" + 0.007*"said" + 0.007*"year" + 0.006*"one" + 0.006*"also" + 0.005*"show" + 0.005*"music" + 0.004*"us" + 0.004*"awards"')]

In [9]:
# Function for adding each possible unique pair of testing texts or labels
def nCk_pairs(array):
    paired = []
    
    for n in range(len(array)):
        for k in range(len(array[n:])):
            if k != 0:
                paired.append([array[n],array[n+k]])
    return paired

In [10]:
# Get 2d arrays for pairs of x_test and y_test
testing_labels = nCk_pairs(y_test)
testing_texts = nCk_pairs(X_test)

In [11]:
# Check that both are the same size array (445 choose 2)
print(len(testing_texts))
print(len(testing_labels))

98790
98790


In [12]:
# Function for getting highest predicted cluster of text
def predict_topic(array):
    pred_topic = array[0][0]
    best_similarity = array[0][1]
    
    for n in range(len(array)):
        if array[n][1] > best_similarity:
            pred_topic = array[n][0]
            best_similarity = array[n][1]
    return pred_topic

In [13]:
# Get paired list of predicted clusters, this cell takes a couple of minutes to get all predictions
testing_results = []

for text in range(len(testing_texts)):
    a = predict_topic(LDA_model.get_document_topics(testing_texts[text][0]))
    b = predict_topic(LDA_model.get_document_topics(testing_texts[text][1]))
    testing_results.append([a, b])

In [14]:
# Compare ground truth and prediction 
def get_results(texts, labels):
    # same GT, same pred
    GS_PS = 0
    # same GT, different pred
    GS_PD = 0
    # differnt GT, same pred
    GD_PS = 0
    # different GT, different pred
    GD_PD = 0
    
    for n in range(len(texts)):
        if texts[n][0] == texts[n][1] and labels[n][0] == labels[n][1]:
            GS_PS += 1
        elif texts[n][0] != texts[n][1] and labels[n][0] == labels[n][1]:
            GS_PD += 1
        elif texts[n][0] == texts[n][1] and labels[n][0] != labels[n][1]:
            GD_PS += 1
        elif texts[n][0] != texts[n][1] and labels[n][0] != labels[n][1]:
            GD_PD += 1
   
    precision = GS_PS/(GS_PS+GD_PS)
    recall = GS_PS/(GS_PS+GS_PD)
    F1 = (2*precision*recall)/(precision+recall)
    
    print("--- Results -----------------")
    print("GT same, PRED same: {}".format(GS_PS))
    print("GT same, PRED different: {}".format(GS_PD))
    print("GT different, PRED same: {}".format(GD_PS))
    print("GT different, PRED different: {}".format(GD_PD))
    print("\nPrecision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1 score: {}".format(F1))

In [15]:
get_results(testing_results, testing_labels)

--- Results -----------------
GT same, PRED same: 17418
GT same, PRED different: 2854
GT different, PRED same: 2675
GT different, PRED different: 75843

Precision: 0.8668690588762256
Recall: 0.859214680347277
F1 score: 0.8630248978075066


The prediction is substantially better than the prediction result in the assignment description. Additionally, the recall and overall F1 score are good as well.