## Experiment 2 ##
### predicting class of new articles with second model ###

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Import Gensim for corpus and model
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Import NLTK for stopwords
import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reading the BBC news dataset
data_folder="C:/users/funny/bbc"

folders=["business","entertainment","politics","sport","tech"]
x=[]
y=[]


for i in folders:
    files=os.listdir(data_folder+'/'+i)
    for text_file in files:
        file_path=data_folder + '/'+i+'/'+text_file
        with open(file_path,'rb') as f:
            data=f.read()
        x.append(data)
        y.append(i)
        
data={'text':x,'type':y}
df = pd.DataFrame(data)

In [3]:
# Use gensim's simple preprocess to tokenize words from sentences
def tokenize(sentences, deacc=True):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence)))

In [4]:
# Get stopwords from NLTK's model of english stopwords
stop_words = stopwords.words('english')

# Def function for removing stopwords in each tokenized text body
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [5]:
# Separate data and tokenize body texts
data = df['text'].values.tolist()
data_words = list(tokenize(data))

# Filter stopwords from tokenized texts
data_no_stopwords = remove_stopwords(data_words)

# Build dictionary from POS tagged data
dictionary = corpora.Dictionary(data_no_stopwords)

# Build corpus using lemmatized texts
texts = data_no_stopwords
corpus = [dictionary.doc2bow(text) for text in texts]

# Gather labels as a list for passing through train-test split
labels = df['type'].values.tolist()

In [6]:
# Get numeric values for labels
num_lab = labels[:]
num_lab = [0 if x == "business" else x for x in num_lab]
num_lab = [1 if x == "entertainment" else x for x in num_lab]
num_lab = [2 if x == "politics" else x for x in num_lab]
num_lab = [3 if x == "sport" else x for x in num_lab]
num_lab = [4 if x == "tech" else x for x in num_lab]

In [7]:
# Function for making eta hyperparameter
# returns an topics*terms matrix 
def get_eta(topics, terms, prior):
    matrix = np.zeros((5,len(terms)), dtype=int)
    matrix = matrix.tolist()
    
    for n in range(len(prior)):
        for m in range(len(prior[n])):
            matrix[topics[n]][prior[n][m][0]] += prior[n][m][1]
    
    return matrix

In [8]:
eta = get_eta(num_lab, dictionary, corpus)

In [9]:
# Run train test split
X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2, random_state=42)

In [10]:
# Train model using train-test split
LDA_model = gensim.models.ldamodel.LdaModel(corpus=X_train,
                                            id2word=dictionary,
                                            num_topics=5,
                                            random_state=42,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha="auto",
                                            eta=eta)

In [11]:
# Function for getting highest predicted cluster of text
def predict_topic(array):
    pred_topic = array[0][0]
    best_similarity = array[0][1]
    
    for n in range(len(array)):
        if array[n][1] > best_similarity:
            pred_topic = array[n][0]
            best_similarity = array[n][1]
    return pred_topic

In [12]:
# Reading newly scraped articles from the BBC
data_folder="C:/users/funny/bbc_scraped"

folders=["business","entertainment","politics","sport","tech"]
x=[]
y=[]


for i in folders:
    files=os.listdir(data_folder+'/'+i)
    for text_file in files:
        file_path=data_folder + '/'+i+'/'+text_file
        with open(file_path,'rb') as f:
            data=f.read()
        x.append(data)
        y.append(i)
        
data={'text':x,'type':y}
df = pd.DataFrame(data)

In [13]:
df

Unnamed: 0,text,type
0,b'EU signs US gas deal to curb reliance on Rus...,business
1,b'US jobless claims at lowest level since 1969...,business
2,b'Calls for P&O Ferries boss Peter Hebblethwai...,business
3,"b""Watch: Beyonc\xc3\xa9's Oscar performance in...",entertainment
4,b'Charity boss on her thoughts for Jada Pinket...,entertainment
5,b'Colin Paterson: I\'m amazed Smith came to th...,entertainment
6,"b'Ukraine: No Russia regime change plans, says...",politics
7,"b'Ukraine not alone in fight against Russia, s...",politics
8,b'South Sudan forces withdraw from VP Machar\'...,politics
9,b'Joe Root wants to stay as England captain de...,sport


In [14]:
# Separate data and tokenize body texts
data = df['text'].values.tolist()
data_words = list(tokenize(data))

# Filter stopwords from tokenized texts
data_no_stopwords = remove_stopwords(data_words)

# Build dictionary from POS tagged data
#dictionary = corpora.Dictionary(data_no_stopwords)

# Build corpus using lemmatized texts
texts = data_no_stopwords
new_corpus = [dictionary.doc2bow(text) for text in texts]

# Gather labels as a list for passing through train-test split
labels = df['type'].values.tolist()

# Get numeric values for labels
num_lab = labels[:]
num_lab = [0 if x == "business" else x for x in num_lab]
num_lab = [1 if x == "entertainment" else x for x in num_lab]
num_lab = [2 if x == "politics" else x for x in num_lab]
num_lab = [3 if x == "sport" else x for x in num_lab]
num_lab = [4 if x == "tech" else x for x in num_lab]

In [15]:
len(new_corpus)

15

In [16]:
LDA_model.get_document_topics(new_corpus[14])

[(2, 0.412531), (3, 0.045850545), (4, 0.541382)]

In [17]:
# predict the classes of the new articles
check_class = []

for n in range(len(new_corpus)):
    a = predict_topic(LDA_model.get_document_topics(new_corpus[n]))
    check_class.append(a)

In [18]:
correct = 0
incorrect = 0

for n in range(len(check_class)):
    
    if check_class[n] == num_lab[n]:
        print("Article predicted correctly!")
        correct += 1
    elif check_class[n] != num_lab[n]:
        print("Article predicted incorrectly.")
        incorrect += 1
print("\nNumber of new articles predicted correctly: {}".format(correct))
print("\nNumber of new articles predicted incorrectly: {}".format(incorrect))

Article predicted correctly!
Article predicted correctly!
Article predicted incorrectly.
Article predicted correctly!
Article predicted correctly!
Article predicted correctly!
Article predicted correctly!
Article predicted correctly!
Article predicted correctly!
Article predicted correctly!
Article predicted correctly!
Article predicted correctly!
Article predicted incorrectly.
Article predicted correctly!
Article predicted correctly!

Number of new articles predicted correctly: 13

Number of new articles predicted incorrectly: 2


Even though the new articles are over a decade newer, the model was still able to predict with a decent success rate.