## Data Preprocessing

In [82]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import regexp_tokenize
import nltk
nltk.download('wordnet')
nltk.download('punkt')

from nltk import pos_tag_sents
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [83]:
import os
import nltk
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
%matplotlib inline
stop_words = stopwords.words('english')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [84]:
#reading data from csv file
df = pd.read_csv('BBC-articles.csv')
df.head(3)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...


In [85]:
# words preprocessing 
data = df.text.values.tolist()

# Get words from sentences
def getWordsFromSentence(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(getWordsFromSentence(data))


# Functions for small words, stopwords and lemmatization
def remove_small_words(texts):
    return [[w for w in simple_preprocess(str(doc)) if len(w) > 2] for doc in texts]

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatize(texts):
    lemmatizer = WordNetLemmatizer()
    return [[lemmatizer.lemmatize(w) for w in simple_preprocess(str(doc))] for doc in texts]


# Do lemmatization
tokens = lemmatize(remove_stopwords(data_words))

# Remove one letter and two letter words 
tokens = remove_small_words(tokens)

my_dict = Dictionary(tokens)

In [86]:
# Count Vectorization
dtm = [my_dict.doc2bow(doc) for doc in tokens]
    
# TF-IDF Vectorization
tfidf = TfidfModel(dtm)
tfidf = tfidf[dtm]

In [87]:
from gensim.models import LsiModel, LdaModel
# LSI with TF-IDF Vector
lsi_tfidf = LsiModel(corpus=tfidf, id2word=my_dict, num_topics=5)
# LDA with TF-IDF Vector
lda_tfidf = LdaModel(corpus=tfidf, id2word=my_dict, num_topics=5)

In [88]:
# Get dominant topic and corresponding keywords for each article

def getKeywordsFromDominantTopic(model, corpus, texts): 
    # Init output
    topickeyword_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                topickeyword_df = topickeyword_df.append(pd.Series([topic_keywords]), ignore_index=True)
            else:
                break
    return(topickeyword_df)

# Assign the keywords for each vectorization and model combination
df['LSI TF-IDF Keywords'] = getKeywordsFromDominantTopic(model=lsi_tfidf, corpus=tfidf, texts=df.text)
df['LDA TF-IDF Keywords'] = getKeywordsFromDominantTopic(model=lda_tfidf, corpus=tfidf, texts=df.text)
df.head(3)


Unnamed: 0,category,text,LSI TF-IDF Keywords,LDA TF-IDF Keywords
0,tech,tv future in the hands of viewers with home th...,"mobile, phone, film, award, england, best, tec...","blair, bank, election, party, sale, dollar, la..."
1,business,worldcom boss left books alone former worldc...,"film, award, best, oscar, england, actor, game...","blair, bank, election, party, sale, dollar, la..."
2,sport,tigers wary of farrell gamble leicester say ...,"film, award, best, oscar, england, actor, game...","mobile, phone, search, game, award, film, play..."


In [89]:
# Get 5 most common keywords across the LSI group of keywords
from collections import Counter 
for i in df.index:
    words = df.loc[i, 'LSI TF-IDF Keywords']
    words = words.split(',')
    most_occur = Counter(words).most_common(5) 
    df.loc[i, 'Top 5 Freq Words(LSI(I))'] = ','.join([word[0] for word in most_occur])

for i in df.index:
    words = df.loc[i, 'LDA TF-IDF Keywords']
    words = words.split(',')
    most_occur = Counter(words).most_common(5) 
    df.loc[i, 'Top 5 Freq Words(LDA(I))'] = ','.join([word[0] for word in most_occur]) 
    
df = df.drop(columns=['LSI TF-IDF Keywords','LDA TF-IDF Keywords'])
df[['text', 'Top 5 Freq Words(LSI(I))', 'Top 5 Freq Words(LDA(I))']].head(3)

Unnamed: 0,text,Top 5 Freq Words(LSI(I)),Top 5 Freq Words(LDA(I))
0,tv future in the hands of viewers with home th...,"mobile, phone, film, award, england","blair, bank, election, party, sale"
1,worldcom boss left books alone former worldc...,"film, award, best, oscar, england","blair, bank, election, party, sale"
2,tigers wary of farrell gamble leicester say ...,"film, award, best, oscar, england","mobile, phone, search, game, award"


#### 2 ) With term frequency filter, to exclude the top 10% of the most frequent words and words that appear less than 5 times in the documents (drawing from Zipf's Law)

In [90]:
my_dict.filter_extremes(no_below=5, no_above=0.90)
len(my_dict.token2id)

7896

In [91]:
# Count Vectorization
dtm = [my_dict.doc2bow(doc) for doc in tokens]
    
# TF-IDF Vectorization
tfidf = TfidfModel(dtm)
tfidf = tfidf[dtm]

In [92]:
from gensim.models import LsiModel, LdaModel
# LSI with TF-IDF Vector
lsi_tfidf = LsiModel(corpus=tfidf, id2word=my_dict, num_topics=5)
# LDA with TF-IDF Vector
lda_tfidf = LdaModel(corpus=tfidf, id2word=my_dict, num_topics=5)

In [93]:
df['LSI TF-IDF Keywords'] = getKeywordsFromDominantTopic(model=lsi_tfidf, corpus=tfidf, texts=df.text)
df['LDA TF-IDF Keywords'] = getKeywordsFromDominantTopic(model=lda_tfidf, corpus=tfidf, texts=df.text)

df.head(3)

Unnamed: 0,category,text,Top 5 Freq Words(LSI(I)),Top 5 Freq Words(LDA(I)),LSI TF-IDF Keywords,LDA TF-IDF Keywords
0,tech,tv future in the hands of viewers with home th...,"mobile, phone, film, award, england","blair, bank, election, party, sale","mobile, phone, economy, growth, rate, bank, fi...","mobile, phone, search, sale, blair, election, ..."
1,business,worldcom boss left books alone former worldc...,"film, award, best, oscar, england","blair, bank, election, party, sale","labour, election, blair, brown, tory, party, t...","mobile, phone, search, sale, blair, election, ..."
2,sport,tigers wary of farrell gamble leicester say ...,"film, award, best, oscar, england","mobile, phone, search, game, award","mobile, phone, film, award, best, technology, ...","film, award, holmes, game, nomination, england..."


In [94]:
# Get 5 most common keywords across the LSI group of keywords
from collections import Counter 
for i in df.index:
    words = df.loc[i, 'LSI TF-IDF Keywords']
    words = words.split(',')
    most_occur = Counter(words).most_common(5) 
    df.loc[i, 'Top 5 Freq Words(LSI(II))'] = ','.join([word[0] for word in most_occur])

for i in df.index:
    words = df.loc[i, 'LDA TF-IDF Keywords']
    words = words.split(',')
    most_occur = Counter(words).most_common(5) 
    df.loc[i, 'Top 5 Freq Words(LDA(II))'] = ','.join([word[0] for word in most_occur]) 
    
df = df.drop(columns=['LSI TF-IDF Keywords','LDA TF-IDF Keywords'])
pd.set_option('display.max_columns', None)
df[['text', 'Top 5 Freq Words(LSI(II))', 'Top 5 Freq Words(LDA(II))']].head(3)

Unnamed: 0,text,Top 5 Freq Words(LSI(II)),Top 5 Freq Words(LDA(II))
0,tv future in the hands of viewers with home th...,"mobile, phone, economy, growth, rate","mobile, phone, search, sale, blair"
1,worldcom boss left books alone former worldc...,"labour, election, blair, brown, tory","mobile, phone, search, sale, blair"
2,tigers wary of farrell gamble leicester say ...,"mobile, phone, film, award, best","film, award, holmes, game, nomination"


## 3) With a part of speech filter, to limit your TD-IDF matrix to nouns only. 

In [33]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [95]:
# Matrix of nouns
text = ''.join(str(e) for e in tokens)
text_pos=word_tokenize(text)
pos_tag=nltk.pos_tag(text_pos)
nouns=list(filter(lambda x: x[1] == 'NN', pos_tag))
nounlist = list(nouns)

def remove_small_words(texts):
    return [[w for w in simple_preprocess(str(doc)) if len(w) > 2] for doc in texts]

tokens = remove_small_words(nounlist)
my_dict = Dictionary(tokens)

# Count Vectorization
dtm = [my_dict.doc2bow(doc) for doc in tokens]

In [96]:
# TF-IDF Vectorization
tfidf = TfidfModel(dtm)
tfidf = tfidf[dtm]

In [97]:
from gensim.models import LsiModel, LdaModel
# LSI with TF-IDF Vector
lsi_tfidf = LsiModel(corpus=tfidf, id2word=my_dict, num_topics=5)
# LDA with TF-IDF Vector
lda_tfidf = LdaModel(corpus=tfidf, id2word=my_dict, num_topics=5)

In [98]:
# Get dominant topic and corresponding keywords for each article

def getKeywordsFromDominantTopic(model, corpus, texts): 
    # Init output
    topickeyword_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                topickeyword_df = topickeyword_df.append(pd.Series([topic_keywords]), ignore_index=True)
            else:
                break
    return(topickeyword_df)

# Assign the keywords for each vectorization and model combination
df['LSI TF-IDF Keywords'] = getKeywordsFromDominantTopic(model=lsi_tfidf, corpus=tfidf, texts=df.text)
df['LDA TF-IDF Keywords'] = getKeywordsFromDominantTopic(model=lda_tfidf, corpus=tfidf, texts=df.text)
df.head(3)

Unnamed: 0,category,text,Top 5 Freq Words(LSI(I)),Top 5 Freq Words(LDA(I)),Top 5 Freq Words(LSI(II)),Top 5 Freq Words(LDA(II)),LSI TF-IDF Keywords,LDA TF-IDF Keywords
0,tech,tv future in the hands of viewers with home th...,"mobile, phone, film, award, england","blair, bank, election, party, sale","mobile, phone, economy, growth, rate","mobile, phone, search, sale, blair","future, option, opinion, innovation, investiga...","future, picture, try, version, chance, commiss..."
1,business,worldcom boss left books alone former worldc...,"film, award, best, oscar, england","blair, bank, election, party, sale","labour, election, blair, brown, tory","mobile, phone, search, sale, blair","country, option, construction, ambition, admin...","country, image, decision, medium, pressure, de..."
2,sport,tigers wary of farrell gamble leicester say ...,"film, award, best, oscar, england","mobile, phone, search, game, award","mobile, phone, film, award, best","film, award, holmes, game, nomination","country, option, construction, ambition, admin...","country, image, decision, medium, pressure, de..."


In [99]:
# Get 5 most common keywords across the LSI group of keywords
from collections import Counter 
for i in df.index:
    words = df.loc[i, 'LSI TF-IDF Keywords']
    words = words.split(',')
    most_occur = Counter(words).most_common(5) 
    df.loc[i, 'Top 5 Freq Words(LSI(III))'] = ','.join([word[0] for word in most_occur])

for i in df.index:
    words = df.loc[i, 'LDA TF-IDF Keywords']
    words = words.split(',')
    most_occur = Counter(words).most_common(5) 
    df.loc[i, 'Top 5 Freq Words(LDA(III))'] = ','.join([word[0] for word in most_occur]) 
    
df[['text', 'Top 5 Freq Words(LSI(III))', 'Top 5 Freq Words(LDA(III))']].head(3)

Unnamed: 0,text,Top 5 Freq Words(LSI(III)),Top 5 Freq Words(LDA(III))
0,tv future in the hands of viewers with home th...,"future, option, opinion, innovation, investiga...","future, picture, try, version, chance"
1,worldcom boss left books alone former worldc...,"country, option, construction, ambition, admin...","country, image, decision, medium, pressure"
2,tigers wary of farrell gamble leicester say ...,"country, option, construction, ambition, admin...","country, image, decision, medium, pressure"


In [100]:
df = df.drop(columns=['LSI TF-IDF Keywords','LDA TF-IDF Keywords'])
df.head()

Unnamed: 0,category,text,Top 5 Freq Words(LSI(I)),Top 5 Freq Words(LDA(I)),Top 5 Freq Words(LSI(II)),Top 5 Freq Words(LDA(II)),Top 5 Freq Words(LSI(III)),Top 5 Freq Words(LDA(III))
0,tech,tv future in the hands of viewers with home th...,"mobile, phone, film, award, england","blair, bank, election, party, sale","mobile, phone, economy, growth, rate","mobile, phone, search, sale, blair","future, option, opinion, innovation, investiga...","future, picture, try, version, chance"
1,business,worldcom boss left books alone former worldc...,"film, award, best, oscar, england","blair, bank, election, party, sale","labour, election, blair, brown, tory","mobile, phone, search, sale, blair","country, option, construction, ambition, admin...","country, image, decision, medium, pressure"
2,sport,tigers wary of farrell gamble leicester say ...,"film, award, best, oscar, england","mobile, phone, search, game, award","mobile, phone, film, award, best","film, award, holmes, game, nomination","country, option, construction, ambition, admin...","country, image, decision, medium, pressure"
3,sport,yeading face newcastle in fa cup premiership s...,"film, award, best, oscar, england","blair, bank, election, party, sale","mobile, phone, film, award, best","ferguson, united, arsenal, blunkett, campbell","country, option, construction, ambition, admin...","country, image, decision, medium, pressure"
4,entertainment,ocean s twelve raids box office ocean s twelve...,"mobile, economy, growth, rate, phone","blair, bank, election, party, sale","film, award, oscar, england, best","film, award, holmes, game, nomination","figure, region, venture, alliance, function","future, picture, try, version, chance"


In [101]:
# Combine all the keywords
df['merged-keywords'] = df['Top 5 Freq Words(LDA(I))'] + ', ' + df['Top 5 Freq Words(LDA(II))'] + ', ' + df['Top 5 Freq Words(LDA(III))'] + ', ' + df['Top 5 Freq Words(LSI(I))'] + ', ' + df['Top 5 Freq Words(LSI(II))']+ ', ' + df['Top 5 Freq Words(LSI(III))']

# Get 5 most common keywords across all the groups of keywords
from collections import Counter 
for i in df.index:
    words = df.loc[i, 'merged-keywords']
    words = words.split(',')
    most_occur = Counter(words).most_common(5) 
    df.loc[i, 'Top 5 Freq Words'] = ','.join([word[0] for word in most_occur])

df[['text', 'Top 5 Freq Words']].head(3)

Unnamed: 0,text,Top 5 Freq Words
0,tv future in the hands of viewers with home th...,"mobile, phone, sale, future,blair"
1,worldcom boss left books alone former worldc...,"election, sale, blair, country,blair"
2,tigers wary of farrell gamble leicester say ...,"award, film, phone, game, country"


In [102]:
# Finding best method of all for given article

for i in range(len(df)):
    method=''
    y=0
    y=len(set(df['Top 5 Freq Words'][i])-set(df['Top 5 Freq Words(LSI(I))'][i]))
    method='LSI(I) Algorithm'
    if y>len(set(df['Top 5 Freq Words(LDA(I))'][i])-set(df['Top 5 Freq Words'][i])):
        method='LDA(I) Algorithm'
        y=len(set(df['Top 5 Freq Words(LDA(I))'][i])-set(df['Top 5 Freq Words'][i]))
    elif y>len(set(df['Top 5 Freq Words(LSI(II))'][i])-set(df['Top 5 Freq Words'][i])):
        method='LSI(II) Algorithm'
        y=len(set(df['Top 5 Freq Words(LSI(II))'][i])-set(df['Top 5 Freq Words'][i]))
    elif y>len(set(df['Top 5 Freq Words(LDA(II))'][i])-set(df['Top 5 Freq Words'][i])):
        method='LDA(II) Algorithm'
        y=len(set(df['Top 5 Freq Words(LDA(II))'][i])-set(df['Top 5 Freq Words'][i]))
    elif y>len(set(df['Top 5 Freq Words(LSI(III))'][i])-set(df['Top 5 Freq Words'][i])):
        method='LSI(III) Algorithm'
        y=len(set(df['Top 5 Freq Words(LSI(III))'][i])-set(df['Top 5 Freq Words'][i]))
    elif y>len(set(df['Top 5 Freq Words(LDA(III))'][i])-set(df['Top 5 Freq Words'][i])):
        method='LDA(III) Algorithm'
    df.at[i,'Best_Method']='Best model for this article is '+method


In [103]:
df = df.drop(columns=['merged-keywords','Top 5 Freq Words'])
df.head()

Unnamed: 0,category,text,Top 5 Freq Words(LSI(I)),Top 5 Freq Words(LDA(I)),Top 5 Freq Words(LSI(II)),Top 5 Freq Words(LDA(II)),Top 5 Freq Words(LSI(III)),Top 5 Freq Words(LDA(III)),Best_Method
0,tech,tv future in the hands of viewers with home th...,"mobile, phone, film, award, england","blair, bank, election, party, sale","mobile, phone, economy, growth, rate","mobile, phone, search, sale, blair","future, option, opinion, innovation, investiga...","future, picture, try, version, chance",Best model for this article is LDA(II) Algorithm
1,business,worldcom boss left books alone former worldc...,"film, award, best, oscar, england","blair, bank, election, party, sale","labour, election, blair, brown, tory","mobile, phone, search, sale, blair","country, option, construction, ambition, admin...","country, image, decision, medium, pressure",Best model for this article is LSI(II) Algorithm
2,sport,tigers wary of farrell gamble leicester say ...,"film, award, best, oscar, england","mobile, phone, search, game, award","mobile, phone, film, award, best","film, award, holmes, game, nomination","country, option, construction, ambition, admin...","country, image, decision, medium, pressure",Best model for this article is LDA(I) Algorithm
3,sport,yeading face newcastle in fa cup premiership s...,"film, award, best, oscar, england","blair, bank, election, party, sale","mobile, phone, film, award, best","ferguson, united, arsenal, blunkett, campbell","country, option, construction, ambition, admin...","country, image, decision, medium, pressure",Best model for this article is LSI(III) Algorithm
4,entertainment,ocean s twelve raids box office ocean s twelve...,"mobile, economy, growth, rate, phone","blair, bank, election, party, sale","film, award, oscar, england, best","film, award, holmes, game, nomination","figure, region, venture, alliance, function","future, picture, try, version, chance",Best model for this article is LSI(II) Algorithm


In [104]:
# Exporting the dataframe as a csv file
df.to_csv('BBC News Keywords.csv',index=False,encoding='utf-8')

In [105]:
df.groupby('Best_Method').count()

Unnamed: 0_level_0,category,text,Top 5 Freq Words(LSI(I)),Top 5 Freq Words(LDA(I)),Top 5 Freq Words(LSI(II)),Top 5 Freq Words(LDA(II)),Top 5 Freq Words(LSI(III)),Top 5 Freq Words(LDA(III))
Best_Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Best model for this article is LDA(I) Algorithm,1315,1315,1315,1315,1315,1315,1315,1315
Best model for this article is LDA(II) Algorithm,264,264,264,264,264,264,264,264
Best model for this article is LDA(III) Algorithm,4,4,4,4,4,4,4,4
Best model for this article is LSI(I) Algorithm,464,464,464,464,464,464,464,464
Best model for this article is LSI(II) Algorithm,173,173,173,173,173,173,173,173
Best model for this article is LSI(III) Algorithm,5,5,5,5,5,5,5,5



Six different combination of vectorization(Count and TF-IDF), LSI and LDA, most frequent words, and speech filter of noun. From the result we can observer that LDA(I) algorithm works best as count is maximum for same. LDA algorithm with TF-IDF vectorization is best on the dataset as the keywords from the most dominant topic are more relevant and descriptive for each article.

After LDA(I) second best LSI(I) algorithm followed by LDA(II) and LSI(II). LSI(III) and LDA(III) performed poorest.

