## Import Libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import pyLDAvis
import gensim
from pprint import pprint
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim
import warnings
warnings.filterwarnings("ignore")

  from scipy.sparse.base import spmatrix
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.linalg.special_matrices import triu


## Read the news article CSV file

In [2]:
# Load the BBC News training set from the working directory and preview the first rows.
# The preview shows three columns: ArticleId, Text and Category.
df = pd.read_csv('BBC News Train.csv')
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [3]:
# random article
df['Text'][90], df['Category'][90]

 'politics')

In [4]:
df['Category'].unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

###  Tokenize, clean and lowercase the documents using gensim.utils.simple_preprocess

In [5]:
data = df['Text'].to_list()

In [6]:
def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Parameters
    ----------
    sentences : iterable
        Raw documents; each item is passed through ``str()`` before tokenizing.

    Yields
    ------
    list of str
        Tokens of one document, produced with ``deacc=True``.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_text in sentences:
        yield tokenize(str(raw_text), deacc = True)

In [7]:
data_words = list(sent_to_words(data))

### Lemmatize the documents to reduce the total number of unique words in the dictionary

In [8]:
nlp = spacy.load('en_core_web_sm', disable = ['parser', 'ner'])

def lemmatize(texts, allowed_pos = ('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with selected POS tags.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each token list is re-joined with single spaces
        before being handed to the module-level spaCy pipeline ``nlp``.
    allowed_pos : container of str
        Coarse POS tags (``token.pos_`` values) to keep. An immutable tuple is
        used as the default so the default can never be mutated between calls.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input order.
    """
    joined_docs = (' '.join(tokens) for tokens in texts)
    # nlp.pipe streams the documents through the pipeline in batches, which
    # avoids the per-call overhead of invoking nlp(...) once per document.
    return [
        ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_pos)
        for doc in nlp.pipe(joined_docs)
    ]

In [9]:
data_lemmatized = lemmatize(data_words)

### Create document term matrix using sklearn's  CountVectorizer

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words settings:
#   - stop_words = 'english' removes scikit-learn's built-in English stop words
#   - min_df = 5 ignores terms that occur in fewer than 5 documents
#   - token_pattern keeps only runs of 3 or more ASCII letters/digits as tokens
vectorizer = CountVectorizer(lowercase = True, stop_words = 'english', min_df = 5, token_pattern = '[a-zA-Z0-9]{3,}')

# Learn the vocabulary from the lemmatized documents and build the document-term matrix.
data_vectorized = vectorizer.fit_transform(data_lemmatized)

### Build LDA model with sklearn 

In [12]:
from sklearn.decomposition import LatentDirichletAllocation

# Baseline model: 5 topics, online learning in mini-batches of 128 documents,
# all CPU cores (n_jobs = -1) and a fixed random_state so reruns give the same fit.
lda_model = LatentDirichletAllocation(n_components = 5, learning_method = 'online', max_iter = 10, batch_size = 128, 
                                      n_jobs = -1, random_state = 64)

# Fit on the document-term matrix and keep the per-document topic weights.
lda_output = lda_model.fit_transform(data_vectorized)

In [13]:
pd.DataFrame(lda_output).head()

Unnamed: 0,0,1,2,3,4
0,0.6633,0.019783,0.001666,0.313592,0.00166
1,0.994099,0.001471,0.001479,0.001476,0.001475
2,0.641186,0.001072,0.001078,0.00109,0.355573
3,0.000747,0.000752,0.000748,0.997008,0.000745
4,0.86909,0.001416,0.020906,0.038107,0.070481


In [14]:
# We know beforehand that the dataset contains 5 categories. Let's check the log likelihood, perplexity and
# coherence score for different values of n_components and select the number of topics for LDA accordingly.

# The coherence score for a sklearn LatentDirichletAllocation model is computed with the tmtoolkit library.

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name only on versions that do not have it.
_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names

# Loop-invariant inputs of the coherence metric, built once instead of once per candidate model.
coherence_vocab = np.array(_feature_names())
coherence_texts = [doc.split() for doc in data_lemmatized]

# Collect one record per candidate and build the frame once, rather than growing it row by row.
metric_rows = []

for i in range(3, 8):
    lda_model = LatentDirichletAllocation(n_components = i, learning_method = 'online', max_iter = 10, batch_size = 128,
                                          n_jobs = -1, random_state = 64)
    # Only the fitted model is needed here, so fit() replaces the discarded fit_transform() output.
    lda_model.fit(data_vectorized)
    # return_mean = False gives one c_v value per topic so the spread can be reported as well.
    c_v = metric_coherence_gensim(measure = 'c_v',
                                  top_n = 10,
                                  topic_word_distrib = lda_model.components_,
                                  dtm = data_vectorized,
                                  vocab = coherence_vocab,
                                  texts = coherence_texts,
                                  return_mean = False)
    c_v_text = f'{np.round(np.mean(c_v), 4)} ± {np.round(np.std(c_v), 4)}'
    metric_rows.append({'n_components': i,
                        'Log Likelihood': np.round(lda_model.score(data_vectorized), 2),
                        'Perplexity': np.round(lda_model.perplexity(data_vectorized), 2),
                        'Coherence Score': c_v_text})

metrics = pd.DataFrame(metric_rows, columns = ['n_components', 'Log Likelihood', 'Perplexity', 'Coherence Score'])
metrics

Unnamed: 0,n_components,Log Likelihood,Perplexity,Coherence Score
0,3,-1591501.7,1481.73,0.4122 ± 0.0788
1,4,-1578413.04,1395.38,0.4683 ± 0.0827
2,5,-1564565.07,1309.5,0.5497 ± 0.0787
3,6,-1580467.74,1408.6,0.5148 ± 0.1292
4,7,-1563982.13,1306.0,0.5312 ± 0.1291


In [15]:
# Final model: 5 topics, which has the highest mean coherence in the comparison table above
# and matches the number of categories in the dataset. Same hyper-parameters and seed as before.
best_lda = LatentDirichletAllocation(n_components = 5, learning_method = 'online', max_iter = 10, batch_size = 128, 
                                     n_jobs = -1, random_state = 64)

In [16]:
lda_output = best_lda.fit_transform(data_vectorized)

### Display the topic distribution of each document along with its dominant topic

In [17]:
# Derive one column label per fitted topic so this cell keeps working if n_components changes.
topic_columns = [f'Topic{k}' for k in range(lda_output.shape[1])]
topics = pd.DataFrame(lda_output, columns = topic_columns)
# The dominant topic is the column with the largest weight in each document's row.
topics['Dominant Topic'] = np.argmax(lda_output, axis = 1)
topics.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Dominant Topic
0,0.6633,0.019783,0.001666,0.313592,0.00166,0
1,0.994099,0.001471,0.001479,0.001476,0.001475,0
2,0.641186,0.001072,0.001078,0.00109,0.355573,0
3,0.000747,0.000752,0.000748,0.997008,0.000745,3
4,0.86909,0.001416,0.020906,0.038107,0.070481,0
5,0.002234,0.002203,0.887216,0.017022,0.091326,2
6,0.001774,0.001772,0.992893,0.001774,0.001786,2
7,0.082232,0.545914,0.127662,0.002578,0.241615,1
8,0.995911,0.001019,0.001022,0.001029,0.001019,0
9,0.13152,0.861627,0.002277,0.002277,0.002299,1


In [18]:
topics['Dominant Topic'].value_counts()

2    375
0    342
4    268
3    258
1    247
Name: Dominant Topic, dtype: int64

### Let's check the top 20 words for each topic 

In [19]:
def show_topics(vectorizer, lda_model, n_words = 20):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        older ``get_feature_names()``.
    lda_model : fitted topic model
        Must expose ``components_``, iterated row by row as per-topic term weights.
    n_words : int
        Number of top terms to return for each topic.

    Returns
    -------
    numpy.ndarray
        One row per topic, holding that topic's terms in descending weight order.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and only fall back to the old name when the new one is unavailable.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    keywords = np.array(feature_names())
    # Negating the weights makes argsort return indices from largest to smallest weight.
    topic_keywords = [keywords.take((-topic_weights).argsort()[:n_words])
                      for topic_weights in lda_model.components_]
    return np.array(topic_keywords)

In [20]:
# One column per topic, one row per rank; labels are derived from the number of topics returned
# so the table does not break when the model is refit with a different n_components.
top_words = show_topics(vectorizer, best_lda, n_words = 20)
pd.DataFrame(top_words.T, columns = [f'Topic{k}' for k in range(top_words.shape[0])])

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4
0,say,film,say,say,say
1,year,good,win,use,government
2,company,award,game,people,election
3,market,year,year,mobile,party
4,firm,say,play,phone,people
5,rise,star,time,make,labour
6,sale,win,make,technology,plan
7,growth,include,good,service,tory
8,economy,music,player,new,law
9,share,actor,come,year,public


### Visualize the topic model with  pyLDAvis

In [None]:
# pyLDAvis 3.4 renamed its scikit-learn adapter from pyLDAvis.sklearn to pyLDAvis.lda_model.
# Try the new module first and fall back to the old name on earlier releases; the alias is
# kept as LDAsklearn so the rest of the cell is unchanged.
try:
    from pyLDAvis import lda_model as LDAsklearn
except ImportError:
    from pyLDAvis import sklearn as LDAsklearn

pyLDAvis.enable_notebook()
# Build the interactive panel from the fitted model, the document-term matrix and the vectorizer,
# using t-SNE to lay out the topics.
panel = LDAsklearn.prepare(best_lda, data_vectorized, vectorizer, mds = 'tsne')
panel

## News article recommendation using topic modelling 

In [23]:
# Function to get the topic distribution of a given text using the fitted LDA model

def text_2_topics(text):
    """Run one raw text through the notebook's preprocessing and the fitted LDA model.

    The text is tokenized with ``sent_to_words``, lemmatized with ``lemmatize``,
    converted to term counts with the fitted ``vectorizer`` and finally passed
    to ``best_lda.transform``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    The output of ``best_lda.transform`` for this single document.
    """
    tokenized = list(sent_to_words([text]))
    doc_term_counts = vectorizer.transform(lemmatize(tokenized))
    return best_lda.transform(doc_term_counts)

In [45]:
# Fit NearestNeighbors on the LDA output so that articles can be looked up by
# similarity of their topic distributions.

from sklearn.neighbors import NearestNeighbors

# Five neighbours are requested; random_example() skips the first hit and prints the other four.
neighbors = NearestNeighbors(n_neighbors = 5, n_jobs = -1)
neighbors.fit(lda_output)

In [88]:
# Print a short LSA-based summary of a document using the sumy package (3 sentences by default).

def summarize(para, sentence_count = 3):
    """Print an extractive summary of ``para``, one sentence per line.

    Parameters
    ----------
    para : str
        Text to summarize.
    sentence_count : int
        Number of sentences requested from the summarizer.

    Returns
    -------
    None
        The selected sentences are printed, each passed through ``str.capitalize()``.
    """
    # sumy is only needed here, so it is imported locally.
    from sumy.nlp.stemmers import Stemmer
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.summarizers.lsa import LsaSummarizer
    from sumy.utils import get_stop_words

    language = 'english'

    parsed = PlaintextParser.from_string(para, Tokenizer(language))

    lsa = LsaSummarizer(Stemmer(language))
    lsa.stop_words = get_stop_words(language)

    for picked in lsa(parsed.document, sentence_count):
        print(str(picked).capitalize())

In [100]:
# Function to check the recommendations for a random news article

def random_example():
    """Print a random article's summary and category, then summaries of similar articles.

    Uses the notebook-level ``df``, ``summarize``, ``text_2_topics`` and the fitted
    ``neighbors`` index. Neighbour positions are used as ``df`` labels, so this
    presumably relies on ``df`` having its default 0..n-1 index -- confirm if ``df``
    is ever filtered or re-indexed.
    """
    # randint's upper bound is exclusive, so the lower bound must be 0 for the
    # first article to be selectable as well.
    i = np.random.randint(0, len(df))
    summarize(df.loc[i, 'Text'], sentence_count = 3)
    print(df['Category'][i], end = '\n\n\n')
    print('\t\t\t\t\t\t\t\t News Recommendations', end = '\n\n')

    query_topics = text_2_topics(df['Text'][i])[0].reshape(1, -1)
    _, indices = neighbors.kneighbors(query_topics)

    # Remove the query article by index instead of assuming it is always the first hit:
    # with duplicate articles or tied distances another row can come first, and the
    # query itself would then be shown as a recommendation.
    hits = indices[0]
    recommended = [ind for ind in hits if ind != i][:len(hits) - 1]

    for ind in recommended:
        summarize(df.loc[ind, 'Text'], sentence_count = 3)
        print('--------------------------------------------------', end = '\n\n')

In [103]:
random_example()

Following the fascination with the writing of salam pax - not his real name - he began a regular column in the guardian newspaper and was given a crash course in documentary film-making.
For the film he travelled iraq to document the changing landscape of the country and the problems it has faced since the invasion  speaking to ordinary iraqis about their experiences.
Rasheed said the title was refers to the isolation felt by iraqis under saddam s regime and the difficult time the country is now experiencing.
entertainment


								 News Recommendations

Sizzla  whose real name is miguel collins  has released 25 albums since 1995 and is credited with taking dancehall music back to its reggae origins.
Sizzla s uk tour was cancelled after scotland yard s racial and violent crime taskforce announced it was examining lyrics by eight reggae artists.
--------------------------------------------------

Gibson and his icon productions partner bruce davey said they would not be campaigning in 