In [6]:
import pandas as pd
# Show up to 200 columns when a DataFrame is displayed.
pd.options.display.max_columns = 200
# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
# NLTK's English stop-word list.
# NOTE(review): `stop` is not referenced again in the visible code (token
# filtering uses `stop_words`, loaded from a file) - confirm it is still needed.
stop = set(stopwords.words('english'))
from string import punctuation

from collections import Counter
import re
import numpy as np

from tqdm import tqdm_notebook
# Hooks tqdm into pandas; this provides the `progress_map` calls used later on.
tqdm_notebook().pandas()

import matplotlib.pyplot as plt

# Load the news articles and keep a single row per distinct description.
data = pd.read_csv('./data-model/news.csv').drop_duplicates(subset='description')

# Discard rows with no description, then keep only descriptions whose length
# is above 140 and at most 300 characters.
data = data[data['description'].notnull()]
data = data[data['description'].map(len).between(141, 300)]

# Reproducible 100-row sample, re-indexed from 0.
data = data.sample(n=100, random_state=42).reset_index(drop=True)

# Custom stop-word list: one word per line in the text file, plus a few extras.
# The context manager guarantees the file handle is closed once the words are
# read (the handle was previously left open).
with open('./data-model/stopwords.txt', 'r') as f:
    stop_words = [line.replace('\n', '') for line in f]

additional_stop_words = ['t', 'will']
stop_words += additional_stop_words

def _removeNonAscii(s):
    """Return `s` without any character whose code point is 128 or higher."""
    ascii_only = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_only)

def clean_text(text):
    """Normalise raw text into lower-case words separated by single spaces.

    The text is lower-cased, common English contractions are expanded, the
    "(ap)" tag is removed, runs of non-word characters and of whitespace are
    collapsed to one space, anything that is not a letter, a space, "?" or
    "!" is deleted, non-ASCII characters are dropped and the result is
    stripped of leading/trailing whitespace.
    """
    cleaned = re.sub(r"what's", "what is ", text.lower())
    cleaned = cleaned.replace('(ap)', '')

    # Applied strictly in this order; later patterns see the output of
    # earlier ones.
    # NOTE: the \W+ pass already turns punctuation (quotes, backslashes,
    # "?" and "!") into spaces, so the patterns after it mostly act on
    # digits, underscores and non-ASCII letters.
    substitutions = (
        (r"\'s", " is "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r'\W+', ' '),
        (r'\s+', ' '),
        (r"\\", ""),
        (r"\'", ""),
        (r"\"", ""),
        ('[^a-zA-Z ?!]+', ''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return _removeNonAscii(cleaned).strip()

def tokenizer(text):
    """Clean `text` and return its word tokens with stop words removed.

    The text is normalised with `clean_text`, split into sentences and then
    into words, and every token found in `stop_words` or in
    `string.punctuation` is dropped.

    Args:
        text: raw text to tokenize.

    Returns:
        List of remaining tokens in their original order; an empty list when
        cleaning leaves nothing to tokenize.
    """
    text = clean_text(text)
    # Flatten sentence -> word tokens in one pass. Unlike folding the
    # per-sentence lists without an initial value, this also works when the
    # cleaned text contains no sentences at all.
    tokens = [token for sent in sent_tokenize(text) for token in word_tokenize(sent)]
    # Build the exclusion set once per call instead of concatenating the two
    # lists again for every single token.
    excluded = set(stop_words) | set(punctuation)
    return [token for token in tokens if token not in excluded]

def reduce(function, iterable, initializer=None):
    """Fold `iterable` from left to right into a single value.

    Args:
        function: two-argument callable invoked as
            `function(accumulated, element)`.
        iterable: the elements to fold.
        initializer: starting value. `None` means "no initializer", in which
            case the first element of `iterable` is used as the start.

    Returns:
        The accumulated value.

    Raises:
        TypeError: if `iterable` is empty and no initializer was given
            (same contract as `functools.reduce`).
    """
    it = iter(iterable)
    if initializer is None:
        try:
            value = next(it)
        except StopIteration:
            # Do not let a bare StopIteration escape: inside map()/generator
            # code it would silently end the surrounding iteration.
            raise TypeError(
                "reduce() of empty iterable with no initial value"
            ) from None
    else:
        value = initializer
    for element in it:
        value = function(value, element)
    return value

# Coerce every description to `str`, then tokenize each one (with a progress
# bar) into a new `tokens` column.
data['description'] = data['description'].map(str)
data['tokens'] = data['description'].progress_map(tokenizer)

def keywords(category):
    """Return the 10 most frequent tokens for one article category.

    Reads the module-level `data` frame, selects the rows whose `category`
    column equals `category` and counts every token in their `tokens` lists.

    Returns:
        List of `(token, count)` pairs as produced by
        `Counter.most_common(10)`.
    """
    in_category = data['category'] == category
    frequency = Counter()
    for token_list in data.loc[in_category, 'tokens']:
        frequency.update(token_list)
    return frequency.most_common(10)

# warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import gensim
import gensim.corpora as corpora
from gensim import matutils
from gensim.models import CoherenceModel

# Work on a copy so the extra bigram column is not added to `data`.
aux = data.copy()

# Train a phrase (bigram) model on the token lists, wrap it in a Phraser and
# apply it to every document's tokens.
# NOTE(review): `data` was sampled down to 100 rows earlier; with min_count=5
# and threshold=100 very few token pairs may be merged - confirm these values.
bigram = gensim.models.Phrases(aux['tokens'], min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
aux['tokens_bigram'] = aux['tokens'].progress_map(lambda tokens: bigram_mod[tokens])

# Token dictionary and bag-of-words corpus; LDA_model() and
# compute_coherence() read `corpus`, `id2word` and `aux` as module globals.
id2word = corpora.Dictionary(aux['tokens_bigram'])
texts = aux['tokens_bigram'].values
corpus = [id2word.doc2bow(text) for text in texts]

def LDA_model(num_topics, passes=1):
    """Train and return a gensim LDA model on the module-level corpus.

    Args:
        num_topics: number of topics to fit.
        passes: number of training passes over the corpus (default 1).

    The model is built from the module-level `corpus` (wrapped in a notebook
    progress bar) and `id2word`, with the random seed fixed at 100.
    """
    training_options = {
        'corpus': tqdm_notebook(corpus, leave=False),
        'id2word': id2word,
        'num_topics': num_topics,
        'random_state': 100,
        'eval_every': 10,
        'chunksize': 2000,
        'passes': passes,
        'per_word_topics': True,
    }
    return gensim.models.ldamodel.LdaModel(**training_options)

def compute_coherence(model):
    """Return the `c_v` coherence score of `model`.

    The score is computed against the module-level `aux['tokens_bigram']`
    texts and the module-level `id2word` dictionary.
    """
    tokenized_texts = aux['tokens_bigram'].values
    scorer = CoherenceModel(
        model=model,
        texts=tokenized_texts,
        dictionary=id2word,
        coherence='c_v',
    )
    return scorer.get_coherence()

def display_topics(model, num_words=10):
    """Tabulate the top keywords of every topic in `model`.

    Args:
        model: trained topic model exposing `num_topics` and
            `show_topics(num_topics=..., formatted=False, num_words=...)`,
            where each returned entry is `(topic_id, [(word, weight), ...])`.
        num_words: number of keywords to list per topic. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        DataFrame with one row per topic (index `topic_<id>`) and one column
        per keyword rank (`keyword_1`, `keyword_2`, ...).
    """
    topics = list(
        model.show_topics(num_topics=model.num_topics, formatted=False, num_words=num_words)
    )
    # Keep only the words, as concrete lists, rather than handing pandas
    # nested lazy `map` objects.
    rows = [[entry[0] for entry in word_weights] for _, word_weights in topics]
    df = pd.DataFrame(rows)
    df.index = ['topic_{0}'.format(topic_id) for topic_id, _ in topics]
    # Label from the actual width so the labels always match the data, even
    # if fewer than `num_words` keywords come back.
    df.columns = ['keyword_{0}'.format(i) for i in range(1, df.shape[1] + 1)]
    return df

def explore_models(df, rg=range(5, 25)):
    """Train one LDA model per topic count on `df` and plot their coherence.

    Args:
        df: DataFrame with a `tokens_bigram` column holding one token list
            per document.
        rg: iterable of topic counts to try (default 5..24).

    Returns:
        `(coherences, models)`: two lists aligned with the topic counts in
        `rg`, holding each model's `c_v` coherence and the model itself.
    """
    id2word = corpora.Dictionary(df['tokens_bigram'])
    texts = df['tokens_bigram'].values
    corpus = [id2word.doc2bow(text) for text in texts]
    # Materialise once: the counts are iterated for training and reused as
    # the x-axis of the plot.
    topic_counts = list(rg)

    models = []
    coherences = []

    for num_topics in tqdm_notebook(topic_counts, leave=False):
        # Train and score on the dictionary/corpus built from `df`. The
        # module-level LDA_model()/compute_coherence() helpers read the
        # module-level corpus instead, which made the `df` argument a no-op.
        # Settings mirror LDA_model(num_topics, passes=5).
        lda_model = gensim.models.ldamodel.LdaModel(
            corpus=tqdm_notebook(corpus, leave=False),
            id2word=id2word,
            num_topics=num_topics,
            random_state=100,
            eval_every=10,
            chunksize=2000,
            passes=5,
            per_word_topics=True,
        )
        models.append(lda_model)
        coherence = CoherenceModel(
            model=lda_model,
            texts=texts,
            dictionary=id2word,
            coherence='c_v',
        ).get_coherence()
        coherences.append(coherence)

    plt.figure(figsize=(15, 5))
    plt.title('Choosing the optimal number of topics')
    plt.xlabel('Number of topics')
    plt.ylabel('Coherence')
    plt.grid(True)
    plt.plot(topic_counts, coherences)

    return coherences, models

# Fit the model whose topics are displayed: 50 topics, 5 passes.
# NOTE(review): 50 topics are requested although `data` was sampled down to
# 100 documents, and explore_models() is never called in the visible code -
# confirm how this topic count was chosen.
best_model = LDA_model(num_topics=50, passes=5)

# One row per topic with its top keywords.
dff = display_topics(model=best_model)
from IPython.display import display, HTML

display(dff)

# Show the working DataFrame, including the `tokens` and `tokens_bigram` columns.
display(aux)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0), HTML(value='')))




HBox(children=(IntProgress(value=0), HTML(value='')))




HBox(children=(IntProgress(value=0), HTML(value='')))

Unnamed: 0,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5,keyword_6,keyword_7,keyword_8,keyword_9,keyword_10
topic_0,carbon,friday,states,change,efforts,climate,dioxide,achieve,address,microsoft
topic_1,president,impeachment,senate,trial,op,stand,articles,trump,wall,vice
topic_2,australian,jones,hope,wildlife,charity,weeks,defensive,studios,field,charities
topic_3,economy,boost,tariffs,china,beijing,olcott,analyst,stabilize,reduced,confidence
topic_4,india,thursday,national,friday,term,south,monarchy,enters,woman,delivered
topic_5,donald,job,illegally,state,ambassador,ukraine,emerged,maintained,trump,pompeo
topic_6,uber,gaming,elevate,air,vertical,complete,picked,landing,ces,best
topic_7,points,pearson,actions,zone,watford,side,relegation,reborn,ayatollah,point
topic_8,china,windows,microsoft,chromium,edge,reilly,industry,investor,transportation,introduce
topic_9,satellite,quiet,polemic,india,unrivalled,champion,shared,gsat,reaction,public


Unnamed: 0,author,title,description,url,urlToImage,publishedAt,source,category,scraping_date,tokens,tokens_bigram
0,Manish Singh,Xiaomi spins off POCO as an independent brand,"Xiaomi said today it is spinning off POCO, a s...",https://techcrunch.com/2020/01/17/xiaomi-spins...,https://techcrunch.com/wp-content/uploads/2020...,2020-01-17T08:05:15Z,techcrunch,technology,2020-01-17 16:50:20.096355,"[xiaomi, today, spinning, poco, brand, created...","[xiaomi, today, spinning, poco, brand, created..."
1,JILL COLVIN,Trump campaign tries robust outreach to expand...,WASHINGTON (AP) — Selfies on a “Women for Trum...,https://apnews.com/93e950de2526338e53df35e2f16...,https://storage.googleapis.com/afs-prod/media/...,2020-01-17T05:53:03Z,associated-press,general,2020-01-17 16:50:20.096355,"[washington, selfies, women, trump, bus, tour,...","[washington, selfies, women, trump, bus, tour,..."
2,Football Italia Staff,Bonucci discussed return with ultras,Messages between Leonardo Bonucci and a leadin...,http://www.football-italia.net/148837/bonucci-...,https://www.football-italia.net/sites/default/...,2020-01-17T01:00:00Z,football-italia,sports,2020-01-17 16:50:20.096355,"[messages, leonardo, bonucci, leading, juventu...","[messages, leonardo, bonucci, leading, juventu..."
3,Michael McWhertor,"Overwatch Lunar New Year 2020 skins, dates, an...",The Year of the Rat begins in Overwatch with L...,https://www.polygon.com/2020/1/16/21064247/ove...,https://cdn.vox-cdn.com/thumbor/f4oLIQpibiTgwj...,2020-01-16T19:10:00Z,polygon,entertainment,2020-01-17 16:50:20.096355,"[year, rat, overwatch, lunar, year, blizzard, ...","[year, rat, overwatch, lunar, year, blizzard, ..."
4,BBC Sport,Australian Open 2020: Margaret Court to be 're...,"Margaret Court was a quiet champion, unrivalle...",https://www.bbc.co.uk/sport/tennis/51105556,https://ichef.bbci.co.uk/onesport/cps/624/cpsp...,2020-01-17T00:01:59Z,bbc-sport,sports,2020-01-17 16:50:20.096355,"[margaret, court, quiet, champion, unrivalled,...","[margaret, court, quiet, champion, unrivalled,..."
...,...,...,...,...,...,...,...,...,...,...,...
95,"KATE BRUMBACK, DEEPTI HAJELA and AMY TAXIN","AP visits immigration courts across US, finds ...","LUMPKIN, Ga. (AP) — In a locked, guarded court...",https://apnews.com/7851364613cf0afbf67cf793094...,,2020-01-16T15:37:53Z,associated-press,general,2020-01-17 16:50:20.096355,"[lumpkin, ga, locked, guarded, courtroom, comp...","[lumpkin, ga, locked, guarded, courtroom, comp..."
96,David Brennan,Iran Plane Investigation Will Draw on Lessons ...,Representatives for the multinational Internat...,https://www.newsweek.com/iran-plane-investigat...,https://d.newsweek.com/en/full/1560705/iran-fl...,2020-01-16T17:01:18Z,newsweek,general,2020-01-17 16:50:20.096355,"[representatives, multinational, international...","[representatives, multinational, international..."
97,Hayley Peterson,Amazon contractor Pinnacle Logistics to lay of...,Texas-based Pinnacle Logistics plans to lay of...,http://uk.businessinsider.com/amazon-prime-con...,https://i.insider.com/5e20ceb8b6d52d5a0c040772...,2020-01-16T21:54:07Z,business-insider-uk,business,2020-01-17 16:50:20.096355,"[texas, based, pinnacle, logistics, plans, lay...","[texas, based, pinnacle, logistics, plans, lay..."
98,LISA MASCARO,Trump's trial begins at the start of an electi...,WASHINGTON (AP) — The U.S. Senate opened the i...,https://apnews.com/e2ca12d7bcee4a01ec1d10291b0...,https://storage.googleapis.com/afs-prod/media/...,2020-01-17T05:09:28Z,associated-press,general,2020-01-17 16:50:20.096355,"[washington, senate, opened, impeachment, tria...","[washington, senate, opened, impeachment, tria..."
