In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
import gensim.corpora as corpora
from pprint import pprint
from gensim.corpora import MmCorpus

In [34]:
# Load the BBC articles (one row per article: category label + raw text).
bbc_df = pd.read_csv("../data/bbc-text.csv")
# NLTK's English stop words, extended with "said", which is added here as a
# corpus-specific stop word.
stop_words = stopwords.words('english') + ["said"]
print(bbc_df)

           category                                               text
0              tech  tv future in the hands of viewers with home th...
1          business  worldcom boss  left books alone  former worldc...
2             sport  tigers wary of farrell  gamble  leicester say ...
3             sport  yeading face newcastle in fa cup premiership s...
4     entertainment  ocean s twelve raids box office ocean s twelve...
...             ...                                                ...
2220       business  cars pull down us retail figures us retail sal...
2221       politics  kilroy unveils immigration policy ex-chatshow ...
2222  entertainment  rem announce new glasgow concert us band rem h...
2223       politics  how political squabbles snowball it s become c...
2224          sport  souness delight at euro progress boss graeme s...

[2225 rows x 2 columns]


In [35]:
def clean_text(input_string):
    """Convert one raw document into a list of tokens without stop words.

    Punctuation is replaced by spaces and digits are deleted, the result is
    tokenised with gensim's ``simple_preprocess``, and every token present in
    the module-level ``stop_words`` collection is dropped.

    Args:
        input_string: Raw text of a single document (``str``).

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # Punctuation becomes a space so hyphenated words split into two tokens.
    input_string = re.sub(r'[^\w\s]', ' ', input_string)
    # Digits are removed outright (no space inserted).
    input_string = re.sub(r'\d', '', input_string)
    input_list = simple_preprocess(input_string)
    # Membership tests on the stop-word list are linear; build a set once per
    # call so each token lookup is constant time. The result is unchanged.
    stop_set = set(stop_words)
    return [word for word in input_list if word not in stop_set]

In [36]:
# Tokenise every article. This overwrites the raw strings in the 'text'
# column with token lists, so the cell must not be re-run without reloading
# bbc_df first (clean_text expects a string, not a list).
bbc_df['text'] = bbc_df['text'].apply(clean_text)
print(bbc_df)

           category                                               text
0              tech  [tv, future, hands, viewers, home, theatre, sy...
1          business  [worldcom, boss, left, books, alone, former, w...
2             sport  [tigers, wary, farrell, gamble, leicester, say...
3             sport  [yeading, face, newcastle, fa, cup, premiershi...
4     entertainment  [ocean, twelve, raids, box, office, ocean, twe...
...             ...                                                ...
2220       business  [cars, pull, us, retail, figures, us, retail, ...
2221       politics  [kilroy, unveils, immigration, policy, ex, cha...
2222  entertainment  [rem, announce, new, glasgow, concert, us, ban...
2223       politics  [political, squabbles, snowball, become, commo...
2224          sport  [souness, delight, euro, progress, boss, graem...

[2225 rows x 2 columns]


In [37]:
# Array of token lists, one entry per document.
texts = bbc_df['text'].to_numpy()
# Map each distinct token to an integer id.
id_dict = corpora.Dictionary(texts)
# Bag-of-words representation of every document.
corpus = list(map(id_dict.doc2bow, texts))

In [40]:
num_topics = 5
# LDA training is stochastic; a fixed random_state makes the learned topics
# (and their numbering) reproducible across kernel restarts.
lda_model = LdaModel(corpus=corpus,
                     id2word=id_dict,
                     num_topics=num_topics,
                     chunksize=100,
                     passes=20,
                     random_state=42)

In [41]:
# Show the highest-weighted words of each learned topic.
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)

[(0,
  '0.010*"people" + 0.006*"mobile" + 0.005*"one" + 0.005*"new" + '
  '0.005*"technology" + 0.004*"also" + 0.004*"use" + 0.004*"net" + '
  '0.004*"many" + 0.004*"digital"'),
 (1,
  '0.007*"game" + 0.006*"first" + 0.005*"time" + 0.005*"year" + '
  '0.005*"england" + 0.005*"world" + 0.005*"win" + 0.005*"last" + '
  '0.005*"players" + 0.005*"one"'),
 (2,
  '0.013*"bn" + 0.013*"us" + 0.011*"year" + 0.009*"sales" + 0.006*"market" + '
  '0.006*"company" + 0.005*"growth" + 0.005*"bank" + 0.005*"firm" + '
  '0.005*"also"'),
 (3,
  '0.009*"film" + 0.009*"blair" + 0.008*"party" + 0.008*"best" + 0.006*"also" '
  '+ 0.006*"one" + 0.005*"year" + 0.005*"show" + 0.004*"new" + 0.004*"us"'),
 (4,
  '0.020*"mr" + 0.013*"would" + 0.008*"government" + 0.007*"people" + '
  '0.006*"labour" + 0.005*"could" + 0.005*"election" + 0.005*"minister" + '
  '0.004*"told" + 0.004*"also"')]


In [42]:
def save_model(lda, lda_path, id_dict, dict_path, corpus, corpus_path):
    """Persist the LDA model, its dictionary and the bag-of-words corpus.

    Missing parent directories of the three target paths are created first,
    so saving into a fresh output folder does not fail.

    Args:
        lda: Trained model exposing ``save(path)``.
        lda_path: Destination for the model.
        id_dict: Dictionary exposing ``save(path)``.
        dict_path: Destination for the dictionary.
        corpus: Bag-of-words corpus to serialize.
        corpus_path: Destination for the corpus in Matrix Market format.
    """
    import os

    for target in (lda_path, dict_path, corpus_path):
        # Only filesystem paths have a directory to create; anything else
        # (e.g. an already-open handle) is passed through untouched.
        if isinstance(target, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(target))
            # A bare file name has no parent component; makedirs('') would raise.
            if parent:
                os.makedirs(parent, exist_ok=True)

    lda.save(lda_path)
    id_dict.save(dict_path)
    MmCorpus.serialize(corpus_path, corpus)

In [43]:
# Output locations for the three artefacts written by save_model.
model_path = "../models/bbc_gensim/lda.model"
dict_path = "../models/bbc_gensim/id2word.dict"
corpus_path = "../models/bbc_gensim/corpus.mm"
# Keyword arguments keep each object paired with its destination path.
save_model(
    lda=lda_model,
    lda_path=model_path,
    id_dict=id_dict,
    dict_path=dict_path,
    corpus=corpus,
    corpus_path=corpus_path,
)

In [44]:
# Reload both artefacts from disk, replacing the in-memory objects.
id_dict = corpora.Dictionary.load(fname=dict_path)
lda_model = LdaModel.load(fname=model_path)

In [46]:
# Sample news text (a football report) used as input for topic inference.
new_example = """Manchester United players slumped to the turf 
at full-time in Germany on Tuesday in acknowledgement of what their 
latest pedestrian first-half display had cost them. The 3-2 loss at 
RB Leipzig means United will not be one of the 16 teams in the draw 
for the knockout stages of the Champions League. And this is not the 
only price for failure. The damage will be felt in the accounts, in 
the dealings they have with current and potentially future players 
and in the faith the fans have placed in manager Ole Gunnar Solskjaer. 
With Paul Pogba's agent angling for a move for his client and ex-United 
defender Phil Neville speaking of a "witchhunt" against his former team-mate 
Solskjaer, BBC Sport looks at the ramifications and reaction to a big loss for United."""

In [47]:
# Apply the same preprocessing that was applied to the training articles.
input_list = clean_text(new_example)
# Convert the tokens to bag-of-words using the loaded dictionary.
bow = id_dict.doc2bow(input_list)
# Indexing the model with a bag-of-words yields (topic_id, probability) pairs.
topics = lda_model[bow]
print(topics)

[(1, 0.7338447), (2, 0.15261793), (4, 0.1073401)]
