## Topic Modeling (LDA) ##

### LDA :- Latent Dirichlet Allocation Model/Algorithm ###

#### By Vedant Lotia ####

In [1]:
import gensim
import wikipedia
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
basketball = wikipedia.page('Basketball')
politics = wikipedia.page('Politics')
astronomy = wikipedia.page('Astronomy')
cricket = wikipedia.page('Cricket')
animals = wikipedia.page('Animals')
biology = wikipedia.page('Biology')
neuroscience = wikipedia.page('Neuroscience')
film = wikipedia.page('Film')
money = wikipedia.page('Money')


In [3]:
film.content

'A film, also called a movie, motion picture or moving picture, is a work of visual art that simulates experiences and otherwise communicates ideas, stories, perceptions, feelings, beauty, or atmosphere through the use of moving images. These images are generally accompanied by sound, and more rarely, other sensory stimulations. The word "cinema", short for cinematography, is often used to refer to filmmaking and the film industry, and to the art form that is the result of it.\n\n\n== Recording and transmission of film ==\nThe moving images of a film are created by photographing actual scenes with a motion-picture camera, by photographing drawings or miniature models using traditional animation techniques, by means of CGI and computer animation, or by a combination of some or all of these techniques, and other visual effects.\nBefore the introduction of digital production, series of still images were recorded on a strip of chemically sensitized celluloid (photographic film stock), usua

In [4]:
corpus = [basketball.content,politics.content,astronomy.content,cricket.content,animals.content,biology.content,neuroscience.content,film.content,money.content]
len(corpus)


9

## Pre-processing ##

In [5]:
import re
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
stemmer = WordNetLemmatizer()

def preprocess_text(document):
    
    document = re.sub(r'\W',' ', str(document)) ## Removing Special Characters ##
    
    document = re.sub(r'\s+[a-zA-Z]\s+',' ', document) ## Removing all single Characters ##
    
    document = re.sub(r'\^[a-zA-Z]\s+',' ', document) ## Removing single Characters from start ##
    
    document = re.sub(r'\s+',' ',document, flags=re.I) ## Removing Special CharactersSubstituting multiple space with single space ##
     
    document = re.sub(r'^b\s',' ', document) ## Removing prefixed 'b' ##
    
    document = document.lower() ## Converting to lower cases ##
    
    tokens = document.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 5]
    
    return tokens

processed_data = [];
for doc in corpus:
    tokens = preprocess_text(doc)
    processed_data.append(tokens)

    

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
processed_data[0:100]

[['baseball',
  'played',
  'opposing',
  'player',
  'batting',
  'fielding',
  'proceeds',
  'player',
  'fielding',
  'called',
  'pitcher',
  'player',
  'batting',
  'objective',
  'offensive',
  'batting',
  'allowing',
  'player',
  'advance',
  'counter',
  'clockwise',
  'around',
  'called',
  'objective',
  'defensive',
  'fielding',
  'prevent',
  'batter',
  'becoming',
  'runner',
  'prevent',
  'runner',
  'advance',
  'around',
  'scored',
  'runner',
  'legally',
  'advance',
  'around',
  'player',
  'started',
  'batter',
  'winner',
  'objective',
  'batting',
  'player',
  'safely',
  'player',
  'batting',
  'without',
  'called',
  'attempt',
  'advance',
  'subsequent',
  'runner',
  'either',
  'immediately',
  'teammate',
  'batting',
  'fielding',
  'prevent',
  'getting',
  'batter',
  'runner',
  'pitcher',
  'fielder',
  'method',
  'getting',
  'batting',
  'player',
  'opposing',
  'switch',
  'batting',
  'fielding',
  'batting',
  'fielding',
  'record

In [7]:
from gensim import corpora

In [8]:
input_dict = corpora.Dictionary(processed_data)
input_corpus = [input_dict.doc2bow(token, allow_update = True) for token in processed_data]
input_corpus[0:10]

[[(0, 1),
  (1, 1),
  (2, 5),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 2),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (15, 3),
  (16, 1),
  (17, 2),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 3),
  (23, 1),
  (24, 15),
  (25, 2),
  (26, 1),
  (27, 2),
  (28, 1),
  (29, 4),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 2),
  (36, 1),
  (37, 1),
  (38, 2),
  (39, 4),
  (40, 3),
  (41, 1),
  (42, 3),
  (43, 1),
  (44, 2),
  (45, 2),
  (46, 1),
  (47, 4),
  (48, 7),
  (49, 30),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 4),
  (54, 1),
  (55, 5),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 6),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 14),
  (65, 3),
  (66, 1),
  (67, 3),
  (68, 1),
  (69, 1),
  (70, 4),
  (71, 1),
  (72, 4),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 16),
  (78, 2),
  (79, 3),
  (80, 12),
  (81, 1),
  (82, 4),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 2),
  (88, 2),
  (89, 1),
  (90, 12),
  (

In [9]:
lda_model = gensim.models.ldamodel.LdaModel(input_corpus,num_topics=5,id2word=input_dict,passes=25)

In [10]:
topics = lda_model.print_topics(num_words=10)
topics

[(0,
  '0.016*"animal" + 0.012*"astronomy" + 0.007*"geology" + 0.007*"geological" + 0.006*"galaxy" + 0.006*"planet" + 0.004*"object" + 0.004*"system" + 0.004*"observation" + 0.004*"formation"'),
 (1,
  '0.000*"animal" + 0.000*"system" + 0.000*"astronomy" + 0.000*"currency" + 0.000*"include" + 0.000*"medium" + 0.000*"government" + 0.000*"called" + 0.000*"theory" + 0.000*"geological"'),
 (2,
  '0.000*"political" + 0.000*"animal" + 0.000*"baseball" + 0.000*"politics" + 0.000*"system" + 0.000*"cricket" + 0.000*"player" + 0.000*"include" + 0.000*"league" + 0.000*"batter"'),
 (3,
  '0.016*"political" + 0.010*"system" + 0.008*"neuroscience" + 0.007*"politics" + 0.006*"society" + 0.006*"production" + 0.005*"neuron" + 0.004*"theory" + 0.004*"picture" + 0.004*"example"'),
 (4,
  '0.019*"baseball" + 0.017*"cricket" + 0.015*"batter" + 0.012*"player" + 0.012*"league" + 0.008*"inning" + 0.006*"wicket" + 0.006*"batting" + 0.006*"played" + 0.006*"pitcher"')]

In [13]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, input_corpus, dictionary=lda_model.id2word)
vis

  default_term_info = default_term_info.sort_values(


In [12]:
text = 'Every player will have their one bad game of a season. '
text = preprocess_text(text)
text = input_dict.doc2bow(text)
lda_model.get_document_topics(textt

[(0, 0.06682508),
 (1, 0.066673234),
 (2, 0.06667328),
 (3, 0.06667018),
 (4, 0.73315823)]