# Topic analysis using LDA

This notebook aims to discover which topics were discussed the most in the Cricket subreddit. To do this, we'll use a popular topic-modelling technique called LDA (Latent Dirichlet Allocation).

First, we'll take a look at the structure of the dataset.

In [None]:
import warnings

# Keep cell output readable: ignore deprecation and future-change notices.
warnings.simplefilter(action="ignore", category=DeprecationWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
import numpy as np
import gensim
import nltk
import pandas as pd
import pyLDAvis.gensim_models
import re
import seaborn

from gensim.utils import simple_preprocess
import gensim.corpora as corpora
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from pprint import pprint

%matplotlib inline

In [None]:
# Load the Reddit cricket dataset from its Kaggle input path.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/cricket-on-reddit/reddit_cricket.csv',
)

In [None]:
# Preview the first five rows to inspect the available columns.
df.head(n=5)

In [None]:
# Replace missing bodies with an empty string *before* casting: astype(str)
# alone would turn every NaN into the literal word 'nan', which then shows up
# in the word cloud and in the token counts.
df['body'] = df['body'].fillna('').astype(str)

Now we'll remove common punctuation (commas, periods, exclamation marks and question marks) and convert all letters to lowercase.

In [None]:
# Strip the most common punctuation marks (, . ! ?) and lowercase the text.
# The pattern is a raw string so that the regex engine, not the Python parser,
# interprets it; inside a character class the dot needs no escaping.
df['body_processed'] = (
    df['body']
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)
# Preview the first few cleaned bodies
df['body_processed'].head()

To see which words appear most often in the dataset, we'll plot a word cloud of them.

In [None]:
# Merge every cleaned body into a single comma-separated string
long_string = ','.join(df['body_processed'].tolist())
# Configure the word cloud
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)
# Build the word cloud from the merged text
wordcloud.generate(long_string)
# Render it on a 15x10-inch figure
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud)
plt.show()

Now we'll tokenize the text, remove the stopwords, drop words shorter than 5 characters, and remove other words that may distort our analysis (which we spotted in the word cloud), such as 'https'.

In [None]:

# Fetch the NLTK stopword corpus
nltk.download('stopwords')

# English stopwords plus dataset-specific noise tokens spotted in the
# word cloud (URL fragments, thread boilerplate and 'nan')
en_stop = set(nltk.corpus.stopwords.words('english')) | {
    'nan', 'https', 'http', 'wwwredditcom', 'match_thread_',
    'comments', 'threads', 'wwwredit',
}

def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item is cast to ``str`` and tokenized with ``deacc=True``, which
    asks gensim to strip accent marks from the tokens. Yields one list of
    tokens per input sentence, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
def remove_stopwords(texts, stopwords=None):
    """Drop stopwords from every document in ``texts``.

    Parameters
    ----------
    texts : iterable
        Documents to filter. A document that is already a list/tuple of
        tokens (e.g. the output of ``sent_to_words``) is filtered as-is;
        anything else is cast to ``str`` and tokenized with
        ``simple_preprocess`` first.
    stopwords : collection of str, optional
        Words to discard. Defaults to the notebook-level ``en_stop`` set.

    Returns
    -------
    list of list of str
        One filtered token list per input document, in the same order.
    """
    if stopwords is None:
        stopwords = en_stop
    filtered = []
    for doc in texts:
        # Tokenized documents are used directly: calling
        # simple_preprocess(str(doc)) on a list would re-parse the list's
        # repr, which is slow and fragile.
        if isinstance(doc, (list, tuple)):
            tokens = doc
        else:
            tokens = simple_preprocess(str(doc))
        filtered.append([word for word in tokens if word not in stopwords])
    return filtered

def filter_size(texts, min_len=5):
    """Keep only the tokens that are at least ``min_len`` characters long.

    Parameters
    ----------
    texts : iterable
        Documents to filter. A document that is already a list/tuple of
        tokens is filtered as-is; anything else is cast to ``str`` and
        tokenized with ``simple_preprocess`` first.
    min_len : int, optional
        Minimum token length to keep. The default of 5 matches the previous
        hard-coded ``len(word) > 4`` rule.

    Returns
    -------
    list of list of str
        One filtered token list per input document, in the same order.
    """
    kept = []
    for doc in texts:
        # Tokenized documents are used directly rather than re-tokenizing
        # the repr of the list via simple_preprocess(str(doc)).
        if isinstance(doc, (list, tuple)):
            tokens = doc
        else:
            tokens = simple_preprocess(str(doc))
        kept.append([word for word in tokens if len(word) >= min_len])
    return kept

# Tokenize every cleaned body
data = df['body_processed'].tolist()
data_words = list(sent_to_words(data))
# Strip the stopwords first, then drop the short tokens
data_words = filter_size(remove_stopwords(data_words))
# Show up to 30 tokens of the first document as a sanity check
print(data_words[0][:30])

Next, we convert the tokenized documents into a dictionary and a bag-of-words corpus.

In [None]:
# Map every unique token to an integer id
id2word = corpora.Dictionary(data_words)
# The tokenized documents are the texts of the corpus
texts = data_words
# Bag-of-words encoding: one doc2bow result per document
corpus = list(map(id2word.doc2bow, texts))
# Peek at up to 30 entries of the first document
print(corpus[0][:30])

Now we will build a model with 10 topics, where each topic is a combination of keywords and each keyword contributes a certain weight to the topic.

In [None]:
# Number of topics to extract
num_topics = 10
# Fixed seed so that re-running the notebook yields comparable topics.
# NOTE(review): with several worker processes the result may still vary
# slightly between runs — confirm against the gensim documentation.
random_seed = 42
# Build the LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=random_seed)
# Print the top keywords of each topic
pprint(lda_model.print_topics())
# Apply the trained model to the corpus
doc_lda = lda_model[corpus]

Now that we have a trained model, let’s visualize the topics to make them easier to interpret.

In [None]:
# Switch pyLDAvis to notebook rendering before building the view
pyLDAvis.enable_notebook()
# Prepare the visualization data from the trained model
lda_display = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    id2word,
    sort_topics=False,
)
# Last expression of the cell: renders the interactive view
pyLDAvis.display(lda_display)

And that's it! Thank you for checking this notebook. It was largely inspired by these two tutorials:
1. https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
2. https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0