In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/liberals-vs-conservatives-on-reddit-13000-posts/file_name.csv


In [5]:
# Import necessary libraries
import pandas as pd
import gensim
import spacy
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.phrases import Phrases, Phraser

In [6]:
nlp = spacy.load("en_core_web_sm")

In [7]:
df=pd.read_csv("/kaggle/input/liberals-vs-conservatives-on-reddit-13000-posts/file_name.csv")

In [8]:
# Define preprocessing function
def preprocess_text(Title):
    
    tokens = [token for token in simple_preprocess(Title) if len(token) > 3]
    
    bigram = Phrases(tokens, min_count=5, threshold=100)
    bigram_mod = Phraser(bigram)
    bigrams = bigram_mod[tokens]
    
    doc = nlp(" ".join(bigrams))
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    
    return tokens    
    
    

In [9]:
# Preprocess the text data
df['tokens'] = df['Title'].apply(preprocess_text)
tokens = [token for doc in df['tokens'] for token in doc]

In [11]:
df = df[df['tokens'].apply(len)>0]

In [12]:
df['tokens']

0        [matter, look, like, language, speak, wear, re...
1                   [biden, speech, draw, million, viewer]
2                                           [state, union]
3                                    [poor, people, money]
5        [propose, bill, allow, kid, contagious, diseas...
                               ...                        
12849    [paul, spirit, defense, wikileak, free, inform...
12850    [anarcho, capitalism, opinion, doctrinal, syst...
12851    [mise, wiki, wiki, project, dedicate, advancem...
12852    [fireman, protection, monopoly, fail, capitalism]
12853                    [wikipedia, article, well, write]
Name: tokens, Length: 12758, dtype: object

In [13]:
df['Title']

0        No matter who someone is, how they look like, ...
1          Biden speech draws 38.2 million U.S. TV viewers
2                                       State of the union
3                    We Should Just Give Poor People Money
5        Proposed Bill Would Allow Kids With Contagious...
                               ...                        
12849    Ron Paul’s Spirited Defense of WikiLeaks & Fre...
12850    “Anarcho-capitalism, in my opinion, is a doctr...
12851    Mises Wiki is a wiki project dedicated to the ...
12852    Fireman Protection Monopoly - Is This Failed C...
12853        Can this Wikipedia Article be Better Written?
Name: Title, Length: 12758, dtype: object

In [15]:
# Create the dictionary and corpus
dictionary = Dictionary([tokens])
print(len(dictionary))

corpus = [dictionary.doc2bow(token) for token in df['tokens'] if len(token)>0]

11516


In [73]:
# Set LDA parameters
num_topics = 8
passes = 10
iterations = 100
random_state = 42

In [74]:
# Fit the LDA model to the corpus
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, 
                     passes=passes, iterations=iterations, random_state=random_state)

In [75]:
# Print the main topics being discussed
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.026*"trump" + 0.017*"desantis" + 0.013*"court" + 0.012*"time" + 0.010*"florida" + 0.009*"business" + 0.009*"truth" + 0.009*"biden" + 0.008*"deal" + 0.008*"anarchism"
Topic: 1 
Words: 0.026*"capitalism" + 0.021*"free" + 0.018*"market" + 0.016*"ancap" + 0.013*"work" + 0.011*"government" + 0.011*"people" + 0.009*"inflation" + 0.008*"speech" + 0.008*"reagan"
Topic: 2 
Words: 0.016*"libertarian" + 0.015*"question" + 0.014*"private" + 0.011*"post" + 0.011*"freedom" + 0.009*"system" + 0.009*"trudeau" + 0.009*"canada" + 0.008*"protest" + 0.007*"right"
Topic: 3 
Words: 0.021*"trump" + 0.018*"biden" + 0.009*"year" + 0.009*"covid" + 0.008*"say" + 0.008*"public" + 0.008*"bitcoin" + 0.008*"police" + 0.006*"woman" + 0.006*"life"
Topic: 4 
Words: 0.014*"capitalist" + 0.012*"right" + 0.012*"communist" + 0.011*"anti" + 0.008*"real" + 0.008*"trump" + 0.008*"society" + 0.007*"anarchist" + 0.007*"party" + 0.007*"create"
Topic: 5 
Words: 0.009*"money" + 0.008*"property" + 0.008*"worker" 

In [40]:
from gensim.models import CoherenceModel

# Calculate coherence score using c_v coherence measure
coherence_model = CoherenceModel(model=lda_model, texts=df['tokens'], corpus=corpus, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print("Coherence Score:", coherence_score)


Coherence Score: 0.3854159592071862


In [41]:
# Calculate perplexity
perplexity = lda_model.log_perplexity(corpus)
print("Perplexity:", perplexity)

Perplexity: -8.671087045612792


In [42]:
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors


# Get topic weights
topic_weights = []
for i, row_list in enumerate(lda_model[corpus]):
    topic_weights.append([w for i, w in row_list])
#print (topic_weights)

In [44]:
# Array of topic weights    
arr = pd.DataFrame(topic_weights).fillna(0).values

In [45]:
# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)


In [46]:
# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 12758 samples in 0.026s...
[t-SNE] Computed neighbors for 12758 samples in 1.164s...
[t-SNE] Computed conditional probabilities for sample 1000 / 12758
[t-SNE] Computed conditional probabilities for sample 2000 / 12758
[t-SNE] Computed conditional probabilities for sample 3000 / 12758
[t-SNE] Computed conditional probabilities for sample 4000 / 12758
[t-SNE] Computed conditional probabilities for sample 5000 / 12758
[t-SNE] Computed conditional probabilities for sample 6000 / 12758
[t-SNE] Computed conditional probabilities for sample 7000 / 12758
[t-SNE] Computed conditional probabilities for sample 8000 / 12758
[t-SNE] Computed conditional probabilities for sample 9000 / 12758
[t-SNE] Computed conditional probabilities for sample 10000 / 12758
[t-SNE] Computed conditional probabilities for sample 11000 / 12758
[t-SNE] Computed conditional probabilities for sample 12000 / 12758
[t-SNE] Computed conditional probabilities for sam



[t-SNE] KL divergence after 250 iterations with early exaggeration: 84.225845
[t-SNE] KL divergence after 1000 iterations: 1.237905


In [58]:
output_notebook()
n_topics = 8
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics), 
              plot_width=1000, plot_height=800)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)