In [13]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tanmaybhardwaj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tanmaybhardwaj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tanmaybhardwaj/nltk_data...


True

In [1]:
import pandas as pd
from gensim import corpora, models
import gensim
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import CoherenceModel
import string

In [19]:
# 1. Data Loading
data = pd.read_csv('foodstamp_submissions_allyears.csv')

In [26]:
# 2. Data Filtering
# Filter for entries where 'California' is mentioned in the 'url', 'selftext', 'permalink', or 'title' columns
filtered_data = data[data['url'].str.contains('California', case=False, na=False) |
                     data['selftext'].str.contains('California', case=False, na=False) |
                     data['permalink'].str.contains('California', case=False, na=False) |
                     data['title'].str.contains('California', case=False, na=False)]

# Remove rows where 'selftext' is '[deleted]' or '[removed]'
filtered_data = filtered_data[~filtered_data['selftext'].str.contains('\[deleted\]', case=False, na=False)]
filtered_data = filtered_data[~filtered_data['selftext'].str.contains('\[removed\]', case=False, na=False)]

In [27]:
# 3. Data Preprocessing
# Handle NaN values and ensure the data type is string
filtered_data['selftext'] = filtered_data['selftext'].fillna('')
# Remove punctuation, numbers, and special characters
filtered_data['clean_selftext'] = filtered_data['selftext'].map(lambda x: re.sub('[,\.!?]', '', x))
filtered_data['clean_selftext'] = filtered_data['clean_selftext'].map(lambda x: re.sub('\d+', '', x))
# Convert to lowercase
filtered_data['clean_selftext'] = filtered_data['clean_selftext'].map(lambda x: x.lower())


In [35]:
# Remove stop words and perform lemmatization

# Add custom stopwords
stop_words = set(stopwords.words('english'))
custom_stopwords = ["i'm","im", "i've", "im", "ive","hi","etc","would","want","get","Äôt","ampxb","thanks","i’m","got","th","irt","san"]
stop_words = stop_words.union(custom_stopwords)

exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop_words])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

#stop = set(stopwords.words('english'))
#exclude = set(string.punctuation) 
#lemma = WordNetLemmatizer()
# def clean(doc):
#     stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
#     punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
#     normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
#     return normalized

clean_text = [clean(doc).split() for doc in filtered_data['clean_selftext']]

# Creating the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary(clean_text)

# Converting list of documents (corpus) into Document Term Matrix using the dictionary prepared above
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_text]

In [13]:
#Deciding on what value of num_topic to choose for the most optimal results
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Build LDA model
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=15)
        model_list.append(model)
        
        # Compute Coherence Score using c_v
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=doc_term_matrix, texts=clean_text, start=2, limit=40, step=2)

# Show graph
import matplotlib.pyplot as plt

limit=40; start=2; step=2;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))


Traceback (most recent call last):
  File "<string>", line 1, in <module>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/tanmaybhardwaj/anaconda3/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
  File "/Users/tanmaybhardwaj/anaconda3/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
Traceback (most recent call last):
    exitcode = _main(fd, parent_sentinel)  File "<string>", line 1, in <module>

  File "/Users/tanmaybhardwaj/anaconda3/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    exitcode = _main(fd, parent_sentinel)
  File "/Users/tanmaybhardwaj/anaconda3/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
        self = reduction.pickle.load(from_parent)self = reduction.pickle.load(from_parent)

  File "/Users/tanmaybhardwaj/anaconda3/lib/python3.10/site-packages/gensim/__init__.py", line 11, in <module>
  File "/Users/tanmaybhardwaj/anaconda3/lib/python3.10/site-packages/gensim/_

KeyboardInterrupt: 

In [36]:
# 4. LDA Modeling
# Creating the object for LDA model using gensim library & Training LDA model on the document term matrix
ldamodel = gensim.models.ldamodel.LdaModel(doc_term_matrix, num_topics=8, id2word = dictionary, passes=50)


In [37]:
# 5. Result Interpretation
# Print the topics and the weights of words
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.020*"california" + 0.014*"month" + 0.009*"getting" + 0.008*"use" + 0.008*"ebt" + 0.006*"report" + 0.006*"last" + 0.006*"payment" + 0.006*"fresh" + 0.006*"benefit"')
(1, '0.015*"month" + 0.015*"california" + 0.013*"ebt" + 0.013*"food" + 0.010*"know" + 0.008*"balance" + 0.007*"item" + 0.007*"snap" + 0.007*"pay" + 0.006*"county"')
(2, '0.014*"california" + 0.012*"money" + 0.011*"month" + 0.011*"time" + 0.008*"state" + 0.008*"county" + 0.008*"didn’t" + 0.007*"working" + 0.007*"ebt" + 0.006*"apply"')
(3, '0.015*"calfresh" + 0.012*"california" + 0.011*"income" + 0.011*"benefit" + 0.010*"amount" + 0.010*"county" + 0.009*"food" + 0.009*"time" + 0.007*"report" + 0.007*"month"')
(4, '0.020*"card" + 0.017*"california" + 0.010*"benefit" + 0.010*"work" + 0.009*"time" + 0.009*"ebt" + 0.008*"received" + 0.007*"know" + 0.007*"need" + 0.007*"say"')
(5, '0.012*"calfresh" + 0.011*"california" + 0.011*"county" + 0.009*"food" + 0.008*"student" + 0.007*"rent" + 0.007*"meal" + 0.006*"go" + 0.006*"need

In [38]:
topics = ldamodel.show_topics(num_topics=-1, num_words=10, formatted=False)

for topic_num, topic_words in topics:
    print(f"Topic {topic_num}:")
    words, weights = zip(*topic_words)  # Separates the words and their corresponding weights
    topic_str = ", ".join([f"{word} ({round(weight, 3)})" for word, weight in zip(words, weights)])
    print(f"Words: {topic_str}\n")

Topic 0:
Words: california (0.019999999552965164), month (0.014000000432133675), getting (0.008999999612569809), use (0.00800000037997961), ebt (0.00800000037997961), report (0.006000000052154064), last (0.006000000052154064), payment (0.006000000052154064), fresh (0.006000000052154064), benefit (0.006000000052154064)

Topic 1:
Words: month (0.014999999664723873), california (0.014999999664723873), ebt (0.013000000268220901), food (0.013000000268220901), know (0.009999999776482582), balance (0.00800000037997961), item (0.007000000216066837), snap (0.007000000216066837), pay (0.007000000216066837), county (0.006000000052154064)

Topic 2:
Words: california (0.014000000432133675), money (0.012000000104308128), month (0.010999999940395355), time (0.010999999940395355), state (0.00800000037997961), county (0.00800000037997961), didn’t (0.00800000037997961), working (0.007000000216066837), ebt (0.007000000216066837), apply (0.006000000052154064)

Topic 3:
Words: calfresh (0.01499999966472387