In [13]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tanmaybhardwaj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tanmaybhardwaj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tanmaybhardwaj/nltk_data...


True

In [14]:
import pandas as pd
from gensim import corpora, models
import gensim
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [34]:
# 1. Data Loading
data = pd.read_csv('foodstamp_submissions_allyears.csv')

In [35]:
# 2. Data Filtering
# Filter for entries where 'California' is mentioned in the 'url', 'selftext', 'permalink', or 'title' columns
filtered_data = data[data['url'].str.contains('California', case=False, na=False) |
                     data['selftext'].str.contains('California', case=False, na=False) |
                     data['permalink'].str.contains('California', case=False, na=False) |
                     data['title'].str.contains('California', case=False, na=False)]

# Remove rows where 'selftext' is '[deleted]' or '[removed]'
filtered_data = filtered_data[~filtered_data['selftext'].str.contains('\[deleted\]', case=False, na=False)]
filtered_data = filtered_data[~filtered_data['selftext'].str.contains('\[removed\]', case=False, na=False)]

In [36]:
# 3. Data Preprocessing
# Handle NaN values and ensure the data type is string
filtered_data['selftext'] = filtered_data['selftext'].fillna('')
# Remove punctuation, numbers, and special characters
filtered_data['clean_selftext'] = filtered_data['selftext'].map(lambda x: re.sub('[,\.!?]', '', x))
filtered_data['clean_selftext'] = filtered_data['clean_selftext'].map(lambda x: re.sub('\d+', '', x))
# Convert to lowercase
filtered_data['clean_selftext'] = filtered_data['clean_selftext'].map(lambda x: x.lower())


In [37]:
# Remove stop words and perform lemmatization

# Add custom stopwords
stop_words = set(stopwords.words('english'))
custom_stopwords = ["i'm","im", "i've", "im", "ive","hi","etc","would","want","get","Äôt"]
stop_words = stop_words.union(custom_stopwords)

exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop_words])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

#stop = set(stopwords.words('english'))
#exclude = set(string.punctuation) 
#lemma = WordNetLemmatizer()
# def clean(doc):
#     stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
#     punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
#     normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
#     return normalized

clean_text = [clean(doc).split() for doc in filtered_data['clean_selftext']]

# Creating the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary(clean_text)

# Converting list of documents (corpus) into Document Term Matrix using the dictionary prepared above
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_text]

In [None]:
#Deciding on what value of num_topic to choose for the most optimal results

In [38]:
# 4. LDA Modeling
# Creating the object for LDA model using gensim library & Training LDA model on the document term matrix
ldamodel = gensim.models.ldamodel.LdaModel(doc_term_matrix, num_topics=8, id2word = dictionary, passes=50)


In [39]:
# 5. Result Interpretation
# Print the topics and the weights of words
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.015*"food" + 0.010*"stamp" + 0.009*"household" + 0.007*"gt" + 0.007*"resource" + 0.007*"california" + 0.006*"limit" + 0.005*"account" + 0.005*"class" + 0.005*"income"')
(1, '0.014*"california" + 0.008*"food" + 0.007*"snap" + 0.007*"income" + 0.007*"county" + 0.007*"month" + 0.006*"calfresh" + 0.006*"time" + 0.006*"need" + 0.005*"like"')
(2, '0.023*"california" + 0.016*"benefit" + 0.012*"food" + 0.012*"income" + 0.011*"calfresh" + 0.011*"i’m" + 0.010*"county" + 0.009*"pay" + 0.008*"live" + 0.008*"stamp"')
(3, '0.023*"card" + 0.016*"california" + 0.013*"ebt" + 0.013*"month" + 0.012*"know" + 0.010*"benefit" + 0.009*"food" + 0.009*"back" + 0.008*"case" + 0.008*"got"')
(4, '0.021*"benefit" + 0.018*"month" + 0.014*"income" + 0.012*"received" + 0.011*"calfresh" + 0.009*"got" + 0.009*"call" + 0.009*"report" + 0.008*"california" + 0.007*"th"')
(5, '0.010*"ebt" + 0.008*"use" + 0.007*"california" + 0.007*"day" + 0.006*"need" + 0.006*"month" + 0.006*"say" + 0.006*"benefit" + 0.006*"one" + 0