In [16]:
#Author Tanmay

import pandas as pd
from gensim import corpora, models
import gensim
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import CoherenceModel
import string
from wordcloud import WordCloud

In [55]:
# 1. Data Loading
data = pd.read_csv('foodstamp_submissions_allyears.csv')

#2 Preprocessing

filtered_data = data[
    ~data['selftext'].str.contains('\[deleted\]', case=False, na=False) &
    ~data['selftext'].str.contains('\[removed\]', case=False, na=False)
]

# 3. Data Preprocessing
# Handle NaN values and ensure the data type is string
filtered_data['selftext'] = filtered_data['selftext'].fillna('')
#Removing any links
filtered_data['selftext'] = filtered_data['selftext'].str.replace(r'http\S+', '', regex=True)
# Remove punctuation, numbers, and special characters
filtered_data['clean_selftext'] = filtered_data['selftext'].map(lambda x: re.sub('[,\.!?]', '', x))
filtered_data['clean_selftext'] = filtered_data['clean_selftext'].map(lambda x: re.sub('\d+', '', x))
# Convert to lowercase
filtered_data['clean_selftext'] = filtered_data['clean_selftext'].map(lambda x: x.lower())


In [56]:
#Creating a list from the dataframe as BERT expects list of docs as inputs
docs = filtered_data['clean_selftext'].tolist()

# Load English stopwords
stop_words = set(stopwords.words('english'))

custom_stopwords = ["i'm","im", "i've", "im", "ive","hi","etc","would","want"
,"get","Äôt","ampxb","thanks","i’m","got","th","irt","san","ca","u","cal","calfresh"
,"los","angeles","told","said","doesnt","s","k","snap","still","lkjghfdsretrytukrltioyulk"
"rexdtrfuygihojpojihugygtfchjukijlop","gtgt","chillin"]

all_stopwords = stop_words.union(set(custom_stopwords))

# Function to remove stopwords from a document
def remove_stopwords(text):
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in all_stopwords]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

# Apply the function to your documents
docs_no_stopwords = [remove_stopwords(doc) for doc in docs]


In [64]:
from bertopic import BERTopic

# Create a BERTopic instance
#topic_model = BERTopic(min_topic_size=20,nr_topics="auto")

topic_model = BERTopic()
# Fit the model and transform your documents into topics
topics, probs = topic_model.fit_transform(docs_no_stopwords)


In [65]:
# Get an overview of the topics
topic_model.get_topic_info()

# Retrieve individual topics
#topic_model.get_topic(0)  # Replace 0 with the topic number you're interested in


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,206,-1_gtgt_looms_cliff_,"[gtgt, looms, cliff, , , , , , , ]","[, [cliff looms](, ]"
1,0,5586,0_food_benefits_month_income,"[food, benefits, month, income, card, know, st...",[female diagnosed generalized anxiety disorder...
2,1,59,1_california_diego_county_texas,"[california, diego, county, texas, state, utah...","[california diego county), state california, a..."
3,2,36,2____,"[, , , , , , , , , ]","[, , ]"
4,3,30,3_title_end_suppose_say,"[title, end, suppose, say, anyone, lkjghfdsret...","[question title, title says, title says]"
5,4,25,4_ckwe_recommendation_joined_too,"[ckwe, recommendation, joined, too, subreddit,...","[, , (i joined seeing recommendation old post ..."
6,5,21,5____,"[, , , , , , , , , ]","[, , ]"
7,6,18,6____,"[, , , , , , , , , ]","[, , ]"
8,7,15,7____,"[, , , , , , , , , ]","[, , ]"
9,8,15,8____,"[, , , , , , , , , ]","[, , ]"


In [66]:
topic_model.get_topic(3)

[('title', 0.22474088887009774),
 ('end', 0.10932443484529589),
 ('suppose', 0.0983784975823064),
 ('say', 0.09318598630321255),
 ('anyone', 0.09126005091979082),
 ('lkjghfdsretrytukrltioyulk', 0.0865092306448219),
 ('ghfkuihlkjkl', 0.0865092306448219),
 ('rexdtrfuygihojpojihugygtfchjukijlop', 0.0865092306448219),
 ('chillin', 0.0865092306448219),
 ('anybody', 0.08118035233392178)]

In [62]:
# Visualize topics
topic_model.visualize_topics()


In [63]:
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,hi im wondering can you be married and still g...,0,0_food_benefits_month_income,"[food, benefits, month, income, card, know, st...",[applied call next day hhs saying bills income...,food - benefits - month - income - card - know...,1.000000,False
1,not sure if this is only pa but i'm sure it co...,0,0_food_benefits_month_income,"[food, benefits, month, income, card, know, st...",[applied call next day hhs saying bills income...,food - benefits - month - income - card - know...,1.000000,False
2,today i was informed by my social worker that ...,0,0_food_benefits_month_income,"[food, benefits, month, income, card, know, st...",[applied call next day hhs saying bills income...,food - benefits - month - income - card - know...,1.000000,False
3,(i'm in alabama)\n\nso i filled out the online...,0,0_food_benefits_month_income,"[food, benefits, month, income, card, know, st...",[applied call next day hhs saying bills income...,food - benefits - month - income - card - know...,1.000000,False
4,hello\n\ni am in new york\n\ni applied on apri...,0,0_food_benefits_month_income,"[food, benefits, month, income, card, know, st...",[applied call next day hhs saying bills income...,food - benefits - month - income - card - know...,1.000000,False
...,...,...,...,...,...,...,...,...
6337,i recently got approved for early learning coa...,0,0_food_benefits_month_income,"[food, benefits, month, income, card, know, st...",[applied call next day hhs saying bills income...,food - benefits - month - income - card - know...,1.000000,False
6338,hello\n\ni really hate to bother asking here b...,0,0_food_benefits_month_income,"[food, benefits, month, income, card, know, st...",[applied call next day hhs saying bills income...,food - benefits - month - income - card - know...,1.000000,False
6339,a friend of mine receives ssi payments for her...,0,0_food_benefits_month_income,"[food, benefits, month, income, card, know, st...",[applied call next day hhs saying bills income...,food - benefits - month - income - card - know...,1.000000,False
6340,i looked outside my door and there was a card ...,0,0_food_benefits_month_income,"[food, benefits, month, income, card, know, st...",[applied call next day hhs saying bills income...,food - benefits - month - income - card - know...,0.986374,False
