In [1]:
import json
import re
from collections import Counter

# Read JSON
with open("haunted_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract all description fields and merge them into one large text
text = " ".join(item.get("description", "") for item in data)

# Clean up the text: remove punctuation, lower caseization
text = re.sub(r"[^a-zA-Z\s]", "", text).lower()

# Split into words
words = text.split()

# Set up stop words
stopwords = set("""
a about above after again against all also am an and any are arent as at be because been before being below between
both but by can cant cannot could couldnt did didnt do does doesnt doing dont down during each even few for from further get
had hadnt has hasnt have havent having he hed hell hes her here heres hers herself him himself his how hows i id
ill im ive if in into is isnt it its itself lets like me more most mustnt my myself no nor not now of off on once one only or
other ought our ours ourselves out over own same shant she shed shell shes should shouldnt so some still such than that
thats the their theirs them themselves then there theres these they theyd theyll theyre theyve this those through
to too under until up very was wasnt we wed well were weve what whats when whens where wheres which while who whos
whom why whys will with wont would wouldnt you youd youll youre youve your yours yourself yourselves
""".split())

# Filtering: remove stop words + too short words
filtered_words = [word for word in words if word not in stopwords and len(word) > 2]

# Count word frequency
counter = Counter(filtered_words)

# Top 100 high-frequency words
top_words = counter.most_common(100)

# Convert to D3 word cloud format
wordcloud_data = [{"text": word, "size": freq * 2} for word, freq in top_words]

# Write JSON file
with open("wordcloud_data.json", "w", encoding="utf-8") as f:
    json.dump(wordcloud_data, f, indent=2)

print("Clean word cloud data has been generated：wordcloud.json")


Clean word cloud data has been generated：wordcloud.json
