In [5]:
import json
from collections import defaultdict

with open('aus_cost_of_living_random_keywords.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

city_sentiment = defaultdict(lambda: {'positive': 0, 'neutral': 0, 'negative': 0})


cost_keywords = {'cost', 'rent', 'price', 'grocery', 'transport', 'bill', 'living', 'wage', 'income', 'expensive', 'afford'}


cost_posts = []

for post in data:
    tags = post.get('data', {}).get('tags', []) + post.get('keywords', [])
    cities = [tag.lower() for tag in tags if tag.lower() in {"melbourne", "sydney", "brisbane", "adelaide", "perth", "hobart", "darwin", "canberra"}]

    sentiment = post.get('sentimentLabel', 'neutral').lower()
    for city in cities:
        city_sentiment[city][sentiment] += 1

    content = post.get('data', {}).get('content', '').lower()
    if any(keyword in content for keyword in cost_keywords):
        cost_posts.append(post)

city_ranking = []
for city, stats in city_sentiment.items():
    total = sum(stats.values())
    positive_ratio = stats['positive'] / total if total > 0 else 0
    city_ranking.append((city, positive_ratio))

city_ranking = sorted(city_ranking, key=lambda x: -x[1])

selected_posts = sorted(
    [p for p in cost_posts if 'melbourne' in p.get('keywords', []) or 'brisbane' in p.get('keywords', [])],
    key=lambda x: (-abs(x.get('sentiment', 0)), 
    (x.get('favouritesCount', 0) + x.get('reblogsCount', 0))))[:5]

print("sentiment rank：")
for idx, (city, score) in enumerate(city_ranking, 1):
    print(f"{idx}. {city.capitalize()}: {score:.1%} positive")

print("\ntop5 posts：")
for idx, post in enumerate(selected_posts, 1):
    print(f"\n{idx}. [city: {post['keywords'][0].capitalize()}]")
    print(f"   sentiment label: {post['sentimentLabel']} ({post['sentiment']})")
    print(f"   content: {post['data']['content'][:150]}...")
    print(f"   url: {post['data']['url']}")

sentiment rank：
1. Melbourne: 29.7% positive
2. Perth: 27.9% positive
3. Brisbane: 27.9% positive
4. Canberra: 24.6% positive
5. Darwin: 21.5% positive
6. Adelaide: 20.3% positive
7. Sydney: 15.8% positive
8. Hobart: 11.0% positive

top5 posts：

1. [city: Melbourne]
   sentiment label: positive (0.836)
   content: <p>Something a lot of <a href="https://aus.social/tags/melbourne" class="mention hashtag" rel="nofollow noopener" target="_blank">#<span>melbourne</sp...
   url: https://aus.social/@joannaholman/114386272508875163

2. [city: Queensland]
   sentiment label: positive (0.712)
   content: <p>Day 22 cont 🚜🌾 🤼 🏡🏘️🏠🏠🏠</p><p>“Traditionally, <a href="https://ioc.exchange/tags/Queensland" class="mention hashtag" rel="nofollow noopener" target...
   url: https://ioc.exchange/@peterrenshaw/114368240128545286

3. [city: Neighbourhoods]
   sentiment label: positive (0.619)
   content: <p>In case you're confused about why there's such strident opposition against the Allan government's plan 

In [None]:
import json
from collections import defaultdict
from datetime import datetime

with open('aus_cost_of_living_random_keywords.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

city_list = {"melbourne", "sydney", "brisbane", "adelaide", "perth", "hobart", "darwin", "canberra"}

# city_sentiment_by_year[city][year]['positive'/'neutral'/'negative'] = count
city_sentiment_by_year = defaultdict(lambda: defaultdict(lambda: {'positive': 0, 'neutral': 0, 'negative': 0}))
cost_keywords = {
    'cost', 'costs', 'price', 'prices', 'expensive', 'afford', 'affordable', 'cheap', 'cheaper',
    'living', 'living cost', 'living costs', 'cost of living', 'standard of living',
    'rent', 'rents', 'rental', 'renting', 'lease', 'leased',
    'grocery', 'groceries', 'supermarket', 'supermarkets', 'shopping', 'food cost', 'food prices',
    'transport', 'transportation', 'public transport', 'train fare', 'bus fare', 'uber', 'taxi', 'commute', 'commuting',
    'bill', 'bills', 'electricity bill', 'water bill', 'internet bill', 'gas bill', 'utilities', 'utility bill', 'phone bill',
    'income', 'wage', 'wages', 'salary', 'salaries', 'pay', 'payment',
    'job', 'jobs', 'unemployment', 'employment', 'underemployment',
    'mortgage', 'home loan', 'interest rate', 'housing', 'house price', 'real estate', 'property', 'housing market',
    'inflation', 'recession', 'economy', 'economic', 'financial', 'finance', 'crisis',
    'insurance', 'health insurance', 'car insurance', 'life insurance',
    'childcare', 'daycare', 'school fees', 'education cost', 'university fees', 'student debt',
    'healthcare', 'medical cost', 'hospital bill', 'doctor fee',
    'minimum wage', 'living wage', 'pay rise', 'salary increase', 'salary cut',
    'saving', 'savings', 'saving money', 'expense', 'expenses', 'spending',
    'cost crisis', 'housing crisis', 'cost pressure', 'affordability crisis'
}

city_cost_posts_by_year = defaultdict(lambda: defaultdict(list))

for post in data:
    tags = post.get('data', {}).get('tags', []) + post.get('keywords', [])
    tags = [tag.lower() for tag in tags]

    cities = [tag for tag in tags if tag in city_list]
    sentiment = post.get('sentimentLabel', 'neutral').lower()

    created_at = post.get('data', {}).get('createdAt')
    if created_at:
        try:
            created_time = datetime.fromisoformat(created_at.replace('Z', '+00:00'))  # Handle ISO format
            year = created_time.year
        except Exception as e:
            continue
    else:
        continue

    content = post.get('data', {}).get('content', '').lower()

    if any(keyword in content for keyword in cost_keywords):
        for city in cities:
            city_cost_posts_by_year[city][year].append(post)

    for city in cities:
        city_sentiment_by_year[city][year][sentiment] += 1

for city in sorted(city_list):
    print(f"\nCity: {city.capitalize()}")
    yearly_stats = city_sentiment_by_year[city]
    for year in sorted(yearly_stats.keys()):
        stats = yearly_stats[year]
        total = sum(stats.values())
        if total > 0:
            positive_ratio = stats['positive'] / total
            neutral_ratio = stats['neutral'] / total
            negative_ratio = stats['negative'] / total
            print(f"  Year {year}: Positive {positive_ratio:.1%} | Neutral {neutral_ratio:.1%} | Negative {negative_ratio:.1%}")
        else:
            print(f"  Year {year}: No data.")

        posts = city_cost_posts_by_year[city][year]
        if posts:
            top_posts = sorted(
                posts,
                key=lambda x: (-abs(x.get('sentiment', 0)),
                               -(x.get('favouritesCount', 0) + x.get('reblogsCount', 0))))[:5]
            print(f"    Top {len(top_posts)} representative posts:")
            for idx, post in enumerate(top_posts, 1):
                content = post.get('data', {}).get('content', '').strip()
                tags = post.get('data', {}).get('tags', []) + post.get('keywords', [])
                tags = [tag.lower() for tag in tags if tag]
                print(f"      {idx}. Content: {content[:120]}...")
                print(f"         Keywords: {', '.join(tags)}")
        else:
            print(f"    No related posts.")



City: Adelaide

City: Brisbane

City: Canberra

City: Darwin

City: Hobart

City: Melbourne

City: Perth

City: Sydney
