In [None]:
from google_play_scraper import reviews, Sort
import pandas as pd
import random
import time

# Apps to scrape: display name -> Google Play package id.
# Only Halodoc and Alodokter are included.
apps = {
    "Halodoc": "com.linkdokter.halodoc.android",
    "Alodokter": "com.alodokter.android",
}

# Upper bound on reviews kept per star rating for each app
# (5 ratings x 800 = at most 4,000 reviews per app).
target_per_rating = 800

def scrape_app_balanced(app_name, app_id):
    """Collect up to ``target_per_rating`` reviews per star rating for one app.

    Requests reviews for ``app_id`` page by page (200 per request,
    ``lang='id'``, ``country='id'``, ``Sort.NEWEST``) and files each review
    into a bucket keyed by its star rating. Paging stops when every bucket
    holds ``target_per_rating`` reviews, when a page comes back empty, or
    when no continuation token is returned.

    Args:
        app_name: Label stored in each record's "application" field and used
            in the progress output.
        app_id: Package id passed to ``reviews``.

    Returns:
        A flat list of dicts with keys "application", "review" and "rating",
        ordered by rating bucket (1 through 5). A bucket may hold fewer than
        ``target_per_rating`` entries if paging ends before it fills up.
    """
    print(f"\n🔍 Scraping {app_name}...")
    continuation_token = None
    app_reviews = {star: [] for star in range(1, 6)}

    while any(len(bucket) < target_per_rating for bucket in app_reviews.values()):
        result, continuation_token = reviews(
            app_id,
            lang='id',
            country='id',
            sort=Sort.NEWEST,
            count=200,
            continuation_token=continuation_token
        )

        if not result:
            print(f"⚠️ No more reviews for {app_name}.")
            break

        for r in result:
            rating = r["score"]
            bucket = app_reviews.get(rating)
            # Skip reviews whose score is missing or outside 1-5 instead of
            # failing with KeyError, and ignore ratings that are already full.
            if bucket is None or len(bucket) >= target_per_rating:
                continue
            bucket.append({
                "application": app_name,
                "review": r["content"],
                "rating": rating
            })

        print(f"{app_name} rating counts: {[len(app_reviews[r]) for r in range(1, 6)]}")
        if not continuation_token:
            break
        # Randomized pause between requests (presumably to stay polite to the
        # remote service).
        time.sleep(random.uniform(1.0, 2.0))

    # Flatten the buckets in rating order; avoids the quadratic copying of
    # sum(list_of_lists, []).
    combined_reviews = [
        record for bucket in app_reviews.values() for record in bucket
    ]
    print(f"✅ {app_name} total reviews collected: {len(combined_reviews)}")
    return combined_reviews

# Collect reviews for each app separately so each app fills its own
# per-rating buckets.
all_reviews = []
for app_name, app_id in apps.items():
    reviews_data = scrape_app_balanced(app_name, app_id)
    all_reviews.extend(reviews_data)

# Shuffle and save. The columns are fixed explicitly so the CSV still gets a
# header row when nothing was scraped.
df = pd.DataFrame(all_reviews, columns=["application", "review", "rating"])
df = df.sample(frac=1).reset_index(drop=True)
df.to_csv("alodokter_halodoc_balanced_8000.csv", index=False)

# Report what was actually collected: rating buckets can end up below the
# target, so fixed per-app / per-rating totals would be misleading.
print(f"\n🎉 Final dataset saved: {len(df)} reviews")
print(df.groupby(["application", "rating"]).size().to_string())


🔍 Scraping Halodoc...
Halodoc rating counts: [16, 1, 2, 14, 167]
Halodoc rating counts: [35, 1, 5, 18, 341]
Halodoc rating counts: [58, 4, 10, 30, 498]
Halodoc rating counts: [72, 8, 12, 36, 672]
Halodoc rating counts: [90, 10, 14, 44, 800]
Halodoc rating counts: [111, 14, 18, 54, 800]
Halodoc rating counts: [123, 15, 23, 63, 800]
Halodoc rating counts: [138, 19, 30, 71, 800]
Halodoc rating counts: [154, 22, 35, 84, 800]
Halodoc rating counts: [162, 27, 39, 96, 800]
Halodoc rating counts: [176, 33, 39, 112, 800]
Halodoc rating counts: [189, 34, 42, 122, 800]
Halodoc rating counts: [206, 36, 47, 132, 800]
Halodoc rating counts: [216, 39, 48, 147, 800]
Halodoc rating counts: [229, 41, 53, 156, 800]
Halodoc rating counts: [243, 45, 54, 169, 800]
Halodoc rating counts: [257, 48, 57, 181, 800]
Halodoc rating counts: [272, 49, 59, 190, 800]
Halodoc rating counts: [281, 53, 61, 201, 800]
Halodoc rating counts: [293, 57, 62, 215, 800]
Halodoc rating counts: [304, 62, 65, 224, 800]
Halodoc rat

In [None]:
pip install google-play-scraper

Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7
