In [2]:
from google_play_scraper import Sort, reviews
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from google.colab import drive

# Download Lexicon VADER
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [6]:
# Inisialisasi VADER Sentiment Analyzer
sid = SentimentIntensityAnalyzer()

# Kamus kata negatif tambahan (Bahasa Indonesia)
negative_words = [
    "gak bisa", "tidak bisa", "parah", "jelek", "lemot", "lambat",
    "buruk", "error", "failed", "sampah", "payah", "kecewa", "tidak puas",
    "sulit", "nyusahin", "kurang bagus", "tidak nyaman", "force close",
    "bug", "tidak jalan", "tidak berfungsi", "tidak responsif", "crash"
]

# Kamus kata positif tambahan (Bahasa Indonesia)
positive_words = [
    "bagus", "mantap", "cepat", "puas", "baik", "mudah", "sukses",
    "terbaik", "membantu", "nyaman", "lancar"
]

# Kata saran atau keluhan
suggestion_words = ["tolong", "mohon", "harap", "sebaiknya", "perbaiki", "harusnya"]

def scrape_reviews(app_id, max_reviews=20000):
    """Mengambil review dari Google Play Store"""
    raw_reviews = []
    continuation_token = None

    while len(raw_reviews) < max_reviews:
        reviews_batch, continuation_token = reviews(
            app_id,
            lang='id',
            country='id',
            sort=Sort.NEWEST,
            count=1000,
            continuation_token=continuation_token,
        )

        raw_reviews.extend(reviews_batch)

        if continuation_token is None:
            break

    return raw_reviews

def lexicon_based_label(text):
    sentiment_scores = sid.polarity_scores(text)
    text_lower = text.lower()

    contains_negative = any(word in text_lower for word in negative_words)
    contains_positive = any(word in text_lower for word in positive_words)
    contains_suggestion = any(word in text_lower for word in suggestion_words)

    if contains_negative:
        return "negative"
    if contains_suggestion and contains_negative:
        return "negative"
    if contains_positive and contains_negative:
        return "negative"
    if contains_positive:
        return "positive"
    if -0.05 <= sentiment_scores['compound'] <= 0.05:
        return "neutral"
    return "neutral"

if __name__ == "__main__":
    app_id = 'com.bca'
    max_reviews = 20000

    # Scraping data mentah
    raw_reviews = scrape_reviews(app_id, max_reviews)

    # Konversi ke DataFrame
    df = pd.DataFrame(raw_reviews)

    # Terapkan fungsi labeling
    df['label'] = df['content'].apply(lexicon_based_label)

    # Pisahkan review berdasarkan label
    positive_reviews = df[df['label'] == "positive"]
    negative_reviews = df[df['label'] == "negative"]
    neutral_reviews = df[df['label'] == "neutral"]

    # Tentukan jumlah sampel netral dengan median jumlah kelas lainnya
    min_count = min(len(positive_reviews), len(negative_reviews))
    neutral_sample_count = int(np.median([len(positive_reviews), len(negative_reviews)]))

    # Sampling agar jumlah lebih seimbang
    balanced_positive = positive_reviews.sample(n=min_count, random_state=42)
    balanced_negative = negative_reviews.sample(n=min_count, random_state=42)
    balanced_neutral = neutral_reviews.sample(n=neutral_sample_count, random_state=42)

    # Gabungkan dataset
    balanced_df = pd.concat([balanced_positive, balanced_negative, balanced_neutral]).sample(frac=1, random_state=42)

    # Simpan ke CSV
    balanced_df.to_csv('bca_reviews_balanced.csv', index=False, encoding='utf-8')

    # Tampilkan distribusi label setelah balancing
    print("Distribusi label setelah balancing:")
    print(balanced_df['label'].value_counts())

Distribusi label setelah balancing:
label
neutral     5375
positive    4726
negative    4726
Name: count, dtype: int64
