In [None]:
pip install google_play_scraper pandas matplotlib seaborn scikit-learn spacy torch transformers

In [None]:
# src/scrape.py
import pandas as pd
import pathlib
import time
import random
import logging
from google_play_scraper import app, reviews, Sort

# Logging setup: every record is written to scrape.log and also emitted
# through a default StreamHandler.
_log_handlers = [
    logging.FileHandler('scrape.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=_log_handlers,
)
logger = logging.getLogger(__name__)

# Bank display name -> package id of the bank's mobile app, as passed to
# google_play_scraper. (Package names were corrected in an earlier revision.)
APPS = {
    "Commercial Bank": "com.combanketh.mobilebanking",
    "Bank of Abyssinia": "com.boa.boaMobileBanking",
    "Dashen Bank": "com.dashen.dashensuperapp",
}

# Scraper tuning knobs.
# Default cap on reviews collected per app (kept low for testing).
REVIEW_LIMIT = 400
# Number of reviews requested from the store per call.
BATCH_SIZE = 100
# Lower bound, in seconds, of the random pause before each request; the
# upper bound used by fetch_reviews is twice this value.
MIN_DELAY = 2.0
# Number of failed or empty attempts tolerated before giving up on an app.
MAX_RETRIES = 3

def verify_app(pkg):
    """Check that the store listing for ``pkg`` can be loaded.

    Returns:
        ``(True, title)`` when the listing is fetched and has a title,
        otherwise ``(False, None)`` after logging the failure.
    """
    try:
        # The title lookup stays inside the try so that a listing without a
        # "title" key is reported as a verification failure too.
        title = app(pkg)["title"]
    except Exception as exc:
        logger.error(f"Failed to verify {pkg}: {str(exc)}")
        return False, None
    return True, title

def fetch_reviews(pkg, limit=REVIEW_LIMIT):
    """Fetch up to ``limit`` reviews for one app, paging through the results.

    Each request is preceded by a random pause of ``MIN_DELAY`` to
    ``2 * MIN_DELAY`` seconds. A failed request or an empty batch counts as
    one attempt; the loop gives up after ``MAX_RETRIES`` attempts.

    Args:
        pkg: Package id of the app whose reviews are fetched.
        limit: Maximum number of reviews to return.

    Returns:
        A list of at most ``limit`` review dicts as returned by
        ``google_play_scraper.reviews``; empty when nothing could be fetched.
    """
    all_reviews = []
    token = None
    attempts = 0

    while len(all_reviews) < limit and attempts < MAX_RETRIES:
        try:
            # Random delay to avoid detection
            delay = random.uniform(MIN_DELAY, MIN_DELAY * 2)
            time.sleep(delay)

            # Try a different sort order once earlier attempts have failed.
            # NOTE: this only takes effect on a fresh request (token is None);
            # when resuming, the library reuses the settings stored in the
            # continuation token.
            sort_method = Sort.NEWEST if attempts < 2 else Sort.MOST_RELEVANT

            batch, token = reviews(
                pkg,
                lang='en',
                country='et',  # Ethiopia
                sort=sort_method,
                count=BATCH_SIZE,
                continuation_token=token
            )

            # reviews() hands back a continuation-token *object*, which is
            # always truthy; the end of pagination is signalled by its inner
            # ``token`` attribute being None, so check that explicitly.
            exhausted = not token or getattr(token, "token", token) is None

            if not batch:
                logger.warning(f"No reviews in batch for {pkg}")
                attempts += 1
                if exhausted:
                    if all_reviews:
                        # Pagination is finished; nothing more to retry.
                        break
                    # Passing an exhausted token back returns an empty batch
                    # without issuing a request, so restart from scratch to
                    # make the retry a real one.
                    token = None
                continue

            all_reviews.extend(batch)
            logger.info(f"{pkg}: Added {len(batch)} reviews (Total: {len(all_reviews)})")

            if exhausted:
                break

        except Exception as e:
            logger.warning(f"Attempt {attempts + 1} failed: {str(e)}")
            attempts += 1
            # Linear back-off (5s, 10s, ...); skipped once no retry will follow.
            if attempts < MAX_RETRIES:
                time.sleep(5 * attempts)

    return all_reviews[:limit]

def main():
    """Scrape every app in ``APPS`` and save the combined reviews as one CSV.

    Apps that fail verification, yield no reviews, or raise while being
    processed are logged and skipped. The CSV is written to
    ``data/raw/ethiopian_bank_reviews.csv`` only when at least one review
    was collected.
    """
    raw_dir = pathlib.Path("data/raw")
    raw_dir.mkdir(parents=True, exist_ok=True)

    collected = []

    for bank_name, pkg in APPS.items():
        logger.info(f"\n{'='*40}\nProcessing {bank_name} ({pkg})\n{'='*40}")

        # Skip apps whose store listing cannot be loaded.
        exists, title = verify_app(pkg)
        if not exists:
            logger.error(f"Skipping {bank_name} - verification failed")
            continue

        logger.info(f"Verified: {title}")

        try:
            bank_reviews = fetch_reviews(pkg)
            if not bank_reviews:
                logger.warning(f"No reviews obtained for {bank_name}")
                continue

            # Build the rows in a local list first so that a failure part-way
            # through leaves the combined result untouched for this bank.
            rows = []
            for review in bank_reviews:
                posted_at = review.get("at")
                rows.append({
                    "bank": bank_name,
                    "title": title,
                    "review_id": review.get("reviewId"),
                    "content": review.get("content"),
                    "rating": review.get("score"),
                    "date": posted_at.strftime("%Y-%m-%d") if posted_at else None,
                    "thumbs_up": review.get("thumbsUpCount", 0),
                    "version": review.get("reviewCreatedVersion"),
                })

            collected.extend(rows)
            logger.info(f"Successfully processed {len(rows)} reviews for {bank_name}")

        except Exception as exc:
            logger.error(f"Error processing {bank_name}: {str(exc)}")

    # Nothing to save: report it and stop.
    if not collected:
        logger.error("No reviews were collected")
        return

    frame = pd.DataFrame(collected)
    csv_path = raw_dir / "ethiopian_bank_reviews.csv"
    frame.to_csv(csv_path, index=False)
    logger.info(f"Saved {len(frame)} reviews to {csv_path}")

# Script entry point: run the scraper and log the wall-clock duration.
if __name__ == "__main__":
    logger.info("Starting Ethiopian Bank Reviews Scraper")
    started_at = time.time()
    main()
    elapsed = time.time() - started_at
    logger.info(f"Completed in {elapsed:.2f} seconds")

2025-06-08 21:43:07,655 - INFO - Starting Ethiopian Bank Reviews Scraper
2025-06-08 21:43:07,656 - INFO - 
Processing Commercial Bank (com.combanketh.mobilebanking)
2025-06-08 21:43:10,341 - INFO - Verified: Commercial Bank of Ethiopia
2025-06-08 21:43:15,359 - INFO - com.combanketh.mobilebanking: Added 100 reviews (Total: 100)
2025-06-08 21:43:33,786 - INFO - com.combanketh.mobilebanking: Added 100 reviews (Total: 200)
