In [1]:
from google_play_scraper import app, Sort, reviews_all
import pandas as pd
from datetime import datetime
import time
import os
import json
from tqdm import tqdm  # For progress bars

# Scraping configuration.
# Bank display name -> Google Play package ID that will be scraped.
# NOTE(review): the run output recorded under this cell reports all three IDs
# as invalid -- confirm each package ID against its Play Store listing.
BANK_APPS = dict([
    ('Commercial Bank of Ethiopia', 'com.cbe.cbe'),
    ('Bank of Abyssinia', 'com.boa.mobilebanking'),
    ('Dashen Bank', 'com.dashen.mobilebanking'),
])

# Keyword arguments that scrape_reviews() unpacks into reviews_all().
SCRAPE_SETTINGS = dict(
    lang='en',                # Language
    country='et',             # Country (Ethiopia)
    sort=Sort.NEWEST,         # Sort order
    sleep_milliseconds=2000,  # Delay between requests
)

def validate_app_id(app_id):
    """Check whether the app can be looked up before scraping it.

    Args:
        app_id: Google Play package ID handed to ``app()``.

    Returns:
        True when ``app(app_id)`` completes without raising, False when it
        raises any ``Exception``.
    """
    try:
        app(app_id)
    except Exception as exc:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop the run.
        # Print the cause so a rejected ID can be diagnosed from the output.
        print(f"Lookup failed for {app_id}: {type(exc).__name__}: {exc}")
        return False
    return True

def scrape_reviews(app_name, app_id, max_retries=3):
    """Scrape all Google Play reviews for one app, retrying on failure.

    Args:
        app_name: Bank name stored in the ``bank`` column and used in messages.
        app_id: Google Play package ID passed to ``app()`` and ``reviews_all()``.
        max_retries: Maximum number of attempts.

    Returns:
        DataFrame with the columns ``review``, ``rating``, ``date`` (text in
        ``YYYY-MM-DD HH:MM:SS`` form), ``bank``, ``app_version`` and ``source``.
        An empty DataFrame is returned when no reviews are found, when every
        attempt fails, or when ``max_retries`` is less than 1.
    """
    for attempt in range(max_retries):
        try:
            print(f"\nAttempt {attempt + 1} for {app_name}...")

            # Get app info first for validation
            app_info = app(app_id)
            print(f"App found: {app_info['title']} (Version: {app_info['version']})")

            print("Scraping reviews...")
            reviews = reviews_all(
                app_id,
                **SCRAPE_SETTINGS
            )

            if not reviews:
                print(f"No reviews found for {app_name}")
                return pd.DataFrame()

            # Build the result as a fresh frame (select -> rename -> assign)
            # instead of assigning into a column slice, which can trigger
            # SettingWithCopyWarning.
            df = (
                pd.DataFrame(reviews)[['content', 'score', 'at']]
                .rename(columns={'content': 'review', 'score': 'rating', 'at': 'date'})
                .assign(
                    date=lambda frame: pd.to_datetime(frame['date']).dt.strftime('%Y-%m-%d %H:%M:%S'),
                    bank=app_name,
                    app_version=app_info['version'],
                    source='Google Play',
                )
            )

            print(f"Successfully scraped {len(df)} reviews for {app_name}")
            return df

        except Exception as e:
            print(f"Attempt {attempt + 1} failed for {app_name}: {str(e)}")
            if attempt < max_retries - 1:
                wait_time = (attempt + 1) * 5  # Linear backoff: 5s, 10s, ...
                print(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)

    # Reached when every attempt failed or when max_retries < 1; always hand
    # back a DataFrame so callers can rely on ``.empty``.
    return pd.DataFrame()

def save_metadata(app_data):
    """Write the collected app metadata to a timestamped JSON file.

    The file is created as ``metadata/app_metadata_<YYYYmmdd_HHMMSS>.json``
    relative to the current working directory; the ``metadata`` folder is
    created first if it does not exist. The resulting path is printed.
    """
    os.makedirs('metadata', exist_ok=True)
    # Formatting the datetime through an f-string spec is the same as strftime.
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    target = f"metadata/app_metadata_{stamp}.json"
    with open(target, 'w') as handle:
        json.dump(app_data, handle, indent=2)
    print(f"\nApp metadata saved to {target}")

if __name__ == "__main__":
    # One DataFrame per bank that yielded reviews. They are combined once after
    # the loop instead of re-copying a growing frame with pd.concat each pass.
    review_frames = []
    app_metadata = {}

    print("Starting Google Play Store review scraping...")
    print(f"Target banks: {', '.join(BANK_APPS.keys())}\n")

    for bank_name, app_id in tqdm(BANK_APPS.items(), desc="Processing Banks"):
        if not validate_app_id(app_id):
            print(f"\nInvalid app ID for {bank_name}: {app_id}")
            continue

        bank_reviews = scrape_reviews(bank_name, app_id)

        if not bank_reviews.empty:
            review_frames.append(bank_reviews)

        # Store app metadata; a failure here is reported but does not stop
        # the remaining banks from being processed.
        try:
            app_info = app(app_id)
            app_metadata[bank_name] = {
                'app_id': app_id,
                'version': app_info.get('version', 'N/A'),
                'install_count': app_info.get('installs', 'N/A'),
                'average_rating': app_info.get('score', 'N/A'),
                'last_updated': app_info.get('updated', 'N/A')
            }
        except Exception as e:
            print(f"Could not fetch metadata for {bank_name}: {str(e)}")

    # pd.concat raises on an empty list, so fall back to an empty frame; this
    # also keeps `all_reviews` defined whatever the outcome.
    all_reviews = (
        pd.concat(review_frames, ignore_index=True) if review_frames else pd.DataFrame()
    )

    if not all_reviews.empty:
        # Save reviews
        os.makedirs('data', exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"data/bank_reviews_{timestamp}.csv"

        # Additional data cleaning: one row per (review text, bank), then
        # newest first within each bank. `date` is zero-padded text here, so
        # string ordering matches chronological ordering.
        all_reviews = all_reviews.drop_duplicates(subset=['review', 'bank'])
        all_reviews = all_reviews.sort_values(by=['bank', 'date'], ascending=[True, False])

        all_reviews.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"\nSuccessfully saved {len(all_reviews)} reviews to {filename}")

        # Save metadata
        save_metadata(app_metadata)

        # Print summary
        print("\nScraping Summary:")
        print(f"- Total reviews collected: {len(all_reviews)}")
        print("- Reviews per bank:")
        print(all_reviews['bank'].value_counts().to_string())
    else:
        print("\nNo reviews were scraped from any bank.")

Starting Google Play Store review scraping...
Target banks: Commercial Bank of Ethiopia, Bank of Abyssinia, Dashen Bank



Processing Banks:  33%|███▎      | 1/3 [00:01<00:03,  1.81s/it]


Invalid app ID for Commercial Bank of Ethiopia: com.cbe.cbe


Processing Banks:  67%|██████▋   | 2/3 [00:02<00:01,  1.35s/it]


Invalid app ID for Bank of Abyssinia: com.boa.mobilebanking


Processing Banks: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


Invalid app ID for Dashen Bank: com.dashen.mobilebanking

No reviews were scraped from any bank.





In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

def preprocess_reviews(input_file, output_file):
    """Clean a scraped reviews CSV and write the result to a new CSV.

    Steps: drop rows with a missing review, turn ratings into integers
    (missing or non-numeric ratings become the neutral value 3), remove
    duplicate (review, bank) pairs and normalise dates to ``YYYY-MM-DD``.

    Args:
        input_file: CSV to read; it must provide the columns ``review``,
            ``rating``, ``bank`` and ``date``.
        output_file: Destination for the cleaned CSV. When it is a path, any
            missing parent folders are created.

    Returns:
        The cleaned DataFrame that was written to ``output_file``.
    """
    import os  # local import: this cell's import list does not include os

    # Read the scraped data
    df = pd.read_csv(input_file)

    # 1. Handle missing data
    print(f"Initial count: {len(df)}")

    # Drop rows with missing reviews
    df = df.dropna(subset=['review'])
    print(f"After dropping missing reviews: {len(df)}")

    # Fill missing ratings with neutral (3). Coerce first so that non-numeric
    # text is treated as missing instead of crashing the integer cast below.
    df = df.assign(rating=pd.to_numeric(df['rating'], errors='coerce').fillna(3))

    # 2. Remove duplicates
    df = df.drop_duplicates(subset=['review', 'bank'])
    print(f"After removing duplicates: {len(df)}")

    # 3. Normalize dates and 4. ensure proper data types
    df = df.assign(
        date=pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d'),
        rating=df['rating'].astype(int),
    )

    # Create the destination folder when output_file is a path; file-like
    # objects are passed through to to_csv untouched.
    if isinstance(output_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_file))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Save cleaned data
    df.to_csv(output_file, index=False)
    print(f"Saved cleaned data to {output_file}")
    return df

if __name__ == "__main__":
    import sys

    # Exactly two positional paths are required. Checking only
    # len(sys.argv) == 3 is not enough: a Jupyter kernel is typically started
    # with its own option arguments (e.g. "-f <connection file>"), which would
    # then be mistaken for the input and output paths. Option-style arguments
    # are therefore rejected as well.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2 or any(arg.startswith('-') for arg in cli_args):
        print("Usage: python preprocess_reviews.py <input_file> <output_file>")
    else:
        preprocess_reviews(cli_args[0], cli_args[1])

Usage: python preprocess_reviews.py <input_file> <output_file>
