In [8]:
from google_play_scraper import app, Sort, reviews_all
import pandas as pd
import os
import time
from datetime import datetime

# Bank display name -> package id handed to the scraper as ``app_id``.
banks = {
    "Commercial Bank of Ethiopia": "com.combanketh.mobilebanking",
    "Bank of Abyssinia": "com.boa.boaMobileBanking",
    "Dashen Bank": "com.dashen.dashensuperapp",
}

# Create the output folder up front; a no-op when it is already there.
os.makedirs("data", exist_ok=True)

def scrape_app_reviews(app_id, bank_name, max_reviews=400):
    """
    Fetch up to ``max_reviews`` of the newest Google Play reviews for one app.

    ``google_play_scraper.reviews_all`` discards a ``count`` argument and
    pages through the app's whole review history, so asking it for a limited
    number of reviews downloads everything and throws most of it away.
    ``google_play_scraper.reviews`` honours ``count`` and stops paging once
    enough reviews have been collected, so it is used whenever a limit is set.

    Args:
        app_id: Google Play package id of the app.
        bank_name: Human-readable bank name; only used in the error message.
        max_reviews: Upper bound on the number of reviews returned. ``None``
            means "no limit" and falls back to ``reviews_all``.

    Returns:
        A list of raw review records as produced by ``google_play_scraper``
        (at most ``max_reviews`` of them). An empty list is returned when
        ``max_reviews`` is zero or negative, or when the fetch fails.
    """
    # Nothing was asked for: skip the network round-trip entirely.
    if max_reviews is not None and max_reviews <= 0:
        return []

    try:
        # Imported here because the notebook's import cell does not bring in
        # ``reviews``; keeping it local lets this cell run on its own.
        from google_play_scraper import Sort, reviews, reviews_all

        query = dict(lang='en', country='et', sort=Sort.NEWEST)

        if max_reviews is None:
            # Unbounded request: the full history is what the caller wants.
            return reviews_all(app_id, **query)

        # ``reviews`` returns (records, continuation_token); the token is only
        # needed to resume paging, which a single capped fetch never does.
        fetched, _ = reviews(app_id, count=max_reviews, **query)

        # Defensive cap in case the library hands back more than requested.
        return fetched[:max_reviews]

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller carry on
        # with the remaining apps.
        print(f"Error fetching reviews for {bank_name}: {str(e)}")
        return []

def process_reviews(raw_reviews, bank_name):
    """
    Convert raw review records into flat rows ready for a DataFrame.

    Args:
        raw_reviews: Iterable of dict-like review records. The keys read are
            ``'content'``, ``'score'`` and ``'at'``; ``'at'`` may be a
            ``datetime`` or a ``'%Y-%m-%d %H:%M:%S'`` string.
        bank_name: Value stored in each row's ``'bank'`` field.

    Returns:
        A list of dicts with the keys ``'review'``, ``'rating'``, ``'date'``
        (formatted ``YYYY-MM-DD``), ``'bank'`` and ``'source'``. A record
        that cannot be processed (for example an unparseable date string) is
        reported on stdout and left out instead of aborting the whole batch.
    """
    processed = []

    for review in raw_reviews:
        try:
            # ``or`` instead of a ``.get`` default: a record that carries
            # ``'at': None`` (or an empty value) must fall back to the current
            # time exactly like a record with no ``'at'`` key, rather than
            # failing on ``None.strftime`` and being dropped.
            review_date = review.get('at') or datetime.now()
            if isinstance(review_date, str):
                review_date = datetime.strptime(review_date, '%Y-%m-%d %H:%M:%S')

            processed.append({
                'review': review.get('content', ''),
                'rating': review.get('score', 0),
                'date': review_date.strftime('%Y-%m-%d'),
                'bank': bank_name,
                'source': 'Google Play'
            })
        except Exception as e:
            # One malformed record should not sink the batch.
            print(f"Error processing review: {str(e)}")
            continue

    return processed

def main():
    """
    Scrape, tidy and save Google Play reviews for every app in ``banks``.

    For each bank the reviews are fetched with ``scrape_app_reviews`` and
    flattened with ``process_reviews``. The combined rows are de-duplicated
    on (review text, bank), rows with missing or blank review text are
    removed, and the result is written to ``data/bank_reviews_raw.csv``.
    Nothing is written when no reviews could be collected at all.
    """
    all_reviews = []

    for position, (bank_name, app_id) in enumerate(banks.items()):
        # Rate limiting: pause *between* requests. Sleeping ahead of every
        # request except the first keeps the delay in place even when the
        # previous bank returned nothing, and avoids a pointless wait after
        # the final bank.
        if position:
            time.sleep(5)

        print(f"\nScraping reviews for {bank_name}...")

        # Scrape reviews
        bank_reviews = scrape_app_reviews(app_id, bank_name)

        if not bank_reviews:
            print(f"No reviews found for {bank_name}")
            continue

        # Process reviews
        processed = process_reviews(bank_reviews, bank_name)
        all_reviews.extend(processed)
        print(f"Successfully collected {len(processed)} reviews for {bank_name}")

    if not all_reviews:
        print("\nFailed to collect any reviews. Exiting.")
        return

    # Create DataFrame
    df = pd.DataFrame(all_reviews)

    # Data cleaning
    # NOTE(review): de-duplicating on text alone also collapses different
    # users who wrote the same short review; consider keying on a per-review
    # id if the scraped records carry one.
    df = df.drop_duplicates(subset=['review', 'bank'])
    # Missing text is excluded by ``notna``; whitespace-only text by the strip.
    df = df[df['review'].notna() & (df['review'].str.strip() != '')]

    print(f"\nTotal reviews collected: {len(df)}")

    # Save to CSV
    csv_path = "data/bank_reviews_raw.csv"
    # Re-create the folder here so the save cannot fail if it was removed
    # after the notebook's setup code ran.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    df.to_csv(csv_path, index=False)
    print(f"Data saved to {csv_path}")

if __name__ == "__main__":
    main()


Scraping reviews for Commercial Bank of Ethiopia...
Successfully collected 400 reviews for Commercial Bank of Ethiopia

Scraping reviews for Bank of Abyssinia...
Successfully collected 400 reviews for Bank of Abyssinia

Scraping reviews for Dashen Bank...
Successfully collected 400 reviews for Dashen Bank

Total reviews collected: 1180
Data saved to data/bank_reviews_raw.csv


In [9]:
import pandas as pd

# Load the scraper's raw output next to the cleaned dataset so their schemas
# can be compared.
# NOTE(review): no cell visible here writes bank_reviews_clean.csv; presumably
# a separate preprocessing step produces it - confirm before Run All.
raw_df = pd.read_csv("data/bank_reviews_raw.csv")
clean_df = pd.read_csv("data/bank_reviews_clean.csv")

# Show each frame's column index under its own heading.
for heading, frame in (("Raw Data Columns:", raw_df), ("Clean Data Columns:", clean_df)):
    print(heading, frame.columns)

Raw Data Columns: Index(['review', 'rating', 'date', 'bank', 'source'], dtype='object')
Clean Data Columns: Index(['review', 'rating', 'date', 'bank', 'source', 'cleaned_review'], dtype='object')


In [10]:
# Original raw text and date
raw_preview = raw_df[['review', 'date']].head()
# Processed version of the same rows
clean_preview = clean_df[['cleaned_review', 'date']].head()

print(raw_preview)
print(clean_preview)

                                              review        date
0  The CBE app has been highly unreliable in rece...  2025-05-25
1  this new update(Mar 19,2025) is great in fixin...  2025-03-20
2  Good job to the CBE team on this mobile app! I...  2025-04-04
3  this app has developed in a very good ways but...  2025-05-31
4  as if the update of march 19 i can't take a sc...  2025-03-19
                                      cleaned_review        date
0  cbe app highly unreliable recent weeks frequen...  2025-05-25
1  new updatemar great fixing bugs stability smoo...  2025-03-20
2  good job cbe team mobile app designed way that...  2025-04-04
3  app developed good ways comments need make pre...  2025-05-31
4  update march cant take screenshot app save fil...  2025-03-19


In [11]:
import pandas as pd

# Reload the cleaned dataset and print its leading rows with every column.
df = pd.read_csv("data/bank_reviews_clean.csv")
leading_rows = df.head()
print(leading_rows)

                                              review  rating        date  \
0  The CBE app has been highly unreliable in rece...       2  2025-05-25   
1  this new update(Mar 19,2025) is great in fixin...       4  2025-03-20   
2  Good job to the CBE team on this mobile app! I...       5  2025-04-04   
3  this app has developed in a very good ways but...       5  2025-05-31   
4  as if the update of march 19 i can't take a sc...       2  2025-03-19   

                          bank       source  \
0  Commercial Bank of Ethiopia  Google Play   
1  Commercial Bank of Ethiopia  Google Play   
2  Commercial Bank of Ethiopia  Google Play   
3  Commercial Bank of Ethiopia  Google Play   
4  Commercial Bank of Ethiopia  Google Play   

                                      cleaned_review  
0  cbe app highly unreliable recent weeks frequen...  
1  new updatemar great fixing bugs stability smoo...  
2  good job cbe team mobile app designed way that...  
3  app developed good ways comments need