In [1]:
!pip install google-play-scraper pandas tqdm


Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, google-play-scraper
Successfully installed google-play-scraper-1.2.7 tqdm-4.67.1



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from google_play_scraper import Sort, reviews
import pandas as pd
from tqdm import tqdm

def fetch_reviews(app_id, bank_name, lang='en', country='us', total=400):
    all_reviews = []
    batch_size = 100
    fetched = 0

    print(f"Fetching reviews for {bank_name}...")

    while fetched < total:
        result, _ = reviews(
            app_id,
            lang=lang,
            country=country,
            sort=Sort.NEWEST,
            count=min(batch_size, total - fetched),
            filter_score_with=None  # Get all ratings
        )
        for review in result:
            all_reviews.append({
                'review': review['content'],
                'rating': review['score'],
                'date': review['at'].date(),
                'bank': bank_name,
                'source': 'Google Play'
            })
        fetched += len(result)
        if not result:
            break  # Stop if no more reviews available

    return pd.DataFrame(all_reviews)


In [11]:
from google_play_scraper import Sort, reviews
import pandas as pd

def fetch_reviews(app_id, bank_name, total=100):
    all_reviews = []
    count = 0
    next_token = None

    while count < total:
        fetched, next_token = reviews(
            app_id,
            lang='en',
            country='us',
            sort=Sort.NEWEST,
            count=100,  # Try fetching in 100s
            continuation_token=next_token
        )
        if not fetched:
            break

        for r in fetched:
            all_reviews.append({
                'review': r.get('content'),
                'rating': r.get('score'),
                'date': r.get('at'),
                'bank': bank_name,
                'source': 'Google Play'
            })
        count += len(fetched)

    df = pd.DataFrame(all_reviews)
    return df

# 🔍 Test with CBE App ID (Make sure this ID is correct)
df_cbe = fetch_reviews('com.combanketh.mobilebanking', 'CBE', total=200)
print(df_cbe.shape)
df_cbe.head()


(200, 5)


Unnamed: 0,review,rating,date,bank,source
0,really am happy to this app it is Siple to use...,5,2025-06-07 01:02:38,CBE,Google Play
1,I liked this app. But the User interface is ve...,2,2025-06-07 00:50:29,CBE,Google Play
2,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-05 23:54:11,CBE,Google Play
3,what is this app problem???,1,2025-06-05 12:16:56,CBE,Google Play
4,the app is proactive and a good connections.,5,2025-06-05 05:55:10,CBE,Google Play


In [12]:
# Fetch reviews for all 3 banks
df_cbe = fetch_reviews('com.combanketh.mobilebanking', 'CBE')
df_boa = fetch_reviews('com.boa.boaMobileBanking&hl', 'BOA')
df_dashen = fetch_reviews('com.dashen.dashensuperapp&hl', 'Dashen')

# Combine and check
df_all = pd.concat([df_cbe, df_boa, df_dashen], ignore_index=True)
print(df_all.shape)
df_all.head()


(100, 5)


Unnamed: 0,review,rating,date,bank,source
0,really am happy to this app it is Siple to use...,5,2025-06-07 01:02:38,CBE,Google Play
1,I liked this app. But the User interface is ve...,2,2025-06-07 00:50:29,CBE,Google Play
2,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-05 23:54:11,CBE,Google Play
3,what is this app problem???,1,2025-06-05 12:16:56,CBE,Google Play
4,the app is proactive and a good connections.,5,2025-06-05 05:55:10,CBE,Google Play


In [14]:
# Remove duplicates
df_all.drop_duplicates(subset='review', inplace=True)

# Drop empty reviews
df_all.dropna(subset=['review'], inplace=True)

# Normalize date
df_all['date'] = pd.to_datetime(df_all['date']).dt.strftime('%Y-%m-%d')

df_all.reset_index(drop=True, inplace=True)
df_all.to_csv('../data/bank_reviews_clean.csv', index=False)

print("✅ Cleaned data saved to data/bank_reviews_clean.csv")


✅ Cleaned data saved to data/bank_reviews_clean.csv


In [15]:
df_all['bank'].value_counts()


bank
CBE    86
Name: count, dtype: int64