# Task 1: Data Collection and Preprocessing

#### Name: Tadele Bizuye

This notebook handles the scraping and preprocessing of mobile banking app reviews from the Google Play Store for:

- Commercial Bank of Ethiopia (CBE)
- Bank of Abyssinia (BOA)
- Dashen Bank

## Objectives:
- Scrape 400+ reviews per bank
- Clean and preprocess the data
- Store cleaned data in CSV format
- Maintain version control with Git


In [2]:
# Import necessary libraries
import pandas as pd
from google_play_scraper import Sort, reviews
from datetime import datetime


In [3]:
from google_play_scraper import reviews, reviews_all
import pandas as pd
import time

def fetch_reviews(app_id, bank_name, n_reviews=400, batch_size=200, delay=1.0, fetch_page=None):
    """Collect up to ``n_reviews`` Google Play reviews for a single app.

    Pages are requested one after another, feeding the continuation token
    returned by each request into the next, until enough reviews have been
    gathered or a request comes back empty.

    Args:
        app_id: Google Play package name of the app to scrape.
        bank_name: Label written to each record's ``'bank'`` field and used
            as the prefix of the progress messages.
        n_reviews: Maximum number of reviews to return.
        batch_size: ``count`` value passed to the page fetcher on each request.
        delay: Seconds to pause between two consecutive page requests.
        fetch_page: Optional callable used instead of ``reviews``. It is
            called as ``fetch_page(app_id, count=..., continuation_token=...)``
            and must return ``(batch, token)``. Lets the function be run
            without network access.

    Returns:
        A list of at most ``n_reviews`` dicts with the keys ``'review'``,
        ``'rating'``, ``'date'`` (``'YYYY-MM-DD'`` string, or ``None`` when
        the review has no timestamp), ``'bank'`` and ``'source'``.
    """
    # Resolve the default lazily so `reviews` is only needed when no
    # replacement fetcher is supplied.
    page_fetcher = reviews if fetch_page is None else fetch_page

    all_reviews = []
    token = None

    while len(all_reviews) < n_reviews:
        review_batch, token = page_fetcher(
            app_id,
            count=batch_size,
            continuation_token=token
        )
        if not review_batch:
            print(f"[{bank_name}] No more reviews returned.")
            break

        for r in review_batch:
            posted_at = r.get('at')
            all_reviews.append({
                'review': r.get('content'),
                'rating': r.get('score'),
                'date': posted_at.strftime('%Y-%m-%d') if posted_at else None,
                'bank': bank_name,
                'source': 'Google Play'
            })
        print(f"[{bank_name}] Collected {len(all_reviews)} reviews so far...")

        # Pause only when another request will actually follow.
        if len(all_reviews) < n_reviews:
            time.sleep(delay)

    # Trim before reporting so the logged count matches what is returned.
    all_reviews = all_reviews[:n_reviews]
    print(f"[{bank_name}] Final count: {len(all_reviews)} reviews")
    return all_reviews

In [4]:
# Scrape each bank's Play Store listing using the default review target
cbe_reviews = fetch_reviews(
    app_id='com.combanketh.mobilebanking',
    bank_name='CBE',
)
boa_reviews = fetch_reviews(
    app_id='com.boa.boaMobileBanking',
    bank_name='BOA',
)
dashen_reviews = fetch_reviews(
    app_id='com.dashen.dashensuperapp',
    bank_name='Dashen',
)

[CBE] Collected 200 reviews so far...
[CBE] Collected 400 reviews so far...
[CBE] Final count: 400 reviews
[BOA] Collected 200 reviews so far...
[BOA] Collected 400 reviews so far...
[BOA] Final count: 400 reviews
[Dashen] Collected 200 reviews so far...
[Dashen] Collected 400 reviews so far...
[Dashen] Final count: 400 reviews


In [5]:

# Combine the three per-bank scrapes into one frame
df = pd.DataFrame(cbe_reviews + boa_reviews + dashen_reviews)

# Remove repeated review text and rows missing text or rating.
# Duplicates are judged per bank: the same text left on two different apps
# is two distinct data points and must not be dropped from the later bank.
df = (
    df.drop_duplicates(subset=['review', 'bank'])
      .dropna(subset=['review', 'rating'])
)

# Normalize date: parse the 'YYYY-MM-DD' strings and keep only the calendar date
df['date'] = pd.to_datetime(df['date']).dt.date

# Save cleaned data (the row index is not written to the file)
df.to_csv('cleaned_bank_reviews.csv', index=False)
df.head()


Unnamed: 0,review,rating,date,bank,source
0,የምትቆርጦት ነገር በዛች እጂ,5,2025-06-10,CBE,Google Play
1,. Reviewing content on Play is a great way to ...,5,2025-06-10,CBE,Google Play
2,So bad now and hard to use,5,2025-06-09,CBE,Google Play
3,"it is so amazing app. but, it is better to upd...",5,2025-06-09,CBE,Google Play
4,v.good app,4,2025-06-09,CBE,Google Play


In [None]:
# Convert all review text to lower case
review_text = df['review']
df['review'] = review_text.str.lower()

In [None]:
# Trim leading/trailing whitespace, then collapse every run of whitespace
# into a single space (punctuation is left untouched by this step)
trimmed = df['review'].str.strip()
df['review'] = trimmed.str.replace(r'\s+', ' ', regex=True)

In [None]:
# Keep only reviews made of more than two whitespace-separated words
word_counts = df['review'].str.split().str.len()
df = df[word_counts > 2]


In [None]:
# Confirm missing data is less than 5% in every column
MAX_MISSING_PCT = 5
missing_pct = df.isna().mean() * 100
print("Missing % per column:\n", missing_pct)

# Enforce the threshold instead of relying on a visual check of the printout
within_limit = missing_pct < MAX_MISSING_PCT
assert within_limit.all(), (
    f"Columns not below {MAX_MISSING_PCT}% missing: "
    f"{missing_pct[~within_limit].to_dict()}"
)


Missing % per column:
 review    0.0
rating    0.0
date      0.0
bank      0.0
source    0.0
dtype: float64


In [None]:
# Write the fully preprocessed reviews to disk, omitting the row index
output_csv = 'final_cleaned_reviews.csv'
df.to_csv(output_csv, index=False)