In [5]:
import pandas as pd
import re


In [6]:
df = pd.read_csv("../data/processed/reviews_cleaned.csv")

print("Rows:", len(df))
df.head()


Rows: 1200


Unnamed: 0,review,rating,date,bank,source
0,‚ù§,5,2025-12-12 16:03:43+00:00,CBE,Google Play
1,its a goos app,5,2025-12-12 12:04:31+00:00,CBE,Google Play
2,Super-easy to use,5,2025-12-12 11:34:24+00:00,CBE,Google Play
3,best,5,2025-12-12 11:29:12+00:00,CBE,Google Play
4,It's great app but Please add(included) water ...,4,2025-12-12 11:05:04+00:00,CBE,Google Play


In [7]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text


In [8]:
df["clean_review"] = df["review"].apply(clean_text)

df[["review", "clean_review"]].head()


Unnamed: 0,review,clean_review
0,‚ù§,
1,its a goos app,its a goos app
2,Super-easy to use,supereasy to use
3,best,best
4,It's great app but Please add(included) water ...,its great app but please addincluded water bil...


In [10]:
from transformers import pipeline

sentiment_model = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [11]:
sentiments = sentiment_model(
    df["review"].fillna("").tolist(),
    truncation=True
)

df["sentiment_label"] = [s["label"] for s in sentiments]
df["sentiment_score"] = [s["score"] for s in sentiments]

df[["review", "sentiment_label", "sentiment_score"]].head()


Unnamed: 0,review,sentiment_label,sentiment_score
0,‚ù§,NEGATIVE,0.697057
1,its a goos app,NEGATIVE,0.984296
2,Super-easy to use,POSITIVE,0.997118
3,best,POSITIVE,0.999794
4,It's great app but Please add(included) water ...,NEGATIVE,0.903313


In [19]:
THEMES = {
    "Account Access": ["login", "password", "otp", "verification", "signin"],
    "Performance": ["slow", "crash", "freeze", "loading", "lag", "stuck"],
    "UI/UX": ["ui", "design", "interface", "navigation", "layout", "screen"],
    "Features": ["transfer", "payment", "bill", "feature", "fingerprint"],
    "Support": ["support", "service", "help", "call", "customer"]
}


In [20]:
def assign_theme(text):
    text = str(text).lower()
    for theme, keywords in THEMES.items():
        if any(k in text for k in keywords):
            return theme
    return "General"


In [21]:
df["theme"] = df["review"].apply(assign_theme)

df[["review", "theme"]].tail(10)


Unnamed: 0,review,theme
1190,The Dashen Super App is very impressive. It is...,UI/UX
1191,betam kerfafa naw,General
1192,exceptional,General
1193,worst app ever,General
1194,"Dashen, always two step ahead.",General
1195,Please try to fix the barrier between other ba...,General
1196,üëç,General
1197,i am using,General
1198,It is a wonderful App developed by a local tal...,Features
1199,wonderful app,General


In [25]:
df.to_csv("../data/processed/analyzed_reviews.csv", index=False)

print("Saved analyzed_reviews.csv")


Saved analyzed_reviews.csv


In [23]:
df.groupby("bank")["sentiment_label"].value_counts(normalize=True)


bank    sentiment_label
BOA     NEGATIVE           0.56
        POSITIVE           0.44
CBE     POSITIVE           0.66
        NEGATIVE           0.34
Dashen  POSITIVE           0.66
        NEGATIVE           0.34
Name: proportion, dtype: float64

In [24]:
df["theme"].value_counts()


theme
General           1008
Performance         52
UI/UX               44
Features            40
Support             40
Account Access      16
Name: count, dtype: int64