In [1]:
import os
import pandas as pd
import numpy as np
from time import time
from typing import List, Optional, Dict, Tuple

# NLP / transformers
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# TF-IDF + clustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD

# Utilities
from collections import Counter


In [2]:
# Load the cleaned Task 1 dataset you created earlier
df = pd.read_csv("../data/processed/clean_reviews.csv")

# Basic check
print("Total rows:", len(df))
df.head()


Total rows: 326


Unnamed: 0,review,rating,date,bank,source,clean_review,slow,crash,login,error,bug,transfer,fail,sentiment
0,maaliif daddafee install gaafata,3,2025-11-28,Commercial Bank of Ethiopia,Google Play Store,maaliif daddafee install gaafata,False,False,False,False,False,False,False,Neutral
1,good app,5,2025-11-28,Commercial Bank of Ethiopia,Google Play Store,good app,False,False,False,False,False,False,False,Positive
2,This application is very important and advanta...,5,2025-11-27,Commercial Bank of Ethiopia,Google Play Store,application important advantage transfer money...,False,False,False,False,False,True,False,Positive
3,why didn't work this app?,1,2025-11-27,Commercial Bank of Ethiopia,Google Play Store,work app,False,False,False,False,False,False,False,Negative
4,The app makes our life easier. Thank you CBE!,5,2025-11-27,Commercial Bank of Ethiopia,Google Play Store,app makes life easier thank cbe,False,False,False,False,False,False,False,Positive


In [3]:
class SentimentAnalyzer:
    def __init__(self, use_transformer=True, model_name="distilbert-base-uncased-finetuned-sst-2-english", device=-1):
        """
        If use_transformer True, attempts to load HuggingFace pipeline for sentiment.
        If fails, falls back to VADER for rule-based scores.
        device=-1 uses CPU. For GPU set device=0 (if available).
        """
        self.use_transformer = use_transformer
        self.model_name = model_name
        self.device = device
        self.transformer = None
        self.vader = None

        if self.use_transformer:
            try:
                # load transformer sentiment pipeline
                print("Loading transformer model (this may take a while)...")
                self.transformer = pipeline("sentiment-analysis", model=self.model_name, device=self.device)
                print("Transformer model loaded.")
            except Exception as e:
                print("Transformer load failed:", e)
                print("Falling back to VADER.")
                self.use_transformer = False

        if not self.use_transformer:
            self.vader = SentimentIntensityAnalyzer()
            print("VADER initialized.")

    def predict_batch(self, texts: List[str], batch_size: int = 32) -> List[Dict]:
        """
        Returns a list of dicts: {'label': 'POSITIVE'/'NEGATIVE'/'NEUTRAL', 'score': float}
        We treat low-confidence transformer outputs as NEUTRAL when score < 0.6.
        """
        results = []
        if self.transformer:
            for i in range(0, len(texts), batch_size):
                batch = texts[i:i+batch_size]
                preds = self.transformer(batch, truncation=True)
                for p in preds:
                    label = p.get('label', 'NEUTRAL')
                    score = float(p.get('score', 0.0))
                    # treat low confidence as neutral
                    if score < 0.60:
                        results.append({'label': 'NEUTRAL', 'score': score})
                    else:
                        results.append({'label': label, 'score': score})
        else:
            for t in texts:
                vs = self.vader.polarity_scores(str(t))
                # compound in [-1,1]
                comp = vs['compound']
                if comp >= 0.05:
                    label = 'POSITIVE'
                elif comp <= -0.05:
                    label = 'NEGATIVE'
                else:
                    label = 'NEUTRAL'
                results.append({'label': label, 'score': comp})
        return results

    def annotate_df(self, df: pd.DataFrame, text_col: str = 'review', out_label_col='sentiment_label', out_score_col='sentiment_score'):
        texts = df[text_col].fillna("").astype(str).tolist()
        preds = self.predict_batch(texts)
        labels = [p['label'] for p in preds]
        scores = [p['score'] for p in preds]
        df[out_label_col] = labels
        df[out_score_col] = scores
        return df


In [4]:
# Initialize: set use_transformer=True to use DistilBERT (recommended)
sent_analyzer = SentimentAnalyzer(use_transformer=True, device=-1)

# Annotate df (this will take time on CPU for many rows)
t0 = time()
df = sent_analyzer.annotate_df(df, text_col='review', out_label_col='sentiment_label', out_score_col='sentiment_score')
t1 = time()
print(f"Annotated {len(df)} rows in {t1-t0:.1f}s")

# KPI: percent annotated
annotated_pct = df['sentiment_label'].notna().mean() * 100
print(f"Percent with sentiment label: {annotated_pct:.1f}%")


Loading transformer model (this may take a while)...


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:  39%|###9      | 105M/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Transformer model loaded.
Annotated 326 rows in 8.0s
Percent with sentiment label: 100.0%


In [5]:
# Ensure rating numeric
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Mean sentiment score by bank
sent_by_bank = df.groupby('bank')['sentiment_score'].mean().sort_values(ascending=False)
print("Mean sentiment score by bank:")
print(sent_by_bank)

# Mean sentiment by bank & rating
agg = df.groupby(['bank','rating'])['sentiment_score'].mean().unstack(fill_value=np.nan)
agg


Mean sentiment score by bank:
bank
Commercial Bank of Ethiopia    0.962261
Name: sentiment_score, dtype: float64


rating,1,2,3,4,5
bank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Commercial Bank of Ethiopia,0.965377,0.99517,0.980915,0.982131,0.954599


In [6]:
class ThematicAnalyzer:
    def __init__(self, ngram_range=(1,2), max_features=2000):
        self.ngram_range = ngram_range
        self.max_features = max_features
        self.vectorizer = TfidfVectorizer(ngram_range=self.ngram_range, max_features=self.max_features, stop_words='english')
        self.tfidf_matrix = None
        self.feature_names = None
        self.kmeans = None
        self.labels = None

    def fit_tfidf(self, texts: List[str]):
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)
        self.feature_names = np.array(self.vectorizer.get_feature_names_out())
        return self

    def top_n_terms_for_doc(self, doc_idx: int, n=10):
        row = self.tfidf_matrix[doc_idx]
        # get nonzero indices and sort desc by tfidf
        coo = row.tocoo()
        tuples = sorted(zip(coo.col, coo.data), key=lambda x: x[1], reverse=True)[:n]
        return [(self.feature_names[idx], float(score)) for idx, score in tuples]

    def global_top_n(self, n=30):
        # average TF-IDF across docs, then top features
        avg = np.asarray(self.tfidf_matrix.mean(axis=0)).ravel()
        top_idx = np.argsort(avg)[::-1][:n]
        return [(self.feature_names[i], float(avg[i])) for i in top_idx]

    def cluster_documents(self, n_clusters=5, reduce_dim=50, random_state=42):
        # optional SVD to speed up clustering
        if reduce_dim and self.tfidf_matrix.shape[1] > reduce_dim:
            svd = TruncatedSVD(n_components=reduce_dim, random_state=random_state)
            reduced = svd.fit_transform(self.tfidf_matrix)
        else:
            reduced = self.tfidf_matrix
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        self.labels = self.kmeans.fit_predict(reduced)
        return self.labels

    def top_terms_per_cluster(self, n_terms=10):
        if self.kmeans is None:
            raise RuntimeError("Call cluster_documents first.")
        order_centroids = self.kmeans.cluster_centers_.argsort()[:, ::-1]
        terms = []
        for i in range(self.kmeans.n_clusters):
            top_terms = [self.feature_names[ind] for ind in order_centroids[i, :n_terms]]
            terms.append((i, top_terms))
        return terms


In [7]:
thematic = ThematicAnalyzer(ngram_range=(1,2), max_features=3000)

# We'll analyze per-bank so themes are bank-specific
bank_themes = {}
for bank in df['bank'].unique():
    texts = df[df['bank'] == bank]['clean_review'].fillna("").astype(str).tolist()
    if len(texts) == 0:
        bank_themes[bank] = {'top_terms': [], 'clusters': None}
        continue
    thematic.fit_tfidf(texts)
    top_terms = thematic.global_top_n(n=50)
    # Save top single/bi-grams
    bank_themes[bank] = {'top_terms': top_terms}
    print(f"\nBank: {bank} — top terms:")
    for term, score in top_terms[:20]:
        print(term, end=", ")
    print()



Bank: Commercial Bank of Ethiopia — top terms:
app, good, best, nice, cbe, like, bank, application, good app, best app, love, excellent, update, nice app, bad, use, banking, apps, wow, useful, 


In [9]:
df['clean_review'] = df['clean_review'].fillna("")


In [10]:
# Example theme keywords — adapt after inspecting top_terms above
theme_keyword_map = {
    'Account Access Issues': ['login', 'otp', 'password', 'biometric', 'fingerprint', 'signin', 'sign in'],
    'Transaction Performance': ['slow', 'transfer', 'timeout', 'processing', 'delay', 'transaction', 'payment'],
    'Reliability & Crashes': ['crash', 'freeze', 'stuck', 'error', 'bug', 'fail', 'failed'],
    'User Interface & Experience': ['ui', 'design', 'menu', 'navigation', 'layout', 'app interface', 'easy to use'],
    'Customer Support': ['support', 'customer service', 'help', 'agent', 'response']
}

def assign_themes(text: str, theme_map: Dict[str, List[str]]) -> List[str]:
    text_l = text.lower()
    matched = []
    for theme, keywords in theme_map.items():
        for kw in keywords:
            if kw in text_l:
                matched.append(theme)
                break
    if not matched:
        matched = ['Other']
    return matched

# Apply theme assignment (creates a list column and a first_theme column)
df['identified_themes'] = df['clean_review'].apply(lambda t: assign_themes(t, theme_keyword_map))
df['primary_theme'] = df['identified_themes'].apply(lambda lst: lst[0] if isinstance(lst, list) and len(lst)>0 else 'Other')

# Quick counts per bank
for bank in df['bank'].unique():
    print("\nBank:", bank)
    print(df[df['bank']==bank]['primary_theme'].value_counts().head(10))



Bank: Commercial Bank of Ethiopia
primary_theme
Other                          291
Transaction Performance         22
Account Access Issues            6
Customer Support                 3
Reliability & Crashes            2
User Interface & Experience      2
Name: count, dtype: int64


In [11]:
# For one bank (example: CBE)
bank_name = df['bank'].unique()[0]  # change as needed
texts = df[df['bank']==bank_name]['clean_review'].fillna("").astype(str).tolist()
if len(texts) >= 50:
    thematic.fit_tfidf(texts)
    labels = thematic.cluster_documents(n_clusters=5, reduce_dim=50)
    terms_per_cluster = thematic.top_terms_per_cluster(n_terms=15)
    print("Top terms per cluster:")
    for cid, terms in terms_per_cluster:
        print(f"Cluster {cid}:", ", ".join(terms[:10]))
else:
    print("Not enough docs to cluster for", bank_name)


Top terms per cluster:
Cluster 0: abebaw zenebe, abraahim, abebaw, abdulakim, abroad countries, abaoli, access app, absolute trash, absolutely right, abdulakim abrahim
Cluster 1: abaoli, abdulakim abrahim, able, abebaw, abdulakim, absolute, able send, abraahim, abrahim, access properly
Cluster 2: abfixa, abebaw zenebe, abebaw, abdulakim abrahim, abdulakim, absolute trash, access txn, abrahim elemoo, able appear, abaoli
Cluster 3: abdulakim, abaoli, absolutely excellent, absolute trash, abfixa, able transfer, active app, absolutely right, able appear, accounts single
Cluster 4: able transfer, abrahim elemoo, abaoli, abebaw, abdulakim abrahim, absolutely right, abdulakim, absolutely excellent, access properly, absolute trash


In [12]:
# Save columns suggested in Task 2:
# review_id (create index), review_text, sentiment_label, sentiment_score, identified_theme(s), primary_theme, bank, rating, date

df_out = df.copy().reset_index().rename(columns={'index':'review_id'})
df_out = df_out[['review_id','review','clean_review','rating','sentiment_label','sentiment_score','identified_themes','primary_theme','bank','date']]
os.makedirs("data/processed", exist_ok=True)
out_path = "data/processed/reviews_with_sentiment_theme.csv"
df_out.to_csv(out_path, index=False)
print("Saved:", out_path)


Saved: data/processed/reviews_with_sentiment_theme.csv


In [13]:
# KPI: percent of rows with sentiment scores
pct_sentiment = df_out['sentiment_label'].notna().mean() * 100
print(f"Sentiment labeled for: {pct_sentiment:.1f}% of reviews (goal: >= 90%)")

# KPI: themes per bank
for bank in df_out['bank'].unique():
    themes = df_out[df_out['bank']==bank]['primary_theme'].value_counts().head(10)
    print("\nBank:", bank)
    print(themes)

# Example: show 3 example reviews per detected theme for one bank
bank = df_out['bank'].unique()[0]
for theme in df_out[df_out['bank']==bank]['primary_theme'].unique()[:5]:
    print(f"\nExamples for {theme}:")
    examples = df_out[(df_out['bank']==bank) & (df_out['primary_theme']==theme)]['review'].head(3).tolist()
    for ex in examples:
        print("-", ex)


Sentiment labeled for: 100.0% of reviews (goal: >= 90%)

Bank: Commercial Bank of Ethiopia
primary_theme
Other                          291
Transaction Performance         22
Account Access Issues            6
Customer Support                 3
Reliability & Crashes            2
User Interface & Experience      2
Name: count, dtype: int64

Examples for Other:
- maaliif daddafee install gaafata
- good app
- why didn't work this app?

Examples for Transaction Performance:
- This application is very important and advantage for transfer of money and finance in the coutry and foriegn country.
- not allowing to transfer and showing current statement updates.
- I am not able to transfer. The app is not responding

Examples for Customer Support:
- thanks blc this app help me to use all time
- sync problem may 22 2025 but the date stack on may 8 2025 help pls
- I love this app b/c every option in the app very clear and supportive.

Examples for Account Access Issues:
- ጥሩ App ነዉ ። የኔ የግል አስተያየት