In [None]:

import pandas as pd
from transformers import pipeline

file = '../data/clean/all_bank_review.csv'
df = pd.read_csv(file)
df.head()

sentiment_analysis = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

def get_sentiment(text):
    try:
        result = sentiment_analysis(text[:512])[0]
        return result['label'], result['score']
    except:
        return None, None

df['sentiment'], df['confidence'] = zip(*df['review'].apply(get_sentiment))


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [None]:
df['sentiment_score'] = df['sentiment'].map({
    'POSITIVE': 1,
    'NEGATIVE': -1
})

mean_by_bank = df.groupby('bank')['sentiment_score'].mean()
print(mean_by_bank)

mean_by_rating = df.groupby('rating')['sentiment_score'].mean()
print(mean_by_rating)

In [20]:

import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


def clean_review(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

df['clean_review'] = df['review'].apply(clean_review)



bank_keywords = {}
for bank in df['bank'].unique():
    bank_df = df[df['bank'] == bank]
    
    tfidf_bank = TfidfVectorizer(max_features=50, stop_words='english', ngram_range=(1,2))
    matrix = tfidf_bank.fit_transform(bank_df['clean_review'])
    
    features = tfidf_bank.get_feature_names_out()
    scores = matrix.sum(axis=0).A1
    
    bank_keywords[bank] = pd.DataFrame({
        'keyword': features,
        'score': scores
    }).sort_values(by='score', ascending=False)
    
print(f"CBE:{bank_keywords['CBE'].head(10)}")
print(f"Dashen:{bank_keywords['Dashen'].head(10)}")
print(f"Abyssinia:{bank_keywords['Abyssinia'].head(10)}")

    


    

CBE:        keyword      score
3           app  87.804446
22         good  49.574264
9          best  32.677935
7          bank  28.255818
12          cbe  26.346086
25         like  20.631803
32         nice  20.125552
4   application  19.683541
45          use  18.643827
44       update  16.288409
Dashen:        keyword       score
3           app  110.014666
5          bank   42.190864
21         good   40.190305
11       dashen   40.135515
9          best   37.622892
40        super   30.836284
7       banking   30.289986
45          use   26.261826
2       amazing   25.146819
12  dashen bank   24.128261
Abyssinia:    keyword       score
0       app  121.027973
19     good   38.499570
4      bank   34.081896
47  working   30.371392
46     work   28.701190
7      best   25.109644
48    worst   23.055754
42   update   22.338612
5   banking   21.936554
25   mobile   21.175564


In [16]:
theme_dict_banks = {
    "Application & Performance": [
        "app", "application", "update", "work", "mobile", "mobile banking", 
        "use", "send", "fix", "problem", "working", "access", "screenshot", 
        "open", "developer"
    ],
    "User Experience & Satisfaction": [
        "good", "best", 'bad', "like", "nice", "easy", "fast", "great", "excellent",
        "amazing", "love", "best app", 'special', "easy use", "simple", "better", "using"
    ],
    "Customer Support": [
        "service", "bank", "banking", "money", "transaction", "transactions",
        "account", "telebirr", "make", "need"
    ]
}

def assign_theme(review, theme_dict_banks):
    matched_themes = []
    review = str(review).lower()
    for theme, key in theme_dict_banks.items():
        for kw in key:
            if kw in review:
                matched_themes.append(theme)
                break
    return matched_themes if matched_themes else ['Other']
    

In [17]:
df['themes'] = df['review'].apply(lambda x: assign_theme(x, theme_dict_banks))

df[['review', 'bank', 'themes']].head(20)

Unnamed: 0,review,bank,themes
0,cbe,CBE,[Other]
1,its special for me,CBE,[User Experience & Satisfaction]
2,make it user friendly,CBE,"[Application & Performance, Customer Support]"
3,maaliif daddafee install gaafata,CBE,[Other]
4,good app,CBE,"[Application & Performance, User Experience & ..."
5,this application is very important and advanta...,CBE,"[Application & Performance, Customer Support]"
6,why didnt work this app,CBE,[Application & Performance]
7,the app makes our life easier thank you cbe,CBE,"[Application & Performance, Customer Support]"
8,this app very bad,CBE,"[Application & Performance, User Experience & ..."
9,the most advanced app but how to stay safe,CBE,[Application & Performance]


In [19]:
columns_to_save = ['review', 'clean_review', 'rating', 'date', 'bank', 'source', 'sentiment', 'confidence', 'sentiment_score', 'themes']

df[columns_to_save].to_csv('../data/processed/all_bank_reviews_with_themes.csv', index=False)
