In [47]:
# Import necessary libraries
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


In [56]:
import pandas as pd

# Load cleaned reviews for all three banks
df_cbe = pd.read_csv('../data/cleaned_reviews_CBE.csv')
df_boa = pd.read_csv('../data/cleaned_reviews_BOA.csv')
df_dashen = pd.read_csv('../data/cleaned_reviews_Dashen.csv')

# Concatenate all dataframes into one combined dataframe
df_reviews = pd.concat([df_cbe, df_boa, df_dashen], ignore_index=True)

# Check the combined dataset
df_reviews.head()


Unnamed: 0,review,rating,date,bank,source
0,bayeegar,5,2025-08-15,CBE,Google Play
1,አሪፋ ነዉ ነገር ግን Recent Transaction በጣም የቆየዉን ነዉ ...,2,2025-08-15,CBE,Google Play
2,congra ethio,5,2025-08-15,CBE,Google Play
3,እንዲሁ ነው የሰተኋችሁ ለሞራል,5,2025-08-15,CBE,Google Play
4,nice,3,2025-08-14,CBE,Google Play


In [57]:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

# Apply VADER sentiment analysis
df_reviews['vader_sentiment'] = df_reviews['review'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
df_reviews['vader_sentiment_label'] = df_reviews['vader_sentiment'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'neutral'))

# Apply TextBlob sentiment analysis
df_reviews['textblob_sentiment'] = df_reviews['review'].apply(lambda x: TextBlob(x).sentiment.polarity)
df_reviews['textblob_sentiment_label'] = df_reviews['textblob_sentiment'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'neutral'))

# Display the sentiment results
df_reviews[['review', 'vader_sentiment', 'vader_sentiment_label', 'textblob_sentiment', 'textblob_sentiment_label']].head()


Unnamed: 0,review,vader_sentiment,vader_sentiment_label,textblob_sentiment,textblob_sentiment_label
0,bayeegar,0.0,neutral,0.0,neutral
1,አሪፋ ነዉ ነገር ግን Recent Transaction በጣም የቆየዉን ነዉ ...,0.3182,positive,0.0,neutral
2,congra ethio,0.0,neutral,0.0,neutral
3,እንዲሁ ነው የሰተኋችሁ ለሞራል,0.0,neutral,0.0,neutral
4,nice,0.4215,positive,0.6,positive


In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=50)
X = vectorizer.fit_transform(df_reviews['review'])

# Fit KMeans clustering model with 3 clusters (Themes)
num_clusters = 3  # You can adjust this to 4 or 5 based on the clustering results
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)

# Get top 10 words for each cluster (theme)
centroids = kmeans.cluster_centers_
terms = vectorizer.get_feature_names_out()
themes = {}
for i in range(num_clusters):
    theme_keywords = [terms[ind] for ind in centroids[i].argsort()[-10:]]
    themes[f'Theme {i+1}'] = theme_keywords

# Display the themes
themes


{'Theme 1': ['experience',
  'good',
  'great',
  'banking',
  'bank',
  'mobile',
  'use',
  'app',
  'boa',
  'time'],
 'Theme 2': ['amazing',
  'like',
  'banking',
  'application',
  'use',
  'nice',
  'bank',
  'best',
  'good',
  'app'],
 'Theme 3': ['easy',
  'experience',
  'fast',
  'banking',
  'super app',
  'bank',
  'app',
  'dashen bank',
  'super',
  'dashen']}

In [59]:
# Function to assign theme based on cluster prediction
def assign_theme(review, kmeans, vectorizer):
    # Transform the review to TF-IDF features
    review_tfidf = vectorizer.transform([review])
    
    # Predict the cluster for the review
    cluster = kmeans.predict(review_tfidf)[0]
    
    return f'Theme {cluster+1}'

# Apply the function to assign themes to reviews
df_reviews['theme'] = df_reviews['review'].apply(lambda x: assign_theme(x, kmeans, vectorizer))

# Display the first few rows to confirm
df_reviews[['review', 'vader_sentiment_label', 'textblob_sentiment_label', 'theme']].head()


Unnamed: 0,review,vader_sentiment_label,textblob_sentiment_label,theme
0,bayeegar,neutral,neutral,Theme 2
1,አሪፋ ነዉ ነገር ግን Recent Transaction በጣም የቆየዉን ነዉ ...,positive,neutral,Theme 2
2,congra ethio,neutral,neutral,Theme 2
3,እንዲሁ ነው የሰተኋችሁ ለሞራል,neutral,neutral,Theme 2
4,nice,positive,positive,Theme 2


In [60]:
# Save the processed data to a CSV file for further analysis or submission
df_reviews.to_csv('../data/processed_reviews_all_banks.csv', index=False)

# Confirm the data is saved
df_reviews.head()


Unnamed: 0,review,rating,date,bank,source,vader_sentiment,vader_sentiment_label,textblob_sentiment,textblob_sentiment_label,theme
0,bayeegar,5,2025-08-15,CBE,Google Play,0.0,neutral,0.0,neutral,Theme 2
1,አሪፋ ነዉ ነገር ግን Recent Transaction በጣም የቆየዉን ነዉ ...,2,2025-08-15,CBE,Google Play,0.3182,positive,0.0,neutral,Theme 2
2,congra ethio,5,2025-08-15,CBE,Google Play,0.0,neutral,0.0,neutral,Theme 2
3,እንዲሁ ነው የሰተኋችሁ ለሞራል,5,2025-08-15,CBE,Google Play,0.0,neutral,0.0,neutral,Theme 2
4,nice,3,2025-08-14,CBE,Google Play,0.4215,positive,0.6,positive,Theme 2


In [62]:
# VADER Sentiment Distribution Count
vader_dist = df_reviews['vader_sentiment_label'].value_counts()

# TextBlob Sentiment Distribution Count
textblob_dist = df_reviews['textblob_sentiment_label'].value_counts()

# Print results for easy copy-paste
print("VADER Sentiment Distribution:")
print(vader_dist)
print("\nTextBlob Sentiment Distribution:")
print(textblob_dist)


VADER Sentiment Distribution:
vader_sentiment_label
positive    592
neutral     265
negative    152
Name: count, dtype: int64

TextBlob Sentiment Distribution:
textblob_sentiment_label
positive    579
neutral     300
negative    130
Name: count, dtype: int64


In [63]:
# VADER sentiment mean by rating
vader_rating_corr = df_reviews.groupby('rating')['vader_sentiment'].mean()

# TextBlob sentiment mean by rating
textblob_rating_corr = df_reviews.groupby('rating')['textblob_sentiment'].mean()

# Print the results for easy copy-paste
print("VADER Sentiment Mean by Rating:")
print(vader_rating_corr)
print("\nTextBlob Sentiment Mean by Rating:")
print(textblob_rating_corr)


VADER Sentiment Mean by Rating:
rating
1   -0.152215
2    0.004819
3    0.239953
4    0.318166
5    0.447753
Name: vader_sentiment, dtype: float64

TextBlob Sentiment Mean by Rating:
rating
1   -0.120020
2    0.059876
3    0.177190
4    0.320118
5    0.411831
Name: textblob_sentiment, dtype: float64
