In [35]:
# Import necessary libraries
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


In [37]:
# Load the cleaned reviews data (assuming this is the cleaned file)
df_cbe_cleaned = pd.read_csv('../data/cleaned_reviews_CBE.csv')

# Display the first few rows to confirm the data
df_cbe_cleaned.head()


Unnamed: 0,review,rating,date,bank,source
0,bayeegar,5,2025-08-15,CBE,Google Play
1,አሪፋ ነዉ ነገር ግን Recent Transaction በጣም የቆየዉን ነዉ ...,2,2025-08-15,CBE,Google Play
2,congra ethio,5,2025-08-15,CBE,Google Play
3,እንዲሁ ነው የሰተኋችሁ ለሞራል,5,2025-08-15,CBE,Google Play
4,nice,3,2025-08-14,CBE,Google Play


In [38]:
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Apply VADER sentiment analysis to the review column
df_cbe_cleaned['vader_sentiment'] = df_cbe_cleaned['review'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Classify sentiment into positive, negative, and neutral
df_cbe_cleaned['vader_sentiment_label'] = df_cbe_cleaned['vader_sentiment'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'neutral'))

# Display the sentiment results
df_cbe_cleaned[['review', 'vader_sentiment', 'vader_sentiment_label']].head()


Unnamed: 0,review,vader_sentiment,vader_sentiment_label
0,bayeegar,0.0,neutral
1,አሪፋ ነዉ ነገር ግን Recent Transaction በጣም የቆየዉን ነዉ ...,0.3182,positive
2,congra ethio,0.0,neutral
3,እንዲሁ ነው የሰተኋችሁ ለሞራል,0.0,neutral
4,nice,0.4215,positive


In [39]:
# Apply TextBlob sentiment analysis to the review column
df_cbe_cleaned['textblob_sentiment'] = df_cbe_cleaned['review'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Classify sentiment into positive, negative, and neutral
df_cbe_cleaned['textblob_sentiment_label'] = df_cbe_cleaned['textblob_sentiment'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'neutral'))

# Display the sentiment results
df_cbe_cleaned[['review', 'textblob_sentiment', 'textblob_sentiment_label']].head()


Unnamed: 0,review,textblob_sentiment,textblob_sentiment_label
0,bayeegar,0.0,neutral
1,አሪፋ ነዉ ነገር ግን Recent Transaction በጣም የቆየዉን ነዉ ...,0.0,neutral
2,congra ethio,0.0,neutral
3,እንዲሁ ነው የሰተኋችሁ ለሞራል,0.0,neutral
4,nice,0.6,positive


In [40]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=50)

# Apply TF-IDF to the reviews
X = vectorizer.fit_transform(df_cbe_cleaned['review'])

# Convert the results into a DataFrame for easier inspection
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Display the top 10 keywords for theme extraction
tfidf_df.sum().sort_values(ascending=False).head(10)


app         45.604401
good        32.568124
best        20.856723
bank        13.638972
nice        13.176903
cbe         12.637678
use         11.893904
like         9.982036
easy         9.252986
good app     8.788228
dtype: float64

In [41]:
# Initialize KMeans clustering with 3 clusters (themes)
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Fit the model to the TF-IDF features
kmeans.fit(X)

# Get the cluster centers (which represent the theme keywords)
centroids = kmeans.cluster_centers_

# Get the top keywords for each cluster (theme)
terms = vectorizer.get_feature_names_out()
themes = {}
for i in range(num_clusters):
    theme_keywords = [terms[ind] for ind in centroids[i].argsort()[-10:]]
    themes[f'Theme {i+1}'] = theme_keywords

# Display the themes and their top keywords
themes


{'Theme 1': ['add',
  'services',
  'cbe',
  'bank',
  'service',
  'time',
  'make',
  'app',
  'good app',
  'good'],
 'Theme 2': ['mobile',
  'banking',
  'application',
  'easy',
  'like',
  'use',
  'cbe',
  'bank',
  'best',
  'app'],
 'Theme 3': ['simple',
  'ነው',
  'በጣም',
  'use',
  'bank',
  'fast',
  'apps',
  'app',
  'nice app',
  'nice']}

In [42]:
# Initialize KMeans clustering with 3 clusters (themes)
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Fit the model to the TF-IDF features
kmeans.fit(X)

# Get the cluster centers (which represent the theme keywords)
centroids = kmeans.cluster_centers_

# Get the top keywords for each cluster (theme)
terms = vectorizer.get_feature_names_out()
themes = {}
for i in range(num_clusters):
    theme_keywords = [terms[ind] for ind in centroids[i].argsort()[-10:]]
    themes[f'Theme {i+1}'] = theme_keywords

# Display the themes and their top keywords
themes


{'Theme 1': ['add',
  'services',
  'cbe',
  'bank',
  'service',
  'time',
  'make',
  'app',
  'good app',
  'good'],
 'Theme 2': ['mobile',
  'banking',
  'application',
  'easy',
  'like',
  'use',
  'cbe',
  'bank',
  'best',
  'app'],
 'Theme 3': ['simple',
  'ነው',
  'በጣም',
  'use',
  'bank',
  'fast',
  'apps',
  'app',
  'nice app',
  'nice']}

In [43]:
# Function to assign theme based on similarity to cluster centroids
def assign_theme(review, kmeans, vectorizer):
    # Transform the review to TF-IDF features
    review_tfidf = vectorizer.transform([review])
    
    # Predict the cluster for the review
    cluster = kmeans.predict(review_tfidf)[0]
    
    return f'Theme {cluster+1}'

# Assign themes to each review
df_cbe_cleaned['theme'] = df_cbe_cleaned['review'].apply(lambda x: assign_theme(x, kmeans, vectorizer))

# Display the first few rows to confirm themes
df_cbe_cleaned[['review', 'vader_sentiment_label', 'textblob_sentiment_label', 'theme']].head()


Unnamed: 0,review,vader_sentiment_label,textblob_sentiment_label,theme
0,bayeegar,neutral,neutral,Theme 2
1,አሪፋ ነዉ ነገር ግን Recent Transaction በጣም የቆየዉን ነዉ ...,positive,neutral,Theme 2
2,congra ethio,neutral,neutral,Theme 2
3,እንዲሁ ነው የሰተኋችሁ ለሞራል,neutral,neutral,Theme 2
4,nice,positive,positive,Theme 3


In [46]:
# Save the updated dataframe to a new CSV file
df_cbe_cleaned.to_csv('../data/processed_reviews_with_sentiment_themes.csv', index=False)

# Confirmation message
print("Data with sentiment and themes saved to 'processed_reviews_with_sentiment_themes.csv'")


Data with sentiment and themes saved to 'processed_reviews_with_sentiment_themes.csv'
