📊 Step 1: Preprocess Text and Extract TF-IDF Keywords for Thematic Analysis

In [7]:
import os
import re

import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load NLP model
nlp = spacy.load("en_core_web_sm")

# Location of the sentiment-scored reviews. The default is the original
# machine-specific path; set the CBE_SENTIMENT_CSV environment variable to
# point at the file on any other machine instead of editing this cell.
SENTIMENT_CSV_PATH = os.environ.get(
    "CBE_SENTIMENT_CSV",
    r"C:\10 Kifia Tasks\Week-2\Customer-Experience-Analytics-for-Fintech-Apps\notebooks\Sentiment_Analysis\CBE_sentiment_analysis.csv",
)

# Load sentiment dataset
df = pd.read_csv(SENTIMENT_CSV_PATH)

# Preprocess text (lemmatization & stopword removal).
# Missing reviews become empty strings first: str(NaN) would otherwise yield
# the literal word "nan", which would be lemmatized and counted as a real term.
review_texts = df["review"].fillna("").astype(str)

# nlp.pipe processes the texts in batches, which is considerably faster than
# invoking nlp() once per row through Series.apply; it yields one Doc per input
# text in input order, so the resulting list lines up with df's rows.
df["clean_review"] = [
    " ".join(token.lemma_ for token in doc if not token.is_stop)
    for doc in nlp.pipe(review_texts)
]

# Extract top keywords using TF-IDF (vocabulary capped at 50 terms)
vectorizer = TfidfVectorizer(max_features=50)
tfidf_matrix = vectorizer.fit_transform(df["clean_review"])
keywords = vectorizer.get_feature_names_out()

print("🔹 Top extracted keywords:", keywords)


🔹 Top extracted keywords: ['access' 'account' 'add' 'allow' 'amazing' 'app' 'application' 'bad'
 'bank' 'banking' 'cbe' 'crash' 'developer' 'easy' 'ethiopia' 'excellent'
 'fast' 'fix' 'good' 'great' 'issue' 'like' 'make' 'mobile' 'money' 'need'
 'network' 'nice' 'option' 'problem' 'reliable' 'screenshot' 'send'
 'service' 'simple' 'take' 'thank' 'time' 'transaction' 'transfer'
 'update' 'use' 'user' 'well' 'work' 'wow' 'በጣም' 'ነው' 'ግን' 'ጥሩ']


Step 2: Manual/Rule-Based Thematic Clustering

In [8]:
# Define rule-based keyword groups
themes = {
    "Account Access Issues": ["login", "password", "authentication", "error", "access"],
    "Transaction Performance": ["transfer", "delay", "slow", "processing", "speed"],
    "User Interface & Experience": ["UI", "design", "navigation", "intuitive", "easy"],
    "Customer Support": ["support", "help", "response", "service"],
    # "new feature" is listed next to "new features" because the reviews are
    # lemmatized before matching, which reduces the plural to the singular.
    "Feature Requests": ["fingerprint", "notification", "new features", "new feature", "budgeting"]
}


def _compile_theme_pattern(keywords):
    """Build one case-insensitive regex that matches any keyword at the start of a word."""
    alternatives = "|".join(re.escape(keyword) for keyword in keywords)
    # The leading \b stops mid-word hits ("error" inside "terror", "ui" inside
    # "build"); leaving the end open keeps derived forms matching ("slowly",
    # "helpful"), as the original substring test did.
    return re.compile(rf"\b(?:{alternatives})", re.IGNORECASE)


# Compiled once per cell run rather than once per review. Themes with an empty
# keyword list are skipped: an empty alternation would match every review.
# Re-run this cell after editing `themes` so the patterns stay in sync.
_theme_patterns = {
    theme: _compile_theme_pattern(keywords)
    for theme, keywords in themes.items()
    if keywords
}


# Assign themes to reviews
def categorize_theme(review):
    """Return the list of theme names whose keywords occur in `review`.

    Matching is case-insensitive and anchored at the start of a word, so the
    keyword "UI" matches "ui" but not "build", and "slow" matches "slowly".

    Args:
        review: The (cleaned) review text. Non-string values, such as the NaN
            pandas produces for an empty cell, are treated as having no match.

    Returns:
        A list of matching theme names in the order they are declared in
        `themes`, or ["Other"] when no theme matches.
    """
    if not isinstance(review, str):
        return ["Other"]
    matched_themes = [theme for theme, pattern in _theme_patterns.items() if pattern.search(review)]
    return matched_themes if matched_themes else ["Other"]

# Label each cleaned review with its list of matching themes, then write the
# annotated frame (without the index column) to the working directory.
df["identified_theme"] = df["clean_review"].map(categorize_theme)
df.to_csv(path_or_buf="CBE_thematic_analysis.csv", index=False)

print("✅ Thematic clustering completed! Results saved in CBE_thematic_analysis.csv.")


✅ Thematic clustering completed! Results saved in CBE_thematic_analysis.csv.
