In [1]:
import re
from pathlib import Path

import pandas as pd
import spacy
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the sentiment-analysis pipeline once so every review reuses the same loaded model.
SENTIMENT_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_pipeline = pipeline("sentiment-analysis", model=SENTIMENT_MODEL_NAME)

Device set to use cpu


In [None]:
%pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl


In [None]:
# Load the spaCy English model and the raw reviews table.
# spacy.load raises OSError (E050) when the model package is not installed
# in the kernel's environment.
SPACY_MODEL_NAME = "en_core_web_sm"
REVIEWS_CSV_PATH = "../../data/ethiopian_bank_reviews.csv"

nlp = spacy.load(SPACY_MODEL_NAME)
df = pd.read_csv(REVIEWS_CSV_PATH)
def preprocess_text(text):
    """Normalise a raw review string for the downstream NLP steps.

    URLs (``http...`` / ``www...``) and ``@mentions`` are removed first, then
    every run of whitespace is collapsed to a single space, and finally the
    text is stripped and lower-cased.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The cleaned, lower-cased string; may be empty.
    """
    # Remove links/mentions BEFORE collapsing whitespace: doing it afterwards
    # leaves a double space where the removed token used to be.
    # IGNORECASE is required because lower-casing happens last, so an
    # upper-case "HTTP://..." or "WWW...." would otherwise survive.
    text = re.sub(r"http\S+|www\S+|@\S+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\s+", " ", text)
    return text.strip().lower()

# Replace missing reviews with "" before casting: astype(str) alone turns NaN
# into the literal text "nan", which would then be scored and tokenised as if
# it were a real review.
df['clean_text'] = df['review'].fillna("").astype(str).apply(preprocess_text)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
def get_sentiment(row):
    """Classify one review row with the module-level ``sentiment_pipeline``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing a
            ``'clean_text'`` entry.

    Returns:
        tuple: ``(label, score)`` where ``label`` is ``'negative'``,
        ``'positive'`` or ``'neutral'``. ``score`` is the score reported by
        the pipeline, or ``0.5`` for the neutral fallback.
    """
    text = row['clean_text']
    # Nothing to classify (missing value, or a review that was only a URL or
    # whitespace): skip the model call and use the neutral fallback.
    if not isinstance(text, str) or not text.strip():
        return 'neutral', 0.5

    # truncation=True caps the input at the model's maximum sequence length;
    # without it an over-long review makes the forward pass fail.
    result = sentiment_pipeline(text, truncation=True)[0]

    label_map = {'NEGATIVE': 'negative', 'POSITIVE': 'positive'}
    label = label_map.get(result['label'])
    if label is None:
        # Unrecognised label from the pipeline: neutral with a mid-point score.
        return 'neutral', 0.5
    return label, result['score']

In [None]:
# Register progress_apply on pandas objects so long-running applies show a progress bar.
tqdm.pandas()
# Score every review; get_sentiment returns (label, score), which pd.Series
# expands into the two target columns.
df[['sentiment_label', 'sentiment_score']] = df.progress_apply(lambda row: pd.Series(get_sentiment(row)), axis=1)

# Save intermediate output
# DataFrame.to_csv does not create missing folders, so make sure outputs/
# exists before writing.
Path("outputs").mkdir(parents=True, exist_ok=True)
df.to_csv("outputs/sentiment_output.csv", index=False)

# TF-IDF for Keyword Extraction
# Unigrams and bigrams, English stop words removed, vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    ngram_range=(1, 2),
)
X = vectorizer.fit_transform(df['clean_text'])

# Dense table: one row per review, one column per vocabulary term.
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = pd.DataFrame(X.toarray(), columns=feature_names)

# Rank terms by their mean TF-IDF weight across all reviews and keep the 50 highest.
mean_tfidf = tfidf_scores.mean()
top_keywords = mean_tfidf.sort_values(ascending=False).head(50)
print("Top Keywords:\n", top_keywords)

In [None]:
# spaCy-based keyword extraction
def extract_keywords_spacy(text):
    """Return the lemmas of the content-bearing tokens in ``text``.

    The text is processed with the module-level ``nlp`` pipeline. A token
    contributes its lemma only if it is not a stop word, is alphabetic, and
    has a length greater than 3.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        # Skip short tokens (length of the token itself, not of its lemma).
        if len(tok) <= 3:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Run keyword extraction on every cleaned review and store the resulting list
# of lemmas per row. progress_apply is the tqdm-instrumented variant of
# Series.apply; it is only available once tqdm.pandas() has been called.
df['keywords'] = df['clean_text'].progress_apply(extract_keywords_spacy)

# Manual Thematic Mapping
# Theme name -> trigger keywords. Insertion order matters: it is the order in
# which matching themes are reported for a review.
# NOTE(review): extract_keywords_spacy drops tokens of three characters or
# fewer, so a trigger as short as 'ui' looks unreachable - confirm.
theme_keywords = {
    "Account Access Issues": [
        "login", "account", "password", "access", "block",
    ],
    "Transaction Performance": [
        "transfer", "delay", "send", "deposit", "payment",
    ],
    "User Interface & Experience": [
        "interface", "design", "layout", "easy", "friendly", "ui",
    ],
    "Customer Support": [
        "support", "help", "call", "response", "wait",
    ],
    "Feature Requests": [
        "add", "feature", "option", "update", "notification",
    ],
}

In [None]:
def assign_themes(keywords, theme_map=None):
    """Map a review's keywords to the themes whose trigger words they contain.

    Args:
        keywords: Collection of keyword strings for one review. ``None`` is
            treated as "no keywords". One-shot iterables (generators,
            iterators) are materialised first so every theme is checked
            against the full set of keywords.
        theme_map: Optional ``{theme name: [trigger keywords]}`` mapping.
            Defaults to the module-level ``theme_keywords``.

    Returns:
        list: Names of all matching themes, in ``theme_map`` order, or
        ``['Other']`` when nothing matches.
    """
    if theme_map is None:
        theme_map = theme_keywords
    if keywords is None:
        return ['Other']
    # A generator would be exhausted by the first membership test, silently
    # hiding matches for later themes. Built-in containers are left as they are.
    if not isinstance(keywords, (str, list, tuple, set, frozenset)):
        keywords = list(keywords)

    themes = [
        theme
        for theme, triggers in theme_map.items()
        if any(kw in keywords for kw in triggers)
    ]
    return themes or ['Other']

# Tag every review with the themes matched by its extracted keywords.
df['identified_themes'] = df['keywords'].apply(assign_themes)

# Export Final Output
# DataFrame.to_csv does not create missing folders, so make sure outputs/
# exists before writing.
Path("outputs").mkdir(parents=True, exist_ok=True)
df[['review', 'bank', 'rating', 'sentiment_label', 'sentiment_score', 'identified_themes']].to_csv(
    "outputs/task2_final_analysis.csv", index=False)

print("✅ Sentiment and thematic analysis completed and saved.")