In [1]:
# Imports
import os
import subprocess
import sys

import pandas as pd
import spacy
from textblob import TextBlob

# Read the cleaned-articles CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_articles.csv")

# Sentiment analysis function using TextBlob
def get_sentiment(text):
    """Return the TextBlob polarity score for ``text``.

    Args:
        text: The value to score. Anything that is not a string, or a string
            that is empty or whitespace-only, is treated as having no content.

    Returns:
        ``TextBlob(text).sentiment.polarity`` for a non-blank string,
        otherwise ``0.0``.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    if has_content:
        return TextBlob(text).sentiment.polarity
    # Non-string or blank input: fall back to a zero score without calling TextBlob.
    return 0.0

# Load spaCy model, downloading it first if it is not installed
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    # Catching only that keeps KeyboardInterrupt and unrelated failures visible
    # instead of silently triggering a download.
    # sys.executable pins the download to the interpreter running this kernel;
    # a bare "python" is resolved via PATH and may be a different environment.
    # check=True makes a failed download raise CalledProcessError here rather
    # than surfacing later as a second, less informative load error.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", SPACY_MODEL],
        check=True,
    )
    nlp = spacy.load(SPACY_MODEL)

# Entity extraction function using spaCy
def extract_entities(text):
    """Collect the named entities that the module-level ``nlp`` pipeline finds in ``text``.

    Args:
        text: The value to process. Anything that is not a string, or a string
            that is empty or whitespace-only, is treated as having no content.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry in the
        parsed document's ``ents``; an empty list when there is no content.
    """
    if isinstance(text, str) and text.strip():
        return [(span.text, span.label_) for span in nlp(text).ents]
    # Non-string or blank input: nothing to parse, so skip the pipeline call.
    return []

# Score sentiment and extract entities for every article.
# Missing text is replaced with "" once, and both columns are derived from that series.
article_text = df["cleaned_text"].fillna("")
df["sentiment_score"] = article_text.apply(get_sentiment)
df["entities"] = article_text.apply(extract_entities)

# Write the enriched table to disk, creating the output directory if needed
os.makedirs("../data/processed", exist_ok=True)
df.to_csv("../data/processed/analyzed_articles.csv", index=False)

# Show the title alongside the two new columns (last expression is the cell output)
df.loc[:, ["title", "sentiment_score", "entities"]]


Unnamed: 0,title,sentiment_score,entities
0,Kenya inflation update,0.068182,"[(kenya, GPE)]"
1,UK economy,0.166667,"[(uk, GPE)]"
2,Empty article,0.0,[]
