**Install packages & import libraries**

In [13]:
# Install required packages
!pip install google-generativeai newsapi-python pandas requests beautifulsoup4

# Imports
import pandas as pd
import requests
from bs4 import BeautifulSoup
import google.generativeai as genai
from newsapi import NewsApiClient
from google.colab import userdata
import random
import time



**Load API keys**

In [14]:
# Load API Keys from Colab Secrets
news_api_key = userdata.get("NEWS_API_KEY")
gemini_api_key = userdata.get("GEMINI_API_KEY")

if not news_api_key:
    raise ValueError("NEWS_API_KEY not found in Colab Secrets.")
if not gemini_api_key:
    raise ValueError("GEMINI_API_KEY not found in Colab Secrets.")

# Configure Gemini client
genai.configure(api_key=gemini_api_key)
print("API keys loaded successfully")

API keys loaded successfully


**Fetch RSS feed articles**

In [15]:
def fetch_rss_feed(url, limit=50):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "xml")
        items = soup.find_all("item")[:limit]
        articles = []
        for item in items:
            title = item.title.text if item.title else None
            pubdate = item.pubDate.text if item.pubDate else None
            if title and pubdate:
                # Filter only AI/ML related
                if any(k in title.lower() for k in ["artificial intelligence", "ai", "machine learning"]):
                    articles.append({"source": "Google News RSS", "text": title, "publishedAt": pubdate})
        print(f"Fetched {len(articles)} articles from RSS")
        return articles
    except Exception as e:
        print("RSS error:", e)
        return []

rss_url = "https://news.google.com/rss/search?q=Artificial+Intelligence+Machine+Learning&hl=en-US&gl=US&ceid=US:en"
rss_data = fetch_rss_feed(rss_url, limit=50)

Fetched 48 articles from RSS


**Fetch NewsAPI articles (AI-related)**

In [16]:
def fetch_newsapi_articles(api_key, query="Artificial Intelligence OR Machine Learning", page_size=50):
    client = NewsApiClient(api_key=api_key)
    try:
        response = client.get_everything(
            q=query,
            language="en",
            sort_by="publishedAt",
            page_size=page_size
        )
        articles = response.get("articles", [])
        # Filter out articles without title or publishedAt
        articles = [a for a in articles if a.get("title") and a.get("publishedAt")]
        formatted = [{"source": a["source"]["name"], "text": a["title"], "publishedAt": a["publishedAt"]} for a in articles]
        print(f"Fetched {len(formatted)} articles from NewsAPI")
        return formatted
    except Exception as e:
        print("NewsAPI error:", e)
        return []

newsapi_data = fetch_newsapi_articles(news_api_key, page_size=50)

Fetched 49 articles from NewsAPI


**Combine sources & filter AI-related articles**

In [17]:
# Combine all sources
combined_data = rss_data + newsapi_data
df_combined = pd.DataFrame(combined_data)

# Filter only AI-related articles
ai_keywords = [
    "artificial intelligence", "ai", "machine learning", "deep learning",
    "neural network", "nlp", "computer vision", "ml", "gpt", "llm"
]

def is_ai_related(text):
    text_lower = str(text).lower()
    return any(keyword in text_lower for keyword in ai_keywords)

df_combined = df_combined[df_combined["text"].apply(is_ai_related)]
df_combined.reset_index(drop=True, inplace=True)
print(f"Filtered dataset size (AI-only): {len(df_combined)}")


Filtered dataset size (AI-only): 77


**Gemini LLM sentiment analysis (small subset)**

In [18]:
def analyze_sentiment_gemini(text):
    try:
        model = genai.GenerativeModel("gemini-2.5-flash-lite")  # lightweight Gemini model
        prompt = f"Analyze the sentiment in one word (Positive, Negative, Neutral): {text}"
        response = model.generate_content(prompt)
        sentiment = response.text.strip()
        if sentiment not in ["Positive", "Negative", "Neutral"]:
            return "Unknown"
        return sentiment
    except Exception as e:
        print("Gemini error:", e)
        return "Error"

# Apply on first 10 rows only to save API usage
df_combined.loc[:9, "llm_sentiment"] = df_combined.loc[:9, "text"].apply(analyze_sentiment_gemini)
print("Gemini sentiment added on small subset")


Gemini sentiment added on small subset


**Save final CSV**

In [19]:
# Ensure publishedAt is present; if missing, fill with "Unknown"
df_combined["publishedAt"].fillna("Unknown", inplace=True)

# Save to CSV
df_combined.to_csv("multi_source_ai_sentiment.csv", index=False)
print("CSV saved as 'multi_source_ai_sentiment.csv'")


CSV saved as 'multi_source_ai_sentiment.csv'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_combined["publishedAt"].fillna("Unknown", inplace=True)
