In [1]:
import os
from datetime import datetime, timedelta

import pandas as pd
import requests
from textblob import TextBlob

# Function to fetch news data from NewsAPI for a specific date
def fetch_news_for_date(api_key, query, date, timeout=30):
    """Fetch one page of NewsAPI "everything" results for a single day.

    Args:
        api_key: NewsAPI key sent as the ``apiKey`` query parameter.
        query: Search phrase sent as ``q``.
        date: Day to query; used for both ``from`` and ``to`` so the
            request is restricted to that single date value.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so one stalled connection cannot hang a multi-day loop.

    Returns:
        The decoded JSON body of the response. Only the first page of at
        most 100 results is requested (``pageSize=100``, ``page=1``).
    """
    # Let requests build the query string: it percent-encodes each value, so
    # a query containing spaces, '&', '#' or '=' cannot corrupt the URL.
    params = {
        'q': query,
        'from': date,
        'to': date,
        'sortBy': 'publishedAt',
        'apiKey': api_key,
        'pageSize': 100,
        'page': 1,
    }
    response = requests.get(
        'https://newsapi.org/v2/everything',
        params=params,
        timeout=timeout,
    )
    return response.json()

# Preprocessing function
def preprocess_news(articles):
    """Turn a list of article records into a cleaned DataFrame.

    Args:
        articles: Iterable of dict-like article records. May be empty, and
            individual records may lack some of the expected keys.

    Returns:
        A DataFrame with the columns ``publishedAt``, ``title``,
        ``description``, ``content`` and ``url``; ``publishedAt`` is
        converted with ``pd.to_datetime``, rows with a repeated ``url`` are
        dropped (first occurrence kept), and rows missing a title,
        description or content are dropped. The result is empty when no
        usable article remains.
    """
    columns = ['publishedAt', 'title', 'description', 'content', 'url']

    # reindex (rather than plain column selection) guarantees every expected
    # column exists, so an empty article list or records lacking a key yield
    # an empty/NaN-filled frame instead of raising KeyError.
    news_df = pd.DataFrame(articles).reindex(columns=columns)

    # Convert 'publishedAt' to datetime
    news_df['publishedAt'] = pd.to_datetime(news_df['publishedAt'])

    # Remove duplicates
    news_df = news_df.drop_duplicates(subset='url')

    # Handle missing values (for simplicity, we'll drop them)
    news_df = news_df.dropna(subset=['title', 'description', 'content'])

    return news_df

# Function to perform sentiment analysis
def analyze_sentiment(text):
    """Return the polarity score that TextBlob computes for ``text``."""
    return TextBlob(text).sentiment.polarity

# Function to fetch and aggregate sentiment scores for a date range
def fetch_and_aggregate_sentiment(api_key, query, start_date, end_date):
    """Build a per-day table of mean sentiment scores for a date range.

    For every day from ``start_date`` through ``end_date`` (inclusive) the
    news for ``query`` is fetched, cleaned, and scored on title, description
    and content; the daily means are collected into one row per day.

    Days whose response has no ``'articles'`` key (a message is printed) or
    whose cleaned article table is empty get ``None`` for all three means.

    Returns:
        A DataFrame with columns ``date`` (``YYYY-MM-DD`` string),
        ``mean_title_sentiment``, ``mean_description_sentiment`` and
        ``mean_content_sentiment``.
    """
    # (source text column, per-article score column, daily mean key)
    score_columns = (
        ('title', 'title_sentiment', 'mean_title_sentiment'),
        ('description', 'description_sentiment', 'mean_description_sentiment'),
        ('content', 'content_sentiment', 'mean_content_sentiment'),
    )

    daily_rows = []
    day = start_date

    while day <= end_date:
        day_label = day.strftime('%Y-%m-%d')

        # Start from the "nothing to report" row and fill it in only when
        # the day actually produced scorable articles.
        row = {'date': day_label}
        for _, _, mean_key in score_columns:
            row[mean_key] = None

        payload = fetch_news_for_date(api_key, query, day_label)

        if 'articles' not in payload:
            print(f"No articles found for {day_label} or API error.")
        else:
            frame = preprocess_news(payload['articles'])

            if not frame.empty:
                # Score every text column first, then take the daily means.
                for text_col, score_col, _ in score_columns:
                    frame[score_col] = frame[text_col].apply(analyze_sentiment)
                for _, score_col, mean_key in score_columns:
                    row[mean_key] = frame[score_col].mean()

        daily_rows.append(row)
        day = day + timedelta(days=1)

    return pd.DataFrame(daily_rows)

# Fetch news data and aggregate sentiment scores for the specified date range
# SECURITY: an API key is committed in this file. Supply the key through the
# NEWSAPI_KEY environment variable instead; the literal below remains only as
# a fallback so existing runs keep working, and should be revoked and removed.
api_key = os.environ.get('NEWSAPI_KEY', 'd20bdbb6e7dc4cc098d1437f594d4721')
query = 'Apple Inc'

# Inclusive date range; one request is made per day in the range.
start_date = datetime.strptime('2024-06-30', '%Y-%m-%d')
end_date = datetime.strptime('2024-07-29', '%Y-%m-%d')

aggregated_sentiments_df = fetch_and_aggregate_sentiment(api_key, query, start_date, end_date)

# Display the aggregated sentiment DataFrame
aggregated_sentiments_df


Unnamed: 0,date,mean_title_sentiment,mean_description_sentiment,mean_content_sentiment
0,2024-06-30,0.060905,0.16708,0.203335
1,2024-07-01,0.050551,0.108498,0.128657
2,2024-07-02,0.101975,0.09985,0.117265
3,2024-07-03,0.065679,0.104859,0.126425
4,2024-07-04,0.025688,0.034645,0.080153
5,2024-07-05,0.04709,0.140437,0.151767
6,2024-07-06,0.018223,0.166462,0.20358
7,2024-07-07,-0.028261,0.23179,0.160399
8,2024-07-08,0.010167,0.088494,0.084628
9,2024-07-09,0.019167,0.088471,0.110812
