In [47]:
import concurrent.futures
import os
from datetime import datetime, timedelta
from math import sqrt

import matplotlib.pyplot as plt
import mplfinance as mpf
import nltk
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import yfinance as yf
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import MinMaxScaler

# VADER needs its lexicon on disk before SentimentIntensityAnalyzer() is built.
nltk.download('vader_lexicon')


# API credentials: prefer values supplied through the environment so keys do
# not have to live in the notebook.
# SECURITY NOTE(review): the fallback literals below are credentials committed
# to source. Rotate them and drop the fallbacks once GUARDIAN_API_KEY and
# NEWS_API_KEY are provided by the environment.
API_KEY_GUARDIAN = os.environ.get("GUARDIAN_API_KEY", "b001c2f9-9358-4483-a3bc-da632b995c95")
NEWS_API_KEY = os.environ.get("NEWS_API_KEY", "0bbb8dd5a0754e0fb14d9364e924d9e0")
SHELL_TICKER = 'SHEL'
BP_TICKER = 'BP'

# Initialize VADER sentiment analyzer (shared by analyze_sentiment_vader)
analyzer = SentimentIntensityAnalyzer()


def fetch_data_guardian(url, timeout=30):
    """Fetch ``url`` and return its decoded JSON body, or ``None`` on failure.

    Args:
        url: Fully-formed request URL (query string included).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling worker
            thread indefinitely.

    Returns:
        The parsed JSON payload, or ``None`` if the request failed, returned
        an HTTP error status, or the body was not valid JSON. Failures are
        reported with ``print`` rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    # ValueError covers a non-JSON body on requests versions whose JSON
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching data from {url}: {e}")
        return None

def fetch_article_content_guardian(api_url, timeout=30):
    """Fetch one article's body and trail text from its Guardian API URL.

    Args:
        api_url: The article's ``apiUrl`` as listed in a search result.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(content, description)`` tuple. Each element falls back to a
        placeholder string when the corresponding field is absent from the
        response. ``(None, None)`` is returned when the request fails or the
        payload does not have the expected structure; the problem is reported
        with ``print``.
    """
    try:
        # Let requests build the query string so it is encoded correctly and
        # merged properly if api_url ever carries its own query parameters.
        response = requests.get(
            api_url,
            params={"api-key": API_KEY_GUARDIAN, "show-fields": "body,trailText"},
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()
        fields = data['response']['content']['fields']
        content = fields.get('body', 'No content available')
        description = fields.get('trailText', 'No description available')
        return content, description
    # ValueError: body was not valid JSON on requests versions whose decode
    # error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching article content from {api_url}: {e}")
        return None, None
    # TypeError/AttributeError: payload decoded, but a level is not a mapping.
    except (KeyError, TypeError, AttributeError) as e:
        print(f"Error parsing article content: {e}")
        return None, None

def extract_info_guardian(json_data):
    """Flatten a Guardian search response into a list of article records.

    Args:
        json_data: Decoded search payload; articles are read from
            ``json_data['response']['results']``. A falsy value (``None``,
            ``{}``) yields an empty list.

    Returns:
        One dict per article with the keys ``title``, ``sectionname``,
        ``publisheddate`` and ``api_url``. If any expected key is missing
        anywhere in the payload, the error is printed and an empty list is
        returned for the whole payload.
    """
    if not json_data:
        return []

    records = []
    try:
        for item in json_data['response']['results']:
            record = {
                'title': item['webTitle'],
                'sectionname': item['sectionName'],
                'publisheddate': item['webPublicationDate'],
                'api_url': item['apiUrl'],
            }
            records.append(record)
    except KeyError as e:
        print(f"Error parsing data: {e}")
        return []
    return records


def analyze_sentiment_vader(text):
    """Score ``text`` with the module-level VADER ``analyzer``.

    Returns the ``'compound'`` entry of the dict produced by
    ``analyzer.polarity_scores(text)``.
    """
    return analyzer.polarity_scores(text)['compound']


def fetch_guardian_news(from_date="2023-01-01", pages=16,
                        output_csv='guardian_oil_articles_vader_sentiment.csv'):
    """Build per-day mean VADER sentiment from Guardian ``business/oil`` articles.

    Requests ``pages`` result pages from the Guardian content API, fetches
    each listed article's body and trail text, scores title, description and
    content with ``analyze_sentiment_vader``, averages the scores per
    publication date, writes the result to ``output_csv`` and returns it.

    Args:
        from_date: ``from-date`` filter sent to the API (``YYYY-MM-DD``).
        pages: Number of result pages to request, starting at page 1.
        output_csv: Path the per-day sentiment table is written to.

    Returns:
        A DataFrame with columns ``date``, ``title_sentiment``,
        ``description_sentiment`` and ``content_sentiment`` (one row per
        publication date), or an empty DataFrame when no article listing
        could be retrieved. A description/content mean is NaN for a date on
        which none of that day's articles had the field available.
    """
    no_content = 'No content available'
    no_description = 'No description available'

    # The base URL does not depend on the page number, so build it once.
    base_url = ("https://content.guardianapis.com/business/oil?from-date=" + from_date
                + "&api-key=" + API_KEY_GUARDIAN + "&type=article&page=")
    urls = [base_url + str(page) for page in range(1, pages + 1)]

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        json_data_list = list(executor.map(fetch_data_guardian, urls))

    info = []
    for json_data in json_data_list:
        info.extend(extract_info_guardian(json_data))

    if not info:
        print("No data found")
        return pd.DataFrame()

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        content_and_descriptions = list(
            executor.map(fetch_article_content_guardian,
                         [article['api_url'] for article in info])
        )

    for article, (content, description) in zip(info, content_and_descriptions):
        # Fall back per field: a missing trail text must not throw away a
        # body that was fetched successfully, and vice versa.
        article['content'] = content if content else no_content
        article['description'] = description if description else no_description
        article.pop('api_url')  # Remove api_url from the final data

    df = pd.DataFrame(info)

    def score_unless(placeholder):
        """Return a scorer that yields NaN for ``placeholder`` text."""
        def score(text):
            # Placeholder strings are not article text; scoring them would
            # feed a made-up value into the daily mean. NaN is skipped by
            # the groupby mean below.
            if text == placeholder:
                return float('nan')
            return analyze_sentiment_vader(text)
        return score

    # Perform sentiment analysis on title, description, and content
    df['title_sentiment'] = df['title'].map(analyze_sentiment_vader)
    df['description_sentiment'] = df['description'].map(score_unless(no_description))
    df['content_sentiment'] = df['content'].map(score_unless(no_content))

    # Convert publisheddate to datetime format and then to date
    df['publisheddate'] = pd.to_datetime(df['publisheddate']).dt.date

    # Select only numeric columns for grouping
    sentiment_columns = ['title_sentiment', 'description_sentiment', 'content_sentiment']

    # Group by date and calculate the mean of sentiment scores
    sentiment_df = df.groupby('publisheddate')[sentiment_columns].mean().reset_index()

    # Rename 'publisheddate' to 'date'
    sentiment_df.rename(columns={'publisheddate': 'date'}, inplace=True)

    # Save the sentiment analysis DataFrame to a CSV file
    sentiment_df.to_csv(output_csv, index=False)

    return sentiment_df


def fetch_news_for_date(api_key, query, date, timeout=30):
    """Query NewsAPI's ``everything`` endpoint for a single day.

    Args:
        api_key: NewsAPI key, sent as the ``apiKey`` parameter.
        query: Search expression, sent as ``q``.
        date: Day to search, used for both ``from`` and ``to``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload as returned by the server (including error
        payloads). If the request itself fails or the body is not JSON, the
        error is printed and an empty dict is returned, so a caller testing
        ``'articles' in result`` falls through to its error branch instead
        of the whole run aborting.
    """
    # Passing params lets requests URL-encode the query expression.
    params = {
        'q': query,
        'from': date,
        'to': date,
        'sortBy': 'publishedAt',
        'apiKey': api_key,
        'pageSize': 100,
        'page': 1,
    }
    try:
        response = requests.get("https://newsapi.org/v2/everything",
                                params=params, timeout=timeout)
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error fetching news for {date}: {e}")
        return {}

def preprocess_news(articles):
    """Turn a list of article dicts into a cleaned DataFrame.

    Args:
        articles: Iterable of article dicts carrying (at least) the keys
            ``publishedAt``, ``title``, ``description``, ``content``, ``url``.

    Returns:
        A DataFrame with exactly the columns ``publishedAt`` (parsed to
        datetime), ``title``, ``description``, ``content`` and ``url``, with
        duplicate URLs removed (first occurrence kept) and rows lacking a
        title, description or content dropped. An empty ``articles`` list
        yields an empty DataFrame with those columns instead of raising.
    """
    columns = ['publishedAt', 'title', 'description', 'content', 'url']

    # pd.DataFrame([]) has no columns at all, so indexing 'publishedAt'
    # would raise KeyError; short-circuit with a correctly-shaped empty frame.
    if not articles:
        return pd.DataFrame(columns=columns)

    # reindex keeps only the wanted columns and creates any that are absent
    # (filled with NaN), so rows from a sparse payload are dropped below
    # rather than crashing the column selection.
    news_df = pd.DataFrame(articles).reindex(columns=columns)
    news_df['publishedAt'] = pd.to_datetime(news_df['publishedAt'])
    news_df = news_df.drop_duplicates(subset='url')
    news_df = news_df.dropna(subset=['title', 'description', 'content'])
    return news_df

def fetch_and_aggregate_sentiment(api_key, query, start_date, end_date):
    """Compute mean VADER sentiment per day over an inclusive date range.

    For every day from ``start_date`` to ``end_date`` this fetches that
    day's articles with ``fetch_news_for_date``, cleans them with
    ``preprocess_news`` and averages ``analyze_sentiment_vader`` over the
    title, description and content of the remaining articles.

    Args:
        api_key: Key passed through to ``fetch_news_for_date``.
        query: Search expression passed through to ``fetch_news_for_date``.
        start_date: First day (a ``datetime``), inclusive.
        end_date: Last day (a ``datetime``), inclusive.

    Returns:
        A DataFrame with one row per day and the columns ``date``
        (``YYYY-MM-DD`` string), ``mean_title_sentiment``,
        ``mean_description_sentiment`` and ``mean_content_sentiment``.
        Days with no usable articles, or for which the response carried no
        ``articles`` entry, have NaN in the three mean columns.
    """
    text_fields = ('title', 'description', 'content')
    # NaN rather than None: the mean columns stay float64 even if every day
    # is missing, so a later numeric aggregation does not hit object dtype.
    missing = float('nan')

    all_aggregate_sentiments = []
    current_date = start_date

    while current_date <= end_date:
        day = current_date.strftime('%Y-%m-%d')

        # Start from an all-missing row; fill in means only when available.
        aggregate_sentiment = {'date': day}
        for field in text_fields:
            aggregate_sentiment[f'mean_{field}_sentiment'] = missing

        news_data = fetch_news_for_date(api_key, query, day)
        articles = news_data.get('articles') if isinstance(news_data, dict) else None

        if articles is None:
            print(f"No articles found for {day} or API error.")
        elif articles:
            # An empty article list is skipped outright: there is nothing to
            # preprocess or score for that day.
            news_df = preprocess_news(articles)
            if not news_df.empty:
                for field in text_fields:
                    scores = news_df[field].apply(analyze_sentiment_vader)
                    aggregate_sentiment[f'mean_{field}_sentiment'] = scores.mean()

        all_aggregate_sentiments.append(aggregate_sentiment)
        current_date += timedelta(days=1)

    return pd.DataFrame(all_aggregate_sentiments)

# Fetch news data and aggregate sentiment scores for the specified date range
news_query = 'oil'
news_start_date = datetime.strptime('2024-07-09', '%Y-%m-%d')
news_end_date = datetime.strptime('2024-08-08', '%Y-%m-%d')

news_sentiments_df = fetch_and_aggregate_sentiment(NEWS_API_KEY, news_query, news_start_date, news_end_date)

# Ensure 'date' is in date format
news_sentiments_df['date'] = pd.to_datetime(news_sentiments_df['date']).dt.date

# Missing days may arrive as None; force the score columns to numeric so the
# mean below never has to aggregate an object-dtype column.
mean_sentiment_columns = ['mean_title_sentiment', 'mean_description_sentiment', 'mean_content_sentiment']
news_sentiments_df[mean_sentiment_columns] = news_sentiments_df[mean_sentiment_columns].apply(
    pd.to_numeric, errors='coerce'
)

# Group by date and calculate the mean of sentiment scores where dates overlap
news_sentiments_df = news_sentiments_df.groupby('date').mean().reset_index()

# Save the aggregated sentiment DataFrame to a CSV file
news_sentiments_df.to_csv('news_vader_sentiment.csv', index=False)

# Display the aggregated sentiment DataFrame
print(news_sentiments_df)


# Fetch Guardian news sentiments
guardian_sentiments_df = fetch_guardian_news()

# The Guardian frame names its columns '<field>_sentiment' while the news
# frame uses 'mean_<field>_sentiment'. Without aligning the names, concat
# keeps them as six separate columns and the two sources are never averaged
# together. A renamed copy is used so guardian_sentiments_df itself keeps
# its original column names.
guardian_aligned_df = guardian_sentiments_df.rename(columns={
    'title_sentiment': 'mean_title_sentiment',
    'description_sentiment': 'mean_description_sentiment',
    'content_sentiment': 'mean_content_sentiment',
})

# Combine both DataFrames
combined_sentiments_df = pd.concat([guardian_aligned_df, news_sentiments_df])

# Group by date and calculate the mean of sentiment scores
combined_sentiments_df = combined_sentiments_df.groupby('date').mean().reset_index()

# Save the combined DataFrame to a CSV file
combined_sentiments_df.to_csv('combined_vader_sentiment.csv', index=False)

# Display the combined DataFrame
print(combined_sentiments_df)




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\TAMILSELVAN\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


          date  mean_title_sentiment  mean_description_sentiment  \
0   2024-07-09             -0.019346                    0.092961   
1   2024-07-10              0.035975                    0.175266   
2   2024-07-11             -0.102986                    0.073126   
3   2024-07-12              0.012971                    0.116878   
4   2024-07-13             -0.022923                    0.051927   
5   2024-07-14              0.028743                    0.189363   
6   2024-07-15             -0.032253                    0.125018   
7   2024-07-16             -0.002119                    0.116483   
8   2024-07-17             -0.029091                    0.106871   
9   2024-07-18              0.001090                    0.106564   
10  2024-07-19              0.026746                    0.160584   
11  2024-07-20             -0.110624                    0.019728   
12  2024-07-21              0.019517                    0.115370   
13  2024-07-22              0.020059            