# Testing for Alpha Vantage api

In [None]:
import requests
import json

# replace the "demo" apikey below with your own key from https://www.alphavantage.co/support/#api-key
url = 'https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers=AAPL&apikey=1RZHEEKZF7GJSSST&time_from=20220401T0000&time_to=20220430T2359&sort=EARLIEST&limit=1000'
r = requests.get(url)
data = r.json()
pretty_data = json.dumps(data,indent=5)
print(pretty_data)
with open('data.json','w',encoding='utf-8') as f:
    json.dump(data,f,ensure_ascii=False,indent=5)

# Real news data gathering starts from here. 
Pros: Gives title, summary, published date, stock symbol, news sentiment score, and news sentiment label

Cons: Can't provided data for year 2020 or before, (tentative I have to check for 21 and 22 also, provides data from 2023)

In [139]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import json
import re

In [140]:
# Your Alpha Vantage API key
API_KEYS = ['N133B8ITLPTU45Y2','U7J2H9AUXI5A3Y7B','P140IFC9IPWNGWS4','KXC6FJWDP2IBPFYC','P3WMQIYK1B8BD17T','1P6HUJCX8F7V9ZT9','N07GTEI26X0IH5FM']
cur_api_idx = 0
cur_api_count = 0
ALPHA_VANTAGE_API_KEY = API_KEYS[cur_api_idx]
BASE_URL = "https://www.alphavantage.co/query?function=NEWS_SENTIMENT"

In [141]:
# List of tickers for which to retrieve news
tickers = ['AAPL', 'TSLA', 'AMZN', 'NKE', 'NVDA']

# Overall date range for the news retrieval
overall_start = datetime(2022, 3, 1)
overall_end = datetime(2025, 3, 31)

In [142]:
def get_monthly_ranges(start_date, end_date):
    """
    Breaks the overall date range into monthly intervals.
    Returns a list of tuples, each with a (formatted_month_start, formatted_month_end) string,
    using the format 'YYYYMMDDTHHMM'.
    """
    ranges = []
    current = start_date
    while current <= end_date:
        month_start = current.replace(day=1)
        month_end = (month_start + relativedelta(months=1)) - timedelta(days=1)
        # Ensure the month_end does not exceed the overall end date
        if month_end > end_date:
            month_end = end_date
        # Format both start and end dates to 'YYYYMMDDTHHMM'
        start_str = month_start.strftime("%Y%m%dT%H%M")
        end_str = month_end.strftime("%Y%m%dT%H%M")
        ranges.append((start_str, end_str))
        current = month_end + timedelta(days=1)
    return ranges

monthly_ranges = get_monthly_ranges(overall_start, overall_end)
print(len(monthly_ranges))

37


In [143]:
def get_url(ticker,start_date,end_date,api_key,limit=1000,sort='EARLIEST'):
    ticker_url = BASE_URL
    ticker_url += '&tickers=' + ticker
    ticker_url += '&apikey=' + api_key
    ticker_url += '&time_from=' + start_date
    ticker_url += '&time_to=' + end_date
    ticker_url += '&sort=' + sort
    ticker_url += '&limit=' + str(limit)
    return ticker_url

def escape_unicode(text):
    # This will convert non-ASCII characters to their \uXXXX escape sequences
    return text.encode('unicode-escape').decode('ascii')

def remove_invalid_control_chars(text):
    # Remove control characters except tab (\x09), newline (\x0A), and carriage return (\x0D)
    #text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F]', '', text)
    #text = re.sub(r'\\"',"'",text)
    #text = re.sub(r'""','"',text)
    # Step 2: Remove any extra quotes that come directly after a control character (like \x04")
    # This example simply removes occurrences of \x04" (control character and a quote)
    text = text.replace('"b"', '"',10000)
    text = text.replace('b"', '',10000)
    return text

In [144]:
def get_news_for_ticker_alphavantage(ticker_url,ticker):
    """
    Retrieves news for a given ticker from the Alpha Vantage NEWS_SENTIMENT endpoint
    for the specified date range.
    
    Parameters:
      - ticker_url: API call url
      
    Returns:
      A list of dictionaries, each representing an article with:
        - published_date: Publication date (string format from API)
        - title: Article title
        - summary: Article summary
        - ticker: The current ticker symbol
        - ticker_sentiment_score: Sentiment score for the ticker (if available)
        - ticker_sentiment_label: Sentiment label for the ticker (if available)
    """
    API_KEY = "6adf4292cfc8356b943932b74f1e32bb"
    scrape_url = f"http://api.scraperapi.com?api_key={API_KEY}&url={ticker_url}"
    print(scrape_url)

    try:
        response = requests.get(scrape_url, timeout=120)
        response.raise_for_status()  # Will raise an HTTPError if not 2xx
        decoded_text = response.content.decode('utf-8-sig')
        decoded_text = remove_invalid_control_chars(decoded_text)
        pattern = r'(?<=\w)"(?=\s*\w)'
        cleaned_text = re.sub(pattern, '', decoded_text)
        try:
            data = json.loads(cleaned_text,strict=False)
        except json.JSONDecodeError as e:
            print(f"[!] JSONDecodeError for {ticker_url}: {e}")
            #print(f"[!] Response text (first 500 chars):\n{str(cleaned_text)}")
            return []
    except requests.RequestException as e:
        print(f"[!] RequestException for {ticker_url}: {e}")
        return []

    articles = []
    if 'feed' in data:
        for item in data['feed']:
            published_date = item.get("time_published", "")
            title = item.get("title", "")
            summary = item.get("summary", "")
            ticker_sentiments = item.get("ticker_sentiment", [])
            ticker_sentiment_score = None
            ticker_sentiment_label = None
            for sentiment in ticker_sentiments:
                if sentiment.get("ticker") == ticker:
                    ticker_sentiment_score = sentiment.get("ticker_sentiment_score")
                    ticker_sentiment_label = sentiment.get("ticker_sentiment_label")
                    break

            articles.append({
                "published_date": published_date,
                "title": title,
                "summary": summary,
                "ticker": ticker,
                "ticker_sentiment_score": ticker_sentiment_score,
                "ticker_sentiment_label": ticker_sentiment_label
            })
    else:
        print(f"[!] No 'feed' found for {ticker_url}. Response:\n{json.dumps(data, indent=2)}")
    return articles

In [145]:
for ticker in tickers[4:]:
    ticker_articles = []
    for start, end in monthly_ranges:
        print(f"Fetching {ticker} news from {start} to {end}")
        ticker_url = get_url(ticker,start,end,API_KEYS[cur_api_idx])
        #print(ticker_url)
        cur_api_count += 1
        if (cur_api_count%25 == 0):
            cur_api_count = 0
            cur_api_idx += 1
        articles = get_news_for_ticker_alphavantage(ticker_url,ticker)
        if(len(articles)>0):
            ticker_articles.extend(articles)
    
    # Optionally deduplicate articles based on title and published date to remove overlaps
    df = pd.DataFrame(ticker_articles).drop_duplicates(subset=["title", "published_date"])
    df.to_csv(f"../3years_results/{ticker}_alpha_news_data.csv", index=False)
    #all_articles.append(df)
    
print("All News articles saved.")

Fetching NVDA news from 20220301T0000 to 20220331T0000
http://api.scraperapi.com?api_key=6adf4292cfc8356b943932b74f1e32bb&url=https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers=NVDA&apikey=N133B8ITLPTU45Y2&time_from=20220301T0000&time_to=20220331T0000&sort=EARLIEST&limit=1000
Fetching NVDA news from 20220401T0000 to 20220430T0000
http://api.scraperapi.com?api_key=6adf4292cfc8356b943932b74f1e32bb&url=https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers=NVDA&apikey=N133B8ITLPTU45Y2&time_from=20220401T0000&time_to=20220430T0000&sort=EARLIEST&limit=1000
[!] JSONDecodeError for https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers=NVDA&apikey=N133B8ITLPTU45Y2&time_from=20220401T0000&time_to=20220430T0000&sort=EARLIEST&limit=1000: Expecting ',' delimiter: line 19235 column 29 (char 990082)
Fetching NVDA news from 20220501T0000 to 20220531T0000
http://api.scraperapi.com?api_key=6adf4292cfc8356b943932b74f1e32bb&url=https://www.alphavantage.co/query?funct