In [83]:
import requests
import pandas as pd
import datetime
import time
import random
import re
import spacy
import yfinance as yf

In [66]:
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta

In [67]:
nlp = spacy.load('en_core_web_sm')

# Scraping news dataset

In [84]:
def scrape_news(query, start_date, end_date, max_entries = 5):
    """
    Scrapes news articles from Google News RSS for a given query and filters them by data.

    Parameters:
        query(str): the stock or company name to search for.
        start_date(date): the start date for filtering articles.
        end_date(date): the end date for filtering articles.
        max_entries(int): number of times to retry incase of request failure.

    returns:
        pd.DataFrame: a dataframe consisting of article title, link, and publication date
    """
    articles = []
    url = f"https://news.google.com/rss/search?q={query}+after:{start_date}+before:{end_date}"
    for i in range(max_entries):
        try:
            response = requests.get(url)
            break
        except exception as e:
            print(f"an error occured in requesting {e}")
            print(f"retrying attempt {i}")
            time.sleep(random.uniform(2,10))
        
    soup = BeautifulSoup(response.content, 'xml')
    items = soup.find_all('item')
    
    for item in items:
        title = item.title.text
        link = item.link.text
        pub_date = item.pubDate.text
        articles.append({'title': title, 'link': link, 'pub_date': pub_date})
    return pd.DataFrame(articles)

In [30]:
def scrape_news_over_date_range(query, start_date, end_date):
    """
    Scrapes news articles over a given date range by iterating through months.

    Parameters:
        query(str): The stock or company name to search for.
        start_date(str): the start date in 'yyyy-mm-dd' format.
        end_date(str): the end date in 'yyyy-mm-dd' format.

    Returns:
        pd.DataFrame: A DataFrame collected across the date range.
    """
    start_date_dt = datetime.datetime.strptime(start_date, '%Y-%m-%d').date()
    end_date_dt = datetime.datetime.strptime(end_date, '%Y-%m-%d').date()

    all_articles = pd.DataFrame()

    current_date = start_date_dt

    while current_date<end_date_dt:
        next_date = min(current_date + relativedelta(months = 1), end_date_dt)
        
        start_str = current_date.strftime('%Y-%m-%d')
        end_str = next_date.strftime('%Y-%m-%d')
        month_articles = scrape_news(query, start_str, end_str)
        all_articles = pd.concat([all_articles, month_articles], ignore_index=True)
        current_date = next_date

    return all_articles    

In [31]:
def scrape_and_save_news(queries, start_date, end_date):
    """
    Scrapes news articles for mutliple queries over a gives date range and saves each result to a csv file

    Parameters:
        queries(list): A list of stock or company names to search for.
        start_date(str): the start date in 'yyyy-mm-dd' format.
        end_date(str): the end date in 'yyyy-mm-dd' format.

    Outputs:
        CSV files named after each query, containing scraped news articles.
    """
    for query in queries:
        articles = scrape_news_over_date_range(query, start_date, end_date)
        output_csv = f"{query}.csv"
        articles.to_csv(output_csv, index=False)
        print(f"Saved {query} articles to {output_csv}")

In [32]:
queries = ["Reliance", "Microsoft", "Google"]
start_date = '2020-01-01'
end_date = '2022-12-31'
scrape_and_save_news(queries, start_date, end_date)

Saved Reliance articles to Reliance.csv
Saved Microsoft articles to Microsoft.csv
Saved Google articles to Google.csv


In [45]:
news_df1 = pd.read_csv('Reliance.csv')
news_df2 = pd.read_csv('Microsoft.csv')
news_df3 = pd.read_csv('Google.csv')

# Pre-Processing

In [75]:
def clean_text(text: str):
    """
    Cleans the text and returns in form of tokens.

    Parameters:
    text(str): string to be cleaned.

    Returns:
    str: a joint of tokens created.
    """
    text = text.lower()
    doc = nlp(text.lower())

    tokens = [token.lemma_ for token in doc 
             if not token.is_stop 
             and not token.is_punct 
             and token.text not in ["'", '"']]

    return ' '.join(tokens)

In [76]:
news_df1['cleaned_text'] = news_df1['title'].apply(clean_text)
news_df2['cleaned_text'] = news_df2['title'].apply(clean_text)
news_df2['cleaned_text'] = news_df2['title'].apply(clean_text)

# Extracting Stock Prices for supervised training

In [99]:
start_date = '2022-01-01'
end_date = '2022-12-31'

stock_symbols = {'Reliance': 'RELIANCE.NS', 'Microcosft': 'MSFT', 'Google': 'GOOGL'}

date_range = pd.date_range(start = start_date, end = end_date, freq = 'B') # 'B' for business days

def fetch_stock_data(symbol:str, max_entries =3):
    """
    fetches stock data from yahoo.

    Parameters:
    symbol(str): symbol corresponding to the country name.
    max_entries(int): no. of retries in fetching if the fetch fails.

    Returns: 
    pd.DataFrame: dataset containing stockprices
    """
    for i in range(max_entries):
        try:
            stock_data = yf.download(symbol, start= start_date, end= end_date)
        except Exception as e:
            print (f'error fetching data {e}; retrying attempt {i}.....')
        time.sleep(random.uniform(2,5))

        return stock_data

In [101]:
for company, symbol in stock_symbols.items():
    stock_data = fetch_stock_data(symbol)
    stock_data.reset_index(inplace = True)
    output_csv = f'{company}_stock_prices.csv'
    stock_data.to_csv(output_csv, index = False)
    print(f"stock_price for {company} saved")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

stock_price for Reliance saved



[*********************100%***********************]  1 of 1 completed

stock_price for Microcosft saved





stock_price for Google saved


In [102]:
stock_df1 = pd.read_csv('Reliance_stock_prices.csv')
stock_df2 = pd.read_csv('Microsoft_stock_prices.csv')
stock_df2 = pd.read_csv('Google_stock_prices.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Microsoft_stock_prices.csv'