In [30]:
# Destination CSV file for the cleaned news headlines.
# NOTE(review): absolute, machine-specific Windows path -- consider deriving it from a configurable project root.
COMPANIES_NEWS_PATH = r'D:\Projects\Quantum Bits\News-Impact-on-Stock-Prices-\Project_data\output\campanies_news.csv'

In [46]:
from urllib.request import urlopen , Request
from bs4 import BeautifulSoup
import pandas as pd
import re

In [36]:
def _fetch_news_table(url: str):
    """Download one Finviz quote page and return its 'news-table' element, or None if absent."""
    # Send the request with a user-agent header to avoid being blocked
    req = Request(url, headers={'user-agent': 'google'})

    # The context manager closes the HTTP response once the page has been parsed
    with urlopen(req) as response:
        html = BeautifulSoup(response, 'html.parser')

    return html.find('table', {'id': 'news-table'})


def _parse_news_rows(ticker: str, news_table) -> list:
    """Turn the rows of one news table into [ticker, date, time, headline] records."""
    records = []
    date = None  # reset per table so a date never leaks in from another ticker

    for row in news_table.find_all('tr'):
        # Rows without a link or without a timestamp cell carry no headline; skip them
        if row.a is None or row.td is None:
            continue

        title = row.a.text.strip()
        # split() with no argument tolerates repeated spaces/newlines inside the cell
        stamp = row.td.text.split()

        if len(stamp) >= 2:
            date, time_str = stamp[0], stamp[1]
        elif stamp:
            # Only a time is present: the row keeps the date of the row above it
            time_str = stamp[0]
        else:
            time_str = ''

        records.append([ticker, date, time_str, title])

    return records


def get_news(tickers: list) -> pd.DataFrame:
    """
    Fetches recent news headlines for the given list of stock tickers from Finviz.

    Args:
    tickers (list): A list of stock ticker symbols (e.g., ['AAPL', 'GOOGL']).
        Duplicate symbols are reported only once.

    Returns:
    pd.DataFrame: A DataFrame with the columns 'Ticker', 'Date', 'Time' and 'Headline'.
        It is empty (but has these columns) when no headlines were found. Tickers whose
        page has no news table are skipped. 'Date' is None for rows that appear before
        the first dated row of a table.

    Raises:
    urllib.error.URLError: If a quote page cannot be downloaded.
    """
    finviz_url = 'https://finviz.com/quote.ashx?t='  # Base URL for Finviz stock pages

    # Fetch phase: one news table per ticker (a dict, so repeated tickers collapse)
    news_tables = {}
    for ticker in tickers:
        news_tables[ticker] = _fetch_news_table(finviz_url + ticker)

    # Parse phase: flatten every table into rows of ticker/date/time/headline
    parsed_data = []
    for ticker, news_table in news_tables.items():
        if news_table is None:
            # Page contained no element with id 'news-table'; nothing to parse
            continue
        parsed_data.extend(_parse_news_rows(ticker, news_table))

    return pd.DataFrame(parsed_data, columns=['Ticker', 'Date', 'Time', 'Headline'])


In [50]:
# Stock symbols whose Finviz headlines are scraped
tickers = ['DELL', 'AMZN', 'META', 'NVDA']

# Raw scraped headlines, one row per news item
df = get_news(tickers)

In [53]:
# Show the scraped frame; the notebook renders a truncated view of its first and last rows
df

Unnamed: 0,Ticker,Date,Time,Headline
0,DELL,Today,10:30AM,Dell Technologies Inc. (DELL): This AI Stock I...
1,DELL,Aug-30-24,04:40PM,"These Stocks Moved the Most Today: Intel, Supe..."
2,DELL,Aug-30-24,04:36PM,Heard on the Street: Dell Shows AI Isn't a Los...
3,DELL,Aug-30-24,04:17PM,AI Computer Sales Fuel Dell's Better-Than-Expe...
4,DELL,Aug-30-24,02:25PM,"These Stocks Are Moving the Most Today: Intel,..."
...,...,...,...,...
395,NVDA,Aug-30-24,04:05AM,Forget Nvidia: Could This Be the Most Underrat...
396,NVDA,Aug-30-24,03:34AM,"Nvidia's Stock Has Peaked, and 1 Under-the-Rad..."
397,NVDA,Aug-30-24,02:00AM,The Stock Market Is in the Homestretch of 2024...
398,NVDA,Aug-30-24,12:13AM,Nvidias Future Relies on Chips That Push Techn...


# Data Cleaning

In [44]:
# Count missing values per column before cleaning
df.isna().sum()

Ticker      0
Date        0
Time        0
Headline    0
dtype: int64

In [55]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans and formats the news DataFrame and returns the result as a new frame.

    The input frame is left untouched, so the raw data stays available and the
    function can be re-run on it with the same result.

    Args:
    df (pd.DataFrame): The DataFrame containing scraped news headlines. It must have
        a string 'Headline' column and a 'Date' column.

    Returns:
    pd.DataFrame: A copy of ``df`` in which 'Headline' is lowercased and stripped of
        every character that is not an ASCII letter, a digit or whitespace, and in which
        the text 'Today' in 'Date' is replaced by the current date formatted as
        '%b-%d-%y' (e.g. 'Aug-31-24').
    """
    # Work on a copy so the caller's raw frame is not modified
    cleaned = df.copy()

    # Lowercase first, then drop everything except letters, digits and whitespace
    cleaned['Headline'] = cleaned['Headline'].apply(
        lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower())
    )

    # Resolve the relative 'Today' label once instead of once per row;
    # the .str accessor leaves missing dates as they are instead of raising
    today_label = pd.to_datetime('today').strftime('%b-%d-%y')
    cleaned['Date'] = cleaned['Date'].str.replace('Today', today_label, regex=False)

    return cleaned

In [56]:
# Preview the first rows of the scraped frame
df.head()

Unnamed: 0,Ticker,Date,Time,Headline
0,DELL,Today,10:30AM,Dell Technologies Inc. (DELL): This AI Stock I...
1,DELL,Aug-30-24,04:40PM,"These Stocks Moved the Most Today: Intel, Supe..."
2,DELL,Aug-30-24,04:36PM,Heard on the Street: Dell Shows AI Isn't a Los...
3,DELL,Aug-30-24,04:17PM,AI Computer Sales Fuel Dell's Better-Than-Expe...
4,DELL,Aug-30-24,02:25PM,"These Stocks Are Moving the Most Today: Intel,..."


In [57]:
# Apply the cleaning step to the scraped headlines and preview the first rows of the result
cleaned_df = clean_data(df)
cleaned_df.head()

Unnamed: 0,Ticker,Date,Time,Headline
0,DELL,Aug-31-24,10:30AM,dell technologies inc dell this ai stock is tr...
1,DELL,Aug-30-24,04:40PM,these stocks moved the most today intel super ...
2,DELL,Aug-30-24,04:36PM,heard on the street dell shows ai isnt a loss ...
3,DELL,Aug-30-24,04:17PM,ai computer sales fuel dells betterthanexpecte...
4,DELL,Aug-30-24,02:25PM,these stocks are moving the most today intel s...


# Save the data

In [58]:
# Write the cleaned headlines to CSV, leaving out the DataFrame index
cleaned_df.to_csv(COMPANIES_NEWS_PATH, index=False)