In [2]:

import pandas as pd 
import sys
import os

# Get the absolute path to the 'scripts' directory
scripts_dir = os.path.abspath("../Scripts")
print(f"Scripts directory path: {scripts_dir}")

# Add 'scripts' directory to sys.path
sys.path.append(scripts_dir)

# Define the load_data function

def load_data(file_paths):
    dataframes = []
    for file_path in file_paths:
        print(f"Attempting to load: {file_path}")  # Log file being processed
        if not os.path.isfile(file_path):
            dataframes.append(f"File does not exist: {file_path}")
            continue
        
        try:
            if file_path.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif file_path.endswith('.xlsx'):
                df = pd.read_excel(file_path)
            else:
                raise ValueError(f"Unsupported file type for {file_path}")
            dataframes.append(df)
            print(f"Successfully loaded: {file_path}")  # Log successful load
        except Exception as e:
            dataframes.append(f"Error loading {file_path}: {str(e)}")
            print(f"Error loading {file_path}: {str(e)}")  # Log the error
    return dataframes

# File paths
file_paths = [
    '../data/AAPL_historical_data.csv',
    '../data/AMZN_historical_data.csv',
    '../data/GOOG_historical_data.csv',
    '../data/MSFT_historical_data.csv',
    '../data/META_historical_data.csv',
    '../data/NVDA_historical_data.csv',
    '../data/raw_analyst_ratings.csv',
    '../data/TSLA_historical_data.csv'
]

# Load data
dataframes = load_data(file_paths)

# Display basic information about each dataset
for i, df in enumerate(dataframes):
    print(f"Dataset {i+1}:")
    if isinstance(df, pd.DataFrame):
        print(df.info())
        print("\nHead of the data:")
        print(df.head())
        print("\nMissing values:")
        print(df.isnull().sum())
    else:
        print(df)  # Print error message if it's not a DataFrame
    print("\n")

Scripts directory path: c:\Users\hp\Desktop\Projects\ML\KAIM\News-Sentiment-Analyzer\venv\Scripts
Attempting to load: ../data/AAPL_historical_data.csv
Successfully loaded: ../data/AAPL_historical_data.csv
Attempting to load: ../data/AMZN_historical_data.csv
Successfully loaded: ../data/AMZN_historical_data.csv
Attempting to load: ../data/GOOG_historical_data.csv
Successfully loaded: ../data/GOOG_historical_data.csv
Attempting to load: ../data/MSFT_historical_data.csv
Successfully loaded: ../data/MSFT_historical_data.csv
Attempting to load: ../data/META_historical_data.csv
Successfully loaded: ../data/META_historical_data.csv
Attempting to load: ../data/NVDA_historical_data.csv
Successfully loaded: ../data/NVDA_historical_data.csv
Attempting to load: ../data/raw_analyst_ratings.csv
Successfully loaded: ../data/raw_analyst_ratings.csv
Attempting to load: ../data/TSLA_historical_data.csv
Successfully loaded: ../data/TSLA_historical_data.csv
Dataset 1:
<class 'pandas.core.frame.DataFrame'>

In [3]:
import sys
import os

# Add the scripts directory to the Python path
sys.path.append(os.path.abspath('../Scripts'))

# Load the raw analyst ratings data
raw_analyst_ratings_df = pd.read_csv('../data/raw_analyst_ratings.csv')

def eda_analyst_ratings(df):
    
    # Convert the date column to datetime, handling timezone info correctly
    df['date'] = pd.to_datetime(df['date'], utc=True, errors='coerce')

    # Print the data types after conversion
    print("\nData type of 'date' column after conversion:")
    print(df['date'].dtype)

    # Print the first few rows of the date column after conversion
    print("\nConverted 'date' column:")
    print(df['date'])

    # Drop rows with invalid dates
    df = df.dropna(subset=['date'])

    # 1. Basic statistics for textual lengths (like headline length)
    df['headline_length'] = df['headline'].apply(len)
    print(f"Descriptive Statistics for headline length:\n{df['headline_length'].describe()}\n")
    
    # 2. Count the number of articles per publisher
    publisher_counts = df['publisher'].value_counts()
    print(f"Article Counts by Publisher:\n{publisher_counts}\n")
    
    # 3. Analyze the publication dates
    publication_trends = df['date'].dt.date.value_counts().sort_index()
    print(f"Publication Trends over Time:\n{publication_trends}\n")

    return df

# Perform EDA on the data
processed_df = eda_analyst_ratings(raw_analyst_ratings_df)

# Display the first few rows of the processed DataFrame
processed_df.head()


Data type of 'date' column after conversion:
datetime64[ns, UTC]

Converted 'date' column:
0         2020-06-05 14:30:54+00:00
1         2020-06-03 14:45:20+00:00
2         2020-05-26 08:30:07+00:00
3         2020-05-22 16:45:06+00:00
4         2020-05-22 15:38:59+00:00
                     ...           
1407323                         NaT
1407324                         NaT
1407325                         NaT
1407326                         NaT
1407327                         NaT
Name: date, Length: 1407328, dtype: datetime64[ns, UTC]
Descriptive Statistics for headline length:
count    55987.000000
mean        80.015254
std         56.126094
min         12.000000
25%         42.000000
50%         63.000000
75%         91.000000
max        512.000000
Name: headline_length, dtype: float64

Article Counts by Publisher:
publisher
Benzinga Newsdesk    14750
Lisa Levin           12408
ETF Professor         4362
Paul Quintaro         4212
Benzinga Newsdesk     3177
                     ..

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['headline_length'] = df['headline'].apply(len)


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,headline_length
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 14:30:54+00:00,A,39
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 14:45:20+00:00,A,42
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 08:30:07+00:00,A,29
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 16:45:06+00:00,A,44
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 15:38:59+00:00,A,87
