# In [None]:
import os
import glob
import numpy as np
import pandas as pd
import requests
import time

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for Data
#import yfinance as yf


# Extract raw data: read every CSV in the raw-data directory and stack the
# rows into a single DataFrame.
PATH = os.path.expanduser("~/investor/data/raw")
# glob returns matches in arbitrary, OS-dependent order; sort them so the row
# order of the combined frame (and anything order-dependent computed from it)
# is reproducible between runs.
all_files = sorted(glob.glob(os.path.join(PATH, "*.csv")))

if not all_files:
    # Without this guard pd.concat fails with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found in {PATH}")

all_data = [
    pd.read_csv(filename, index_col=None, header=0) for filename in all_files
]

df = pd.concat(all_data, axis=0, ignore_index=True)

# Normalize the column names: drop any leading/trailing '<' and '>'
# characters and upper-case the result.
new_column_names = {col: col.strip('<>').upper() for col in df.columns}
df = df.rename(columns=new_column_names)

# Convert DATE to datetime, then keep rows dated after 2001-12-31 and up to
# (and including) 2024-01-01.
# Integer dates are assumed to be encoded as YYYYMMDD (e.g. 20020102) --
# TODO confirm against the raw files. Any other dtype is left to pandas'
# own date parsing.
if pd.api.types.is_integer_dtype(df['DATE']):
    df['DATE'] = pd.to_datetime(df['DATE'].astype(str), format='%Y%m%d')
else:
    df['DATE'] = pd.to_datetime(df['DATE'])

# Compare real timestamps so the range check is chronological instead of
# comparing raw values against strings.
in_range = (
    (df['DATE'] > pd.Timestamp('2001-12-31'))
    & (df['DATE'] <= pd.Timestamp('2024-01-01'))
)
# .copy() detaches the result from the unfiltered frame, so adding columns
# to it later does not raise SettingWithCopyWarning.
df = df[in_range].copy()

df.info()

# OHLC prices: These are essential for technical analysis and commonly used to derive various technical indicators.
# Volume: Trading volume provides insights into market activity and liquidity.
#
# Log return of consecutive OPEN prices: ln(OPEN_t / OPEN_{t-1}).
# NOTE: the column keeps the name 'discrete_return', but the values are log
# (continuously compounded) returns. The simple/discrete return would be:
# df['discrete_return'] = (df['CLOSE'] - df['CLOSE'].shift(1)) / df['CLOSE'].shift(1)
# OPEN is used as opposed to closing prices, to avoid look-ahead bias.
# NOTE(review): shift(1) works on row order only; if df stacks rows from
# several files, the first row of one file is paired with the last row of
# the previous one -- confirm that is intended.
# assign() builds a new frame, so this does not write into a possible slice
# of another DataFrame (no SettingWithCopyWarning).
df = df.assign(discrete_return=np.log(df['OPEN'] / df['OPEN'].shift(1)))

print(df.describe())
df.head()

# Pairplot to visualize pairwise relationships between the price and volume columns
sns.pairplot(df[['OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOLUME']], corner=True)
# pairplot draws a grid of subplots; plt.title would label only the current
# axes, so set a figure-level title instead. y > 1 lifts it above the top row.
plt.suptitle('Pairplot of Numerical Variables', y=1.02)
plt.show()

# NOTE(review): df_A is not defined in the visible part of this file --
# presumably it is created elsewhere; confirm before running this section.

# Ensure DATE is a datetime column and derive the calendar year from it
df_A['DATE'] = pd.to_datetime(df_A['DATE'])
df_A['Year'] = df_A['DATE'].dt.year

# Per-year mean of each price column, with 'Year' restored as a regular column
yearly_data = (
    df_A[['Year', 'OPEN', 'HIGH', 'LOW', 'CLOSE']]
    .groupby('Year')
    .mean()
    .reset_index()
)

# Long format: one row per (Year, Feature) pair so all features share one x-axis
melted_yearly_data = pd.melt(
    yearly_data,
    id_vars=['Year'],
    var_name='Feature',
    value_name='Mean Value',
)

# One line per feature showing its yearly mean
plt.figure(figsize=(8, 6))
ax = sns.lineplot(x='Year', y='Mean Value', hue='Feature', data=melted_yearly_data)
ax.set_title('Yearly Mean of Features')
ax.set_xlabel('Year')
ax.set_ylabel('Mean Value')
plt.xticks(rotation=45)
# Anchor the legend outside the axes, to the right of the plot area
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()