In [None]:
%pip install pandas yfinance tqdm

In [5]:
import pandas as pd
import yfinance as yf
from tqdm import tqdm
import numpy as np

In [None]:
# Specify the input file path for macOS
input_file = '/Users/christopher/Desktop/Seminar-Advanced-Finance/stocks.csv'

# Specify the output file path for macOS
output_file = '/Users/christopher/Desktop/Seminar-Advanced-Finance/stocks_data.csv'

# Specify the output file path for macOS
output_file2 = '/Users/christopher/Desktop/Seminar-Advanced-Finance/stocks_data2.csv'

# Read the CSV file
stocks_df = pd.read_csv(input_file)

In [None]:
# Filter rows to only include tickers in the 'Financials' and 'Industrials' sectors
stocks_df = stocks_df[stocks_df['GICS Sector'].isin(['Financials', 'Industrials'])]

# List to accumulate data frames for each ticker
final_df = pd.DataFrame()

# List to accumulate data frames for each ticker
final_df2 = pd.DataFrame()

In [20]:
# Loop (iteration) over each row/ticker
for idx, row in tqdm(stocks_df.iterrows(), 
                     total=stocks_df.shape[0],
                     desc="Downloading stock data"):
    
    # Extract the ticker symbol & GICS sector from the row
    ticker = row['Symbol']       
    gics_sector = row['GICS Sector']

    # Download data from yfinance
    data = yf.download(ticker, start="2000-01-01", end="2025-01-01", progress=False, auto_adjust=False)

    # Add 'Ticker' and 'GICS Sector' as normal columns
    data["Ticker"] = ticker
    data["GICS Sector"] = gics_sector

    # 3) Reset index so we have a "Date" column (instead of an index)
    data.reset_index(inplace=True)

    data.columns = data.columns.droplevel(1)

    data = data[['Date', 'Adj Close', 'Ticker', 'GICS Sector']]

    final_df = pd.concat([final_df, data], ignore_index=True)

Downloading stock data:  11%|█▏        | 17/151 [00:04<00:38,  3.46it/s]


KeyboardInterrupt: 

In [148]:
final_df.to_csv(output_file, index=False)

In [46]:
# Loop over each row/ticker
for idx, row in tqdm(stocks_df.iterrows(), total=stocks_df.shape[0], desc="Downloading stock data"):
    ticker = row['Symbol']
    gics_sector = row['GICS Sector']

    # Download data from yfinance for a single ticker
    data2 = yf.download(
        tickers=ticker,
        start="1999-01-01",
        end="2025-01-01",
        progress=False,
        auto_adjust=False
    )

    # Skip if no data was returned
    if data2.empty:
        continue

    # If columns is a MultiIndex, drop the second level
    if isinstance(data2.columns, pd.MultiIndex):
        data2.columns = data2.columns.droplevel(1)

    # Add 'Ticker' and 'GICS Sector' as normal columns
    data2["Ticker"] = ticker
    data2["GICS Sector"] = gics_sector

    # Reset index so we have a "Date" column (instead of an index)
    data2.reset_index(inplace=True)

    # Calculate daily log returns
    data2['log_return'] = np.log(data2['Adj Close'] / data2['Adj Close'].shift(1))

    # Calculate volatility as the rolling standard deviation of log returns over 63 trading days
    data2['Volatility'] = data2['log_return'].rolling(window=63).std()

    # Drop the temporary 'log_return' column if not needed
    data2.drop(columns=['log_return'], inplace=True)

    # Select the relevant columns
    data2 = data2[['Date', 'Adj Close', 'Volatility', 'Ticker', 'GICS Sector']]

    # Append the processed data to the final dataframe (long format)
    final_df2 = pd.concat([final_df2, data2], ignore_index=True)

# final_df2 now contains all tickers in a single, long-format DataFrame


Downloading stock data: 100%|██████████| 151/151 [00:44<00:00,  3.38it/s]


In [50]:
# Ensure 'Date' is a datetime column
final_df2['Date'] = pd.to_datetime(final_df2['Date'])

# Filter out rows where the year is 1999
final_df2 = final_df2[final_df2['Date'].dt.year != 1999]

In [53]:
# Make sure 'Date' is datetime and sort by it if necessary
final_df2['Date'] = pd.to_datetime(final_df2['Date'])
final_df2.sort_values(['Ticker', 'Date'], inplace=True)

def fill_earliest_volatility(group):
    # Find the first non-NaN volatility for this ticker group
    non_na_vol = group['Volatility'].dropna()
    if not non_na_vol.empty:
        earliest_vol = non_na_vol.iloc[0]
        # Fill missing values with the earliest available volatility
        group['Volatility'] = group['Volatility'].fillna(earliest_vol)
    return group

final_df2 = final_df2.groupby('Ticker', group_keys=False).apply(fill_earliest_volatility)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df2['Date'] = pd.to_datetime(final_df2['Date'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df2.sort_values(['Ticker', 'Date'], inplace=True)
  final_df2 = final_df2.groupby('Ticker', group_keys=False).apply(fill_earliest_volatility)


In [None]:
final_df2.to_csv(output_file2, index=False)