In [None]:
import pandas as pd
import numpy as np

# Load the stock panel.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
raw = pd.read_csv(r'C:\Users\Jrans\Desktop\Model Comparison Project\Data\FinalSet.csv')

# Parse 'date'; values that cannot be parsed become NaT instead of raising.
raw['date'] = pd.to_datetime(raw['date'], errors='coerce')

# Collapse timestamps to monthly periods (YYYY-MM).
raw['date'] = raw['date'].dt.to_period('M')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be passed to print(): that printed the report first and then a
# stray "None" after the head.  Call the two separately instead.
print(raw.head())
raw.info()

In [None]:
# Keep only observations whose 'sector' is not the string "0"
has_sector = raw["sector"] != "0"
raw = raw.loc[has_sector]

print(f"Final shape: {raw.shape}")

In [None]:
# First month (inclusive) of the analysis sample
cutoff_date = pd.Period("2000-11", freq="M")

# Boolean mask: observations dated November 2000 or later
on_or_after_cutoff = raw["date"] >= cutoff_date
raw0 = raw[on_or_after_cutoff]

# Quick look at the filtered sample
print(raw0.head())

# Persist the filtered sample next to the notebook
raw0.to_csv("raw0.csv", index=False)
print("Filtered data saved to raw0.csv")

In [None]:
# Number of distinct tickers observed within each sector
unique_ticker_counts = raw['ticker'].groupby(raw['sector']).nunique()

# Show the per-sector counts
print(unique_ticker_counts)

In [None]:
# Months between the first and last observation of every (sector, ticker) pair
ticker_timespan = (
    raw.groupby(['sector', 'ticker'])['date']
    .agg(lambda months: (months.max() - months.min()).n)
)

# Sector-level mean of those spans, expressed in years and rounded to 2 decimals
avg_timespan = ticker_timespan.groupby('sector').mean().div(12).round(2)

# Display result
avg_timespan

In [None]:
# Read the 3-month Treasury rate file used as the risk-free rate
rfr_csv = r"C:\Users\Jrans\Desktop\Model Comparison Project\Data\3month_treasury_rate.csv"
rfr = pd.read_csv(rfr_csv)
rfr.head()

In [None]:
print(raw0.columns)

## Calculate Rolling Betas
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Sort by ticker, then date, so each ticker's observations are chronological
raw0 = raw0.sort_values(by=["ticker", "date"])

# Convert `date` to datetime64.  A Period-typed column (as produced by
# `.dt.to_period('M')`) must go through `.dt.to_timestamp()`:
# `pd.to_datetime(..., errors="coerce")` rejects PeriodDtype input and, with
# errors="coerce", turns every value into NaT -- the dropna below would then
# discard the whole sample.  Any other dtype (e.g. strings re-read from CSV)
# still goes through `pd.to_datetime`.
if isinstance(raw0["date"].dtype, pd.PeriodDtype):
    raw0["date"] = raw0["date"].dt.to_timestamp()
else:
    raw0["date"] = pd.to_datetime(raw0["date"], errors="coerce")

# Drop rows whose date is missing / could not be converted
raw0 = raw0.dropna(subset=["date"])

# Column holding the returns used for the beta calculation
return_col = "return"

# Wide layout: one row per date, one column per ticker.
# (DataFrame.pivot raises if a (date, ticker) pair occurs more than once.)
pivot_df = raw0.pivot(index="date", columns="ticker", values=return_col)

# The market index column is required as the regressor
if "SPTM" not in pivot_df.columns:
    raise ValueError("Market index 'SPTM' not found in the dataset.")

# Rolling window parameters (in monthly observations)
MIN_WINDOW = 36  # Minimum of 36 months
MAX_WINDOW = 60  # Maximum of 60 months

# Container for the betas.  dtype=float keeps the frame numeric; without it an
# empty DataFrame is created with object dtype, which leaks into later
# reshaping/aggregation.
betas = pd.DataFrame(index=pivot_df.index, columns=pivot_df.columns, dtype=float)

# Single-window beta estimate used by the rolling calculation
def rolling_beta(y, x):
    """
    Regress stock returns on market returns by OLS and return the slope.

    Parameters
    ----------
    y : stock returns for one window.
    x : market returns for the same window; after the constant is added the
        slope is looked up under the label "SPTM".

    Returns
    -------
    The coefficient labelled "SPTM", or NaN when the window holds fewer
    than MIN_WINDOW observations.
    """
    enough_obs = len(y) >= MIN_WINDOW
    if not enough_obs:
        return np.nan  # window too short to estimate a beta

    design = sm.add_constant(x)  # intercept column + market returns
    fitted = sm.OLS(y, design).fit()
    return fitted.params["SPTM"]  # slope on the market return

# Print information about the data before calculations
print(f"Number of unique tickers: {len(pivot_df.columns)}")
print(f"Date range: {pivot_df.index.min()} to {pivot_df.index.max()}")
print(f"Using window size: {MAX_WINDOW} months (minimum {MIN_WINDOW} months)")

# Rolling beta of every ticker against the market index "SPTM".
#
# DataFrame.rolling(...).apply(func) calls `func` once per column with a
# single Series, so a function that needs both the stock and the market
# column cannot be evaluated that way (and the two-column result could not be
# stored in one `betas` column).  The slope of an OLS regression with an
# intercept equals cov(stock, market) / var(market), which pandas computes
# directly on rolling windows.
for ticker in pivot_df.columns:
    if ticker == "SPTM":
        continue  # Skip the market index itself

    # Months where both the stock and the market return are observed
    pair = pivot_df[[ticker, "SPTM"]].dropna()

    if len(pair) < MIN_WINDOW:
        betas[ticker] = np.nan  # never enough observations for one window
        continue

    rolling_cov = (
        pair[ticker]
        .rolling(window=MAX_WINDOW, min_periods=MIN_WINDOW)
        .cov(pair["SPTM"])
    )
    rolling_var = (
        pair["SPTM"]
        .rolling(window=MAX_WINDOW, min_periods=MIN_WINDOW)
        .var()
    )

    # A window with zero market variance has no defined slope -> NaN.
    # Assignment aligns on the date index; dates dropped above stay NaN.
    betas[ticker] = rolling_cov / rolling_var.where(rolling_var > 0)

# Convert rolling betas from wide to long format (date, ticker, beta).
# Cast to float first: a frame created empty (index/columns only) has object
# dtype, which would carry into the 'beta' column and make the numeric
# aggregation below unreliable.
betas_long = (
    betas.astype(float)
    .reset_index()
    .melt(id_vars="date", var_name="ticker", value_name="beta")
)

# Attach the betas to the observation-level data; rows without a beta keep NaN
raw0_combined = raw0.merge(betas_long, on=["date", "ticker"], how="left")

# Per-ticker summary of the estimated betas ('count' ignores NaN)
beta_stats = raw0_combined.groupby('ticker')['beta'].agg(['mean', 'std', 'count']).round(3)
print("\nBeta Statistics by Ticker:")
print(beta_stats.head())
print(f"\nTotal number of valid beta calculations: {raw0_combined['beta'].notna().sum()}")

# Save the final dataset with betas
raw0_combined.to_csv("raw0_with_betas.csv", index=False)
print("\nFinal dataset with rolling betas saved to 'raw0_with_betas.csv'.")