In [3]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('financial_data.csv')

# Ensure 'Date' column is datetime index
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by='Date').set_index('Date')

# Handle missing values (use assignment instead of inplace)
df['closing_price'] = df['closing_price'].ffill()
df['volume'] = df['volume'].ffill()

df['closing_price'] = df['closing_price'].fillna(df['closing_price'].median())
df['volume'] = df['volume'].fillna(df['volume'].median())

# Lag features
df['1_day_return'] = df['closing_price'].pct_change(1).fillna(0)
df['7_day_return'] = df['closing_price'].pct_change(7).fillna(0)

# Log-scale volume
df['volume_log'] = np.log1p(df['volume'])

# Outlier detection using IQR
Q1 = df['closing_price'].quantile(0.25)
Q3 = df['closing_price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df['closing_price_outlier'] = (
    (df['closing_price'] < lower_bound) | 
    (df['closing_price'] > upper_bound)
)

print(df.head())


         date  closing_price  volume  1_day_return  7_day_return  volume_log  \
0  2025-01-01         149.75  5000.0      0.000000           0.0    8.517393   
1  2025-01-02         131.04  2000.0     -0.124942           0.0    7.601402   
2  2025-01-03         138.26  2000.0      0.055098           0.0    7.601402   
3  2025-01-04         164.68  2000.0      0.191089           0.0    7.601402   
4  2025-01-05         165.06  5000.0      0.002308           0.0    8.517393   

   closing_price_outlier  
0                  False  
1                  False  
2                  False  
3                  False  
4                  False  
