In [None]:
!pip install evidently

In [None]:
import pandas as pd
import numpy as np

In [None]:
import yfinance as yf

# Fetch GOOG price history for the requested date window, then move the
# index into an ordinary column so later cells can address it as data['Date'].
goog_history = yf.download(
    "GOOG",
    start="2011-01-01",
    end="2024-12-31",
)
data = goog_history.reset_index()

In [None]:
# When the download comes back with MultiIndex column labels, keep only the
# field names by discarding the 'Ticker' level; a flat index is left untouched.
flat_columns = data.columns
if isinstance(flat_columns, pd.MultiIndex):
    flat_columns = flat_columns.droplevel('Ticker')

# Clear the name attached to the column axis in either case.
data.columns = flat_columns.rename(None)


data.head()

In [None]:
import random

# Dedicated, seeded generator so the injected nulls are identical on every
# Restart & Run All and the global `random` state is left alone.
NULL_SEED = 42
MAX_NULLS_PER_BATCH = 5
null_rng = random.Random(NULL_SEED)

# Group and split into yearly batches. Each group is copied so the edits below
# act on an independent frame rather than on a slice produced by groupby.
grouped = data.groupby(data['Date'].dt.year)
batch_data = [group.copy() for _, group in grouped]

for batch in batch_data:
    # Running VWAP (Volume Weighted Average Price), accumulated from the first
    # row of the batch, i.e. it restarts every year.
    batch['vwap'] = (batch['Close'] * batch['Volume']).cumsum() / batch['Volume'].cumsum()
    # In place on purpose: rebinding `batch` would not update the list entry.
    # Rows are renumbered 0..n-1 so the positions sampled below are valid labels.
    batch.reset_index(drop=True, inplace=True)

    # Pick one column (never 'vwap' or 'Date') to corrupt in this batch
    random_column = null_rng.choice([col for col in batch.columns if col not in ('vwap', 'Date')])

    # Number of values to nullify: between 0 and MAX_NULLS_PER_BATCH inclusive,
    # clamped so a very short batch cannot make the sampling below fail.
    num_nulls = min(null_rng.randint(0, MAX_NULLS_PER_BATCH), len(batch))

    # Distinct row labels to blank out (an empty list when num_nulls is 0)
    null_indices = null_rng.sample(range(len(batch)), num_nulls)

    if null_indices:
        # Cast first so NaN can be stored even if the chosen column is
        # integer-typed, instead of depending on implicit upcasting.
        # NOTE(review): assumes every candidate column is numeric -- confirm.
        batch[random_column] = batch[random_column].astype(float)
        batch.loc[null_indices, random_column] = np.nan

In [None]:
from evidently.metric_preset import DataQualityPreset
from evidently.report import Report

# Stack the yearly batches back into one frame with a fresh 0..n-1 index.
df = pd.concat(batch_data, axis=0, ignore_index=True)

# Data-quality report over the whole frame; no reference dataset is supplied.
quality_metrics = [DataQualityPreset()]
data_quality_report = Report(metrics=quality_metrics)

data_quality_report.run(current_data=df, reference_data=None)
data_quality_report.show(mode='inline')

In [None]:
from sklearn.preprocessing import StandardScaler

# Provided preprocessing function
def preprocessing(df):
    """Clean, standardise and label a price frame in place, then split it into X / y.

    The frame passed in is modified: rows containing NaN are removed, the index
    is renumbered from 0, the numeric columns are standardised and a 'target'
    column is added. The mutation is kept deliberately so that the caller's
    frame ends up carrying 'target'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Date', 'Open', 'High', 'Low', 'Close',
        'Volume' and 'vwap'.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the processed frame except 'Date' and 'target'.
    y : pandas.Series
        Integer labels: 1 where 'Close' is higher than in the previous
        remaining row, otherwise 0 (the first row is always 0).

    Raises
    ------
    ValueError
        If no rows remain after dropping rows with missing values.
    """
    df.dropna(inplace=True)
    # Renumber the surviving rows. Without this, a dropped first row leaves no
    # label 0, and any later label-based write to row 0 would append a new,
    # all-NaN row instead of touching the first observation.
    df.reset_index(drop=True, inplace=True)

    if df.empty:
        raise ValueError("preprocessing() has no rows left after dropping missing values")

    cols_to_normalize = ['Open', 'High', 'Low', 'Close', 'Volume', 'vwap']
    # The scaler is fitted on this frame only, so each call standardises
    # against its own statistics.
    scaler = StandardScaler()
    df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])

    # The first row has no predecessor: diff() yields NaN there, the comparison
    # is False, and the label is therefore 0 without any special-casing.
    df['target'] = (df['Close'].diff() > 0).astype(int)

    X = df.drop(columns=['Date', 'target'])
    y = df['target'].astype(int)

    return X, y

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from evidently.metric_preset import DataDriftPreset, DataQualityPreset, TargetDriftPreset
from evidently.report import Report

# One report per evaluated year, appended in iteration order
drift_reports = []
quality_reports = []
target_drift_reports = []

# Start at index 3 so the reference slice batch_data[i-3:i-1] is always populated.
for i in range(3, len(batch_data)):
    year = batch_data[i]['Date'].dt.year.iloc[0]
    print(f"Processing Year: {year}...")

    # Reference window: the two yearly batches at positions i-3 and i-2.
    # NOTE(review): the batch at i-1 (the year right before the current one) is
    # not part of this slice -- confirm the one-year gap is intentional.
    reference_data = pd.concat(batch_data[i-3:i-1]).reset_index(drop=True)
    current_data = batch_data[i].reset_index(drop=True)  # Current year as test set

    # preprocessing() edits its argument in place (drops NaN rows, standardises
    # each frame separately, adds 'target'). Snapshot the untouched frames
    # first so the quality and drift reports describe the data as it arrived;
    # otherwise missing values are already gone and per-frame standardisation
    # hides shifts in the scaled columns.
    reference_raw = reference_data.copy()
    current_raw = current_data.copy()

    # Preprocess reference and current data (mutates both frames)
    ref_X, ref_y = preprocessing(reference_data)
    curr_X, curr_y = preprocessing(current_data)

    # Train on the reference window, evaluate on the current year
    model = LogisticRegression()
    model.fit(ref_X, ref_y)

    y_pred = model.predict(curr_X)
    accuracy = accuracy_score(curr_y, y_pred)

    print(f"Accuracy for Year {year}: {accuracy:.4f}")

    # Data quality of the current year, measured on the raw snapshot
    quality_report = Report(metrics=[DataQualityPreset()])
    quality_report.run(reference_data=None, current_data=current_raw)
    quality_reports.append(quality_report)

    # Feature drift between the raw reference window and the raw current year
    drift_report = Report(metrics=[DataDriftPreset()])
    drift_report.run(reference_data=reference_raw, current_data=current_raw)
    drift_reports.append(drift_report)

    # Target drift uses the preprocessed frames: they are the only ones that
    # carry the 'target' column added by preprocessing().
    target_drift_report = Report(metrics=[TargetDriftPreset()])
    target_drift_report.run(reference_data=reference_data, current_data=current_data)
    target_drift_reports.append(target_drift_report)