In [187]:
import json
import pandas as pd
from google.cloud import bigquery


def load_config(file_path):
    """Load configuration from a JSON file.

    Args:
        file_path: Path to the JSON configuration file.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces for the
        file's top-level value).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; pin the encoding so that non-ASCII
    # values decode the same way regardless of the platform's locale default.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)


def get_bigquery_data(client, project_id, dataset_name, table_name):
    """Read every row of one BigQuery table into a pandas DataFrame.

    Args:
        client: Object exposing ``query(sql)``; the returned job must provide
            ``result()`` whose value provides ``to_dataframe()``.
        project_id: Project part of the table identifier.
        dataset_name: Dataset part of the table identifier.
        table_name: Table part of the table identifier.

    Returns:
        Whatever ``to_dataframe()`` yields for the ``SELECT *`` result.
    """
    # Fully qualified identifier: project.dataset.table
    table_id = f"{project_id}.{dataset_name}.{table_name}"
    sql = f"SELECT * FROM `{table_id}`"

    # Submitting the query is the API request; result() waits for its rows.
    rows = client.query(sql).result()
    return rows.to_dataframe()


# Read the project settings from the JSON config file
config = load_config("configs/config.json")
PROJECT_ID = config["PROJECT_ID"]
DATASET_NAME = config["DATASET_NAME"]

# One BigQuery client is shared by both table reads
client = bigquery.Client()

# "prediction" supplies the current data, "raw_data" the reference data
result_cur = get_bigquery_data(client, PROJECT_ID, DATASET_NAME, "prediction")
result_ref_df = get_bigquery_data(client, PROJECT_ID, DATASET_NAME, "raw_data")

# Every `input_data` entry is a JSON string holding "data" (the rows) and
# "columns" (the header); decode each one into its own DataFrame.
dataframes = [
    pd.DataFrame(data=payload["data"], columns=payload["columns"])
    for payload in map(json.loads, result_cur["input_data"])
]

# Stack the per-row frames into a single frame with a fresh 0..n-1 index
result_cur_df = pd.concat(dataframes, ignore_index=True)

In [188]:
from LocalRun.src.extract_data import extract_data
# Run both frames through the project's `extract_data` helper and rebind the
# same names to the results.
# NOTE(review): because the inputs are overwritten, re-running this cell feeds
# already-processed frames back into `extract_data` — confirm it is safe to
# apply twice, or re-run the loading cell first.
result_cur_df = extract_data(result_cur_df)
result_ref_df = extract_data(result_ref_df)

In [189]:
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# Build a report whose only metric source is the data-drift preset, then
# evaluate the current frame against the reference frame.
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(current_data=result_cur_df, reference_data=result_ref_df)

# Dictionary form of the report results
report_dict = data_drift_report.as_dict()

In [None]:
# Display the dictionary obtained from `data_drift_report.as_dict()`.
report_dict

In [None]:
# The first metric entry holds the dataset-level drift fields read below.
dataset_level_result = report_dict['metrics'][0]['result']

# NOTE(review): in Evidently's dataset drift result, `drift_share` looks like
# the configured threshold while `share_of_drifted_columns` is the observed
# share — confirm which value is meant to be reported here.
drift_share = dataset_level_result['drift_share']
print("Drift Share:", drift_share)

# Overall flag: was drift detected for the dataset as a whole?
dataset_drift = dataset_level_result['dataset_drift']
print("Dataset Drift Detected:", dataset_drift)

In [None]:
# The second metric entry holds a per-column mapping; pick out 'Trip Miles'.
per_column_drift = report_dict['metrics'][1]['result']['drift_by_columns']
trip_miles_drift = per_column_drift['Trip Miles']

# Pull the fields of interest for that column
column_name = trip_miles_drift['column_name']
drift_detected = trip_miles_drift['drift_detected']
drift_score = trip_miles_drift['drift_score']

# Report them, one field per line
print("Column Name:", column_name)
print("Drift Detected:", drift_detected)
print("Drift Score:", drift_score)

In [None]:
# Fetch the 'small_distribution' entry from the current and the reference side
# of the 'Trip Miles' drift record (current first, then reference).
current_distribution, reference_distribution = (
    trip_miles_drift[side]['small_distribution']
    for side in ('current', 'reference')
)

# Show the x / y series of each distribution
print("Current Distribution X:", current_distribution['x'])
print("Current Distribution Y:", current_distribution['y'])
print("Reference Distribution X:", reference_distribution['x'])
print("Reference Distribution Y:", reference_distribution['y'])

In [None]:
# Walk the per-column drift results: print the status of every column, and
# collect an alert message only for the columns where drift was detected.
drifted_columns = report_dict['metrics'][1]['result']['drift_by_columns']
message = []
for column, details in drifted_columns.items():
    print(f"Column: {column}")
    print(f"  Drift Detected: {details['drift_detected']}")
    print(f"  Drift Score: {details['drift_score']}")
    # The message text asserts that drift was detected, so it must not be
    # emitted for columns whose `drift_detected` flag is False.
    if details['drift_detected']:
        message.append(f"Drift Detected for '{column}' with Drift Score: '{details['drift_score']}'")

print(message)

In [None]:
# Look up the 'Company' entry in the per-column drift mapping
column_drift_results = report_dict['metrics'][1]['result']['drift_by_columns']
company_drift = column_drift_results['Company']

# Detection flag and score for that column
company_drift_detected = company_drift['drift_detected']
company_drift_score = company_drift['drift_score']

print("Company Drift Detected:", company_drift_detected)
print("Company Drift Score:", company_drift_score)

In [None]:
# Export the drift report through `as_dataframe()` and keep the result in `metri`.
# NOTE(review): nothing in this cell displays `metri`, and the name looks like
# a truncated "metrics" — confirm the intended name and whether it should be shown.
metri = data_drift_report.as_dataframe()