In [1]:
# %pip install pandas
# %pip install pandas-profiling
# %pip install --upgrade pandas-profiling pydantic-settings
# %pip install prettytable
# %pip install pydantic

In [2]:
from prettytable import PrettyTable
import pandas as pd
import random
import os

In [3]:
# Every dataset lives under the same data-lake folder
datasets_dir = os.path.join('data_lake', 'datasets')

# Raw dataset: semicolon-separated CSV
raw_data_path = os.path.join(datasets_dir, 'raw_data', 'challenge_MLE.csv')
raw_df = pd.read_csv(raw_data_path, sep=';')

# Cleaned dataset: the first CSV column is read as the index
clean_data_path = os.path.join(datasets_dir, 'cleaned_data', 'cleaned_data.csv')
cleaned_df = pd.read_csv(clean_data_path, index_col=0)

# ML dataset: the first CSV column is read as the index
ml_data_path = os.path.join(datasets_dir, 'ml_data', 'ml_data.csv')
ml_df = pd.read_csv(ml_data_path, index_col=0)

In [4]:
def print_table_schema(df: pd.DataFrame, title=""):
    """
    Display a per-column schema summary of ``df`` as a PrettyTable.

    One row is added per column, holding the column name, its dtype, the
    percentage of null values (rounded to one decimal) and one randomly
    chosen non-null example value.

    Edge cases:
        - A column without any non-null value shows "N/A" as its example
          (``random.choice`` would raise ``IndexError`` on an empty sequence).
        - A frame without rows reports NaN as the null percentage instead of
          dividing by zero.

    :param df: DataFrame to describe.
    :param title: Title shown above the table.
    """
    df_schema = PrettyTable(
        ["Column Name", "Dtype", "Null Values [%]", "Example"], title=title
    )
    n_rows = df.shape[0]

    for col in df.columns:
        column = df[col]
        dtype = df.dtypes[col]

        if n_rows:
            null_values = round(100*column.isna().sum()/n_rows, 1)
        else:
            null_values = float("nan")

        # Only the column itself is needed to find an example value, so the
        # whole frame is not filtered/copied once per column.
        candidates = column.dropna().unique()
        example = random.choice(candidates) if len(candidates) > 0 else "N/A"

        df_schema.add_row([col, dtype, null_values, example])

    display(df_schema)
    print('\n\n')


# print_table_schema(raw_df.copy(), title="Raw Data Schema")
# print_table_schema(cleaned_df.copy(), title="Cleaned Data Schema")
# print_table_schema(ml_df.copy(), title="ML Data Schema")

In [5]:
def filter_data(
    df: pd.DataFrame,
    course_name: str = None,
    user_uuids: list = None,
    course_uuids: list = None,
    particion: int = None,
    pick_random: bool = False
):
    """
    Narrow ``df`` down by course name, user, course uuid and partition.

    Filters are applied in that order, each one on the result of the previous
    one. A filter whose value is ``None`` is skipped.

    When ``pick_random`` is True every filter value is replaced by a value
    drawn at random from the rows that are still left (explicitly passed
    values are overridden). The random partition is drawn from the partitions
    greater than 10 when there are any, otherwise from all remaining
    partitions. If nothing is left to draw from, that filter is skipped.

    :param df: DataFrame with the columns 'course_name', 'user_uuid',
        'course_uuid' and 'particion' (each one is only read when its filter
        is active).
    :param course_name: Keep rows whose 'course_name' equals this value.
    :param user_uuids: Keep rows whose 'user_uuid' is in this list.
    :param course_uuids: Keep rows whose 'course_uuid' is in this list.
    :param particion: Keep rows whose 'particion' is <= this value.
    :param pick_random: Draw every filter value at random (see above).
    :return: The filtered DataFrame.
    """
    # Lower bound preferred for randomly drawn partitions.
    # NOTE(review): presumably there to keep enough rows in the sample - confirm.
    min_random_particion = 10

    def pick_one(values):
        # Random element of `values`, or None when there is nothing to pick.
        values = list(values)
        return random.choice(values) if values else None

    # Filter based on course_name
    if pick_random:
        course_name = pick_one(df['course_name'].dropna().unique())

    if course_name is not None:
        df = df.loc[df['course_name'] == course_name]

    # Filter based on user_uuid
    if pick_random:
        # Wrap the value in a list: list(<str>) would split the uuid into
        # single characters and `isin` would then match nothing.
        picked_user = pick_one(df['user_uuid'].dropna().unique())
        user_uuids = None if picked_user is None else [picked_user]

    if user_uuids is not None:
        df = df.loc[df['user_uuid'].isin(user_uuids)]

    # Filter based on course_uuids
    if pick_random:
        picked_course = pick_one(df['course_uuid'].dropna().unique())
        course_uuids = None if picked_course is None else [picked_course]

    if course_uuids is not None:
        df = df.loc[df['course_uuid'].isin(course_uuids)]

    # Filter based on partition
    if pick_random:
        available = df['particion'].dropna().unique().tolist()
        preferred = [p for p in available if p > min_random_particion]
        # Fall back to every available partition when none is above the bound
        particion = pick_one(preferred or available)

    if particion is not None:
        df = df.loc[df['particion'] <= particion]

    return df

def print_table(
    df: pd.DataFrame,
    title="",
    course_name: str = None,
    user_uuids: list = None,
    course_uuids: list = None,
    particion: int = None,
    pick_random: bool = False
):
    """
    Filter ``df`` with ``filter_data`` and display the result as a PrettyTable.

    Every row that survives the filters becomes one table row; the table
    header is the list of column names of the filtered frame.

    :param df: DataFrame to filter and show.
    :param title: Title shown above the table.
    :param course_name: Forwarded to ``filter_data``.
    :param user_uuids: Forwarded to ``filter_data``.
    :param course_uuids: Forwarded to ``filter_data``.
    :param particion: Forwarded to ``filter_data``.
    :param pick_random: Forwarded to ``filter_data``.
    """
    # Collect the filter arguments once and forward them unchanged
    filters = dict(
        course_name=course_name,
        user_uuids=user_uuids,
        course_uuids=course_uuids,
        particion=particion,
        pick_random=pick_random,
    )
    selection = filter_data(df=df, **filters)

    table = PrettyTable(selection.columns.tolist(), title=title)
    for _, record in selection.iterrows():
        table.add_row(record.values.tolist())

    display(table)
    print('\n\n')
    

# Example filter values: user_uuids and course_uuids are the arguments passed
# by the commented-out print_table calls below.
user_uuids=['13df535e-065c-4593-98ea-5b1e29015b7d']
course_uuids=['09614210-fce2-48bc-93e3-bc4bd441fe00']
# Not passed by the commented-out calls below; when given to filter_data it
# keeps the rows whose 'particion' is <= this value.
particion=44
        
# print_table(
#     raw_df.copy(), 
#     title="Raw Data Example",
#     user_uuids=user_uuids,
#     course_uuids=course_uuids
# )
# print_table(
#     cleaned_df.copy(), 
#     title="Cleaned Data Example",
#     user_uuids=user_uuids,
#     course_uuids=course_uuids
# )
# print_table(
#     ml_df.copy(), 
#     title="ML Data Example",
#     user_uuids=user_uuids,
#     course_uuids=course_uuids
# )

In [8]:
%pip install -e .

Obtaining file:///Users/simongarciamorillo/Library/CloudStorage/OneDrive-Personal/Documents/EdMachina/ed-ml
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: ed-ml
  Attempting uninstall: ed-ml
    Found existing installation: ed-ml 1.0.0
    Uninstalling ed-ml-1.0.0:
      Successfully uninstalled ed-ml-1.0.0
  Running setup.py develop for ed-ml
Successfully installed ed-ml-1.0.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
from config.params import Params
from ed_ml.modeling.model_registry import ModelRegistry
from ed_ml.pipeline.pipeline import MLPipeline
from ed_ml.data_processing.feature_engineering import FeatureEngineer
from tqdm import tqdm
import shap
import numpy as np

Initializing Params.

Unable to load base_path.
Exception: /Users/simongarciamorillo/Library/CloudStorage/OneDrive-Personal/Documents/BetterTradeGroup




FileNotFoundError: [Errno 2] No such file or directory: 'config/config.yml'

In [None]:
# NOTE(review): this cell needs the names imported in the previous cell
# (Params, FeatureEngineer, MLPipeline, ModelRegistry, shap, np); the recorded
# output of that cell shows a FileNotFoundError for 'config/config.yml'.

# Instantiate FeatureEngineer
# presumably load_dataset=True makes it load the stored ML dataset into FE.df
# - TODO confirm against FeatureEngineer
FE = FeatureEngineer(load_dataset=True)

# Instantiate MLPipeline
pipeline = MLPipeline()

# Prepare ML datasets from a copy of FE.df, using the configured train/test ratio
pipeline.prepare_datasets(
    ml_df=FE.df.copy(),
    train_test_ratio=Params.train_test_ratio
)

# Instantiate ModelRegistry
model_registry = ModelRegistry(
    load_from_local_registry=Params.local_registry
)

# Explain the first entry of dev_models
# NOTE(review): how dev_models is ordered is not visible here - confirm that
# index 0 is the model that should be explained
model = model_registry.dev_models[0]

# assumes model.model is a fitted tree-based estimator supported by
# shap.TreeExplainer - TODO confirm
explainer = shap.TreeExplainer(model.model)

# Calculate shap values on pipeline.X_test and store them on the model object
model.shap_values: np.ndarray = explainer.shap_values(pipeline.X_test)