In [None]:
# %pip install python-dotenv==1.0.1  # the package that provides the `dotenv` module imported below

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
import wandb
import os
from dotenv import load_dotenv
# Read key/value pairs from a local .env file into the process environment.
load_dotenv()

# Fetch the W&B key from the environment (None when the variable is unset)
# and authenticate; the value returned by login() is the cell's output.
WANDB_API_KEY = os.getenv("WANDB_API_KEY")
wandb.login(key=WANDB_API_KEY)

[34m[1mwandb[0m: Currently logged in as: [33mhangtn13-ssc-national-economics-university[0m (use `wandb login --relogin` to force relogin)


True

In [12]:
# Open a W&B run for the data-upload step. Using the run as a context manager
# guarantees it is finished even if adding or logging the file raises, so a
# failed upload does not leave a dangling active run in the kernel.
with wandb.init(project="Bank-Marketing", job_type="data") as run:
    # Register the raw CSV as a versioned artifact of type "dataset".
    artifact = wandb.Artifact("raw_data", type="dataset")
    artifact.add_file("bank-additional-full.csv")
    run.log_artifact(artifact)




VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [22]:
%%file test_data.py
import pytest
import wandb
import pandas as pd

# Created at module level, i.e. when pytest imports this file during collection,
# so that all tests in this file are collected under the same W&B run.
run = wandb.init(project="Bank-Marketing", job_type="data_checks")

@pytest.fixture(scope="session")
def data():
    """Fetch the latest ``raw_data`` artifact and load it as a DataFrame.

    The fixture is session-scoped, so the artifact is resolved and parsed once
    and the resulting frame is shared by every test that requests it.

    Returns:
        pandas.DataFrame: the artifact file parsed as a ';'-delimited CSV.
    """
    artifact = run.use_artifact("Bank-Marketing/raw_data:latest")
    csv_path = artifact.file()
    return pd.read_csv(csv_path, delimiter=";")

def test_data_length(data):
    """
    We test that we have enough data to continue
    """
    assert len(data) > 1000

def test_number_of_columns(data):
    """
    We test that we have the correct number of columns
    """
    assert data.shape[1] == 21, f"Expected 21 columns, but got {data.shape[1]}"

def test_column_presence_and_type(data):
    required_columns = {
        "age": pd.api.types.is_int64_dtype,
        "job": pd.api.types.is_object_dtype,
        "marital": pd.api.types.is_object_dtype,
        "education": pd.api.types.is_object_dtype,
        "default": pd.api.types.is_object_dtype,
        "housing": pd.api.types.is_object_dtype,
        "loan": pd.api.types.is_object_dtype,
        "contact": pd.api.types.is_object_dtype,
        "month": pd.api.types.is_object_dtype,
        "day_of_week": pd.api.types.is_object_dtype,
        "duration": pd.api.types.is_int64_dtype,
        "campaign": pd.api.types.is_int64_dtype,
        "pdays": pd.api.types.is_int64_dtype,
        "previous": pd.api.types.is_int64_dtype,
        "poutcome": pd.api.types.is_object_dtype,
        "emp.var.rate": pd.api.types.is_float_dtype,
        "cons.price.idx": pd.api.types.is_float_dtype,
        "cons.conf.idx": pd.api.types.is_float_dtype,
        "euribor3m": pd.api.types.is_float_dtype,
        "nr.employed": pd.api.types.is_float_dtype,
        "y": pd.api.types.is_object_dtype
    }

    # Check column presence
    assert set(data.columns.values).issuperset(set(required_columns.keys()))

    for col_name, format_verification_funct in required_columns.items():
        assert format_verification_funct(data[col_name]), f"Column {col_name} failed test {format_verification_funct}"

def test_class_names(data):
    # Check that only the known classes are present for binary columns
    known_classes = ["yes", "no"]

    assert data["default"].isin(known_classes).all()
    assert data["housing"].isin(known_classes).all()
    assert data["loan"].isin(known_classes).all()
    assert data["y"].isin(known_classes).all()

def test_categorical_values(data):
    # Check that categorical columns only contain allowed values
    job_categories = ['admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown']
    marital_status = ['unknown','married', 'divorced', 'single']
    education_levels = ['basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown']
    contact_types = ['telephone', 'cellular']
    month_values = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    poutcome_values = ['nonexistent', 'failure', 'success']
    day_of_week_values = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

    assert data["job"].isin(job_categories).all()
    assert data["marital"].isin(marital_status).all()
    assert data["education"].isin(education_levels).all()
    assert data["contact"].isin(contact_types).all()
    assert data["month"].isin(month_values).all()
    assert data["poutcome"].isin(poutcome_values).all()
    assert data["day_of_week"].isin(day_of_week_values).all()

def test_column_ranges(data):
    # Add ranges for columns
    ranges = {
        "age": (0, 100),
        "duration": (0, 5000),  # Assuming the duration can be up to 5000 seconds
        "campaign": (0, 100),
        "pdays": (-1, 1000),  # -1 means client was not contacted previously
        "previous": (0, 100),
        "emp.var.rate": (-10, 10),  # Giả sử tỷ lệ biến động trong phạm vi này
        "cons.price.idx": (90, 100),  # Giả sử CPI dao động trong phạm vi này
        "cons.conf.idx": (-50, 50),  # Chỉ số niềm tin tiêu dùng thường trong phạm vi này
        "euribor3m": (0, 6),  # Lãi suất giữa các ngân hàng thường trong phạm vi này
        "nr.employed": (0, 10000),  # Số người có việc làm 
    }

    for col_name, (minimum, maximum) in ranges.items():
        assert data[col_name].dropna().between(minimum, maximum).all(), (
            f"Column {col_name} failed the test. Should be between {minimum} and {maximum}, "
            f"instead min={data[col_name].min()} and max={data[col_name].max()}"
        )

Overwriting test_data.py


In [23]:
# Run the data checks with pytest through a shell escape (very verbose output).
!pytest . -vv

platform win32 -- Python 3.8.20, pytest-6.2.5, py-1.11.0, pluggy-1.5.0 -- C:\Users\dell\miniconda3\envs\bankmkt\python.exe
cachedir: .pytest_cache
rootdir: c:\Users\dell\Downloads\Mlops_Project_K64
plugins: anyio-3.7.1
[1mcollecting ... [0mcollected 6 items

test_data.py::test_data_length [32mPASSED[0m[32m                                    [ 16%][0m
test_data.py::test_number_of_columns [32mPASSED[0m[32m                              [ 33%][0m
test_data.py::test_column_presence_and_type [32mPASSED[0m[32m                       [ 50%][0m
test_data.py::test_class_names [31mFAILED[0m[31m                                    [ 66%][0m
test_data.py::test_categorical_values [32mPASSED[0m[31m                             [ 83%][0m
test_data.py::test_column_ranges [31mFAILED[0m[31m                                  [100%][0m

[31m[1m______________________________ test_class_names _______________________________[0m

data =        age          job  marital  ... euribor3m n

In [25]:
# Close the run.
# Wait a moment after the previous cell has finished before executing this one.
# NOTE(review): `run` here is the notebook-kernel variable; the run created in
# test_data.py lives in the separate process started by `!pytest`, so this call
# presumably does not affect that run — confirm which run is meant to be closed.
run.finish()