In [6]:
import logging
import tempfile
import pandas as pd
import os
import wandb
from sklearn.model_selection import train_test_split

In [7]:
import os
from dotenv import load_dotenv
# Load environment variables through python-dotenv before reading the key.
load_dotenv()

# W&B API key taken from the process environment; None when the variable is unset.
WANDB_API_KEY = os.getenv("WANDB_API_KEY")

In [8]:
# Log in to Weights & Biases with the API key read from the environment.
# The call is the last expression of the cell, so its return value is shown
# as the cell output.
wandb.login(key=WANDB_API_KEY)



True

In [13]:
# Notebook-wide configuration for the data segregation step.

# --- W&B artifacts ---------------------------------------------------------

# Fully qualified name (entity/project/artifact:version) of the input artifact.
artifact_input_name = (
    "hangtn13-ssc-national-economics-university/"
    "Bank-Marketing/"
    "preprocessed_data.csv:v1"
)

# Artifact type assigned to every split uploaded by this notebook.
artifact_type = "segregated_data"

# --- Train/test split ------------------------------------------------------

# Fraction of the rows held out for the test set.
test_size = 0.2

# Random seed, fixed so the split is reproducible.
seed = 41

# Name of the column whose class proportions are preserved in both splits.
stratify = "y"

In [15]:
# Configure logging: INFO level, each message prefixed with a timestamp.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(message)s",
                    datefmt='%d-%m-%Y %H:%M:%S')

# Root logger used for the progress messages below.
logger = logging.getLogger()

# Start a W&B run that tracks this data-splitting job.
run = wandb.init(project="Bank-Marketing", job_type="split_data")

# Fetch the preprocessed dataset artifact and load it into a DataFrame.
logger.info("Downloading and reading artifact")
input_artifact = run.use_artifact(artifact_input_name)
artifact_dir = input_artifact.download()
# The file name inside the artifact must match the one it was uploaded with.
artifact_path = os.path.join(artifact_dir, "preprocessed_data.csv")
df = pd.read_csv(artifact_path)
logger.info("Artifact downloaded")

# Stratified train/test split. This cell performs only the train/test split;
# no validation set is carved out here.
logger.info("Splitting data into train and test")
splits = {}

splits["train"], splits["test"] = train_test_split(df,
                                                   test_size=test_size,
                                                   random_state=seed,
                                                   stratify=df[stratify])

# Save the artifacts. We use a temporary directory so we do not leave any trace behind
with tempfile.TemporaryDirectory() as tmp_dir:

    # NOTE: the loop variable is deliberately NOT called `df`, so the full
    # dataset loaded above is still bound to `df` after this cell has run.
    for split, split_df in splits.items():

        # The artifact (and file) name is simply the split name plus ".csv"
        artifact_name = f"{split}.csv"

        # Get the path on disk within the temp directory
        temp_path = os.path.join(tmp_dir, artifact_name)

        logger.info("Uploading the %s dataset to %s", split, artifact_name)

        # Save then upload to W&B
        split_df.to_csv(temp_path, index=False)

        artifact = wandb.Artifact(
            name=artifact_name,
            type=artifact_type,
            description=f"{split} split of dataset {artifact_input_name}",
        )
        artifact.add_file(temp_path)

        logger.info("Logging artifact")
        run.log_artifact(artifact)

        # This waits for the artifact to be uploaded to W&B. If you
        # do not add this, the temp directory might be removed before
        # W&B had a chance to upload the datasets, and the upload
        # might fail
        artifact.wait()

04-05-2025 13:18:00 Downloading and reading artifact
[34m[1mwandb[0m:   1 of 1 files downloaded.  
04-05-2025 13:18:07 Artifact downloaded
04-05-2025 13:18:07 Splitting data into train and test
04-05-2025 13:18:07 Uploading the train dataset to train.csv
04-05-2025 13:18:08 Logging artifact
04-05-2025 13:18:17 Uploading the test dataset to test.csv
04-05-2025 13:18:17 Logging artifact


In [16]:
# Close the W&B run.
# Wait a while after the previous cell has run before executing this one
# (presumably to give the artifact uploads time to complete).
run.finish()

In [None]:
#!git status


On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	artifacts/
	data_segaration.ipynb
	test_data.py
	wandb/run-20250418_214853-enlr4ppk/
	wandb/run-20250418_215008-tcnywwd8/
	wandb/run-20250418_215135-pmlfx63t/
	wandb/run-20250418_215231-c6fm9c1q/
	wandb/run-20250418_215340-bddnuqm3/
	wandb/run-20250418_215434-xiuc812h/
	wandb/run-20250418_215952-1anvs4ed/
	wandb/run-20250418_220254-4j1rkppk/
	wandb/run-20250418_221930-n4z6zqla/
	wandb/run-20250418_222305-vdronr0d/

nothing added to commit but untracked files present (use "git add" to track)


In [None]:
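#!git add .
# NOTE(review): `git add .` also stages the local wandb/ run directories and the
# downloaded artifacts/ folder (they appear as untracked in the git status output
# above). Consider listing them in .gitignore and staging specific files instead.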

The file will have its original line endings in your working directory


In [None]:
#!git commit -m "Added all untracked files"

[main 31298c0] Added all untracked files
 63 files changed, 45311 insertions(+)
 create mode 100644 artifacts/raw_data-v0/bank-additional-full.csv
 create mode 100644 data_segaration.ipynb
 create mode 100644 test_data.py
 create mode 100644 wandb/run-20250418_214853-enlr4ppk/files/conda-environment.yaml
 create mode 100644 wandb/run-20250418_214853-enlr4ppk/files/config.yaml
 create mode 100644 wandb/run-20250418_214853-enlr4ppk/files/requirements.txt
 create mode 100644 wandb/run-20250418_214853-enlr4ppk/files/wandb-metadata.json
 create mode 100644 wandb/run-20250418_214853-enlr4ppk/files/wandb-summary.json
 create mode 100644 wandb/run-20250418_214853-enlr4ppk/run-enlr4ppk.wandb
 create mode 100644 wandb/run-20250418_215008-tcnywwd8/files/conda-environment.yaml
 create mode 100644 wandb/run-20250418_215008-tcnywwd8/files/config.yaml
 create mode 100644 wandb/run-20250418_215008-tcnywwd8/files/requirements.txt
 create mode 100644 wandb/run-20250418_215008-tcnywwd8/files/wandb-metada