# MLOps Data Ingestion
## Credit Risk Classification ‚Äî Analysis of the South German Credit Dataset




#Libraries

In [1]:
# =============================================================
# Import-capture robusto + listado SOLO de lo que importas t√∫
# =============================================================
import builtins, sys
from importlib import import_module
from importlib.metadata import packages_distributions, version, PackageNotFoundError

_CAPTURE_ACTIVE = False
_CAPTURED = set()
_ORIG_IMPORT = builtins.__import__

# alias √∫tiles m√≥dulo -> distribuci√≥n PyPI
_ALIAS = {
    "sklearn": "scikit-learn",
    "PIL": "Pillow",
}

# Ruido t√≠pico de IPython/Jupyter que no queremos
_IGNORE = {
    "IPython","matplotlib_inline","jupyter","jupyter_core","jupyter_client",
    "ipykernel","traitlets","prompt_toolkit","pygments","pexpect","pickleshare",
    "debugpy","parso","wcwidth","ptyprocess","backcall","decorator"
}

def _import_hook(name, globals=None, locals=None, fromlist=(), level=0):
    if _CAPTURE_ACTIVE:
        top = name.split('.')[0]
        # evita stdlib y ruido
        if hasattr(sys, "stdlib_module_names"):
            is_std = top in sys.stdlib_module_names
        else:
            # fallback muy simple para entornos sin stdlib_module_names
            is_std = top in {"sys", "os", "json", "re", "math", "time", "itertools", "functools"}
        if (not is_std) and (top not in _IGNORE):
            _CAPTURED.add(top)
    return _ORIG_IMPORT(name, globals, locals, fromlist, level)

def start_import_capture(reset=True):
    """Activa el hook de importaci√≥n. Usa reset=True para limpiar capturas previas."""
    global _CAPTURE_ACTIVE, _CAPTURED
    if reset:
        _CAPTURED = set()
    builtins.__import__ = _import_hook
    _CAPTURE_ACTIVE = True
    print("‚úÖ Import capture STARTED")

def stop_import_capture(show_requirements=True, include_deps=False):
    """Desactiva el hook y opcionalmente imprime requirements.
       include_deps=False -> solo los m√≥dulos que T√ö importaste expl√≠citamente.
       include_deps=True  -> intenta resolver distribuciones PyPI (puede a√±adir dependencias)."""
    global _CAPTURE_ACTIVE
    builtins.__import__ = _ORIG_IMPORT
    _CAPTURE_ACTIVE = False
    print("üõë Import capture STOPPED")
    if show_requirements:
        print("\n============================================")
        print("üîπ IMPORTED PACKAGES (requirements format)")
        print("============================================")
        _print_requirements(include_deps=include_deps)

def _safe_print_line(name, ver):
    if name and str(name).strip() and ver and str(ver).strip():
        print(f"{name}=={ver}")

def _print_requirements(include_deps=False):
    """Si include_deps=False: intenta __version__ del m√≥dulo (solo lo que importaste).
       Si include_deps=True: resuelve distribuci√≥n con importlib.metadata (m√°s ruidoso)."""
    mapping = packages_distributions()

    seen = set()
    for mod in sorted(_CAPTURED):
        if mod in seen:
            continue

        # 1) Solo lo importado expl√≠cito (recomendado: include_deps=False)
        if not include_deps:
            try:
                m = import_module(mod)
                ver = getattr(m, "__version__", None)
                # algunos modulos usan nombres distintos para la distribuci√≥n
                dist_name = _ALIAS.get(mod, mod)
                if ver:
                    _safe_print_line(dist_name, ver)
                    seen.add(mod)
                # si no tiene __version__, lo omitimos (evita ValueError)
            except Exception:
                pass
            continue

        # 2) Modo dependencias: resolver distribuci√≥n PyPI
        dist_names = mapping.get(mod, [])
        dist = _ALIAS.get(mod) or (dist_names[0] if dist_names else mod)

        # evitar nombres vac√≠os que causan ValueError
        if not dist or not str(dist).strip():
            continue

        try:
            ver = version(dist)
            _safe_print_line(dist, ver)
            seen.add(mod)
        except PackageNotFoundError:
            # intento final con el nombre del m√≥dulo
            try:
                ver = version(mod)
                _safe_print_line(mod, ver)
                seen.add(mod)
            except PackageNotFoundError:
                pass


In [2]:
start_import_capture()


‚úÖ Import capture STARTED


In [3]:
# --- Google Colab espec√≠ficos ---
from google.colab import userdata
from google.colab import drive

# --- Manejo de datos ---
import pandas as pd

#Dataset Preview

In [4]:
#IMPORTAMOS EL DATASET
drive.mount('/content/drive', force_remount=True)
FOLDER_ID = userdata.get('FOLDER_ID')
TARGET = f"/content/drive/.shortcut-targets-by-id/{FOLDER_ID}"
import os
os.chdir(TARGET)

Mounted at /content/drive


In [5]:
#VISUALIZAMOS EL DATASET
#Preview Dataset
df = pd.read_csv('trabajo_grupal_mlops/data/german_credit_modified.csv')
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019
laufkont,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,4.0,2.0,...,2.0,4.0,2.0,2.0,1.0,2.0,2.0,4.0,3.0,4.0
laufzeit,18.0,9.0,12.0,12.0,12.0,10.0,8.0,6.0,18.0,24.0,...,39.0,15.0,,30.0,9.0,42.0,36.0,12.0,18.0,36.0
moral,4.0,4.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,...,3.0,4.0,2.0,3.0,2.0,1.0,2.0,3.0,2.0,2.0
verw,2.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,3.0,,...,6.0,2.0,3.0,3.0,3.0,1.0,0.0,5.0,2.0,2.0
hoehe,1049.0,2799.0,841.0,2122.0,2171.0,2241.0,3398.0,1361.0,1098.0,3758.0,...,11760.0,1520.0,766.0,1919.0,1364.0,9283.0,14318.0,1555.0,3049.0,10974.0
sparkont,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,...,2.0,5.0,3.0,2.0,1.0,1.0,1.0,4.0,1.0,1.0
beszeit,2.0,3.0,4.0,3.0,3.0,2.0,4.0,2.0,1.0,1.0,...,4.0,5.0,3.0,2.0,4.0,1.0,5.0,5.0,2.0,1.0
rate,4.0,2.0,2.0,3.0,4.0,1.0,1.0,2.0,4.0,1.0,...,2.0,4.0,4.0,4.0,3.0,1.0,4.0,4.0,1.0,4.0
famges,2.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,...,3.0,3.0,3.0,3.0,3.0,invalid,3.0,3.0,2.0,2.0
buerge,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Save Clean DataFrame as Parquet
df.to_parquet('trabajo_grupal_mlops/data/01_df_data_preparation_01.parquet', index=False)

print("DataFrame saved successfully'")


DataFrame saved successfully'


In [7]:
stop_import_capture()


üõë Import capture STOPPED

üîπ IMPORTED PACKAGES (requirements format)
bottleneck==1.4.2
cloudpickle==3.1.1
dateutil==2.9.0.post0
numexpr==2.14.1
numpy==2.0.2
pandas==2.2.2
pyarrow==18.1.0
pytz==2025.2
zmq==26.2.1


In [8]:
# =============================================================
# System & Environment Information for Google Colab
# =============================================================
import sys
import subprocess

print("============================================")
print("üîπ PYTHON VERSION")
print("============================================")
!python --version

print("\n============================================")
print("üîπ PIP VERSION")
print("============================================")
!pip --version

üîπ PYTHON VERSION
Python 3.12.12

üîπ PIP VERSION
pip 24.1.2 from /usr/local/lib/python3.12/dist-packages/pip (python 3.12)
