# Modin: API compatibile con pandas e benchmark leggeri

Installazione: `pip install -U "modin[ray]" ray pandas numpy` (le virgolette evitano che shell come zsh interpretino le parentesi quadre)

In [None]:

import pandas as pd
import modin.pandas as mpd
import time, numpy as np, os, ray

# Example: run the same aggregation on a small CSV, first with pandas and
# then with Modin, and print both results.
def _mean_salary_by_level(frame):
    """Return the rounded mean of "Salary" for each "Level" in ``frame``."""
    subset = frame[["Level", "Salary"]]
    return subset.groupby("Level").mean().round()


p_df = pd.read_csv("../data/employees.csv")
print("Pandas groupby mean:")
print(_mean_salary_by_level(p_df))

m_df = mpd.read_csv("../data/employees.csv")
print("\nModin groupby mean:")
print(_mean_salary_by_level(m_df))


### Benchmark rapido lettura+groupby su CSV generato (ridimensionabile)

In [None]:

import numpy as np, pandas as pd, modin.pandas as mpd, time, ray, os
# Benchmark: time "read CSV + groupby mean" with pandas and with Modin on Ray.
#
# The CSV is generated only when it does not exist yet.
# NOTE(review): if `rows` is changed while an old file is still on disk, the
# old file is reused as-is; delete it manually to benchmark the new size.
rows = 2_000_000  # increase if you have enough RAM
fn = "large_dataset_demo.csv"
if not os.path.exists(fn):
    np.random.seed(42)  # fixed seed so the generated data is reproducible
    df = pd.DataFrame({
        "id": np.arange(1, rows + 1),
        "level": np.random.choice(list("ABC"), size=rows),
        "salary": np.random.randint(30_000, 120_000, size=rows),
    })
    df.to_csv(fn, index=False)

# pandas run. perf_counter() is a monotonic high-resolution clock meant for
# measuring short durations; time.time() can jump if the system clock changes.
t0 = time.perf_counter()
dfp = pd.read_csv(fn)
r1 = dfp.groupby("level")["salary"].mean()
t1 = time.perf_counter()
print(f"Pandas: {t1-t0:.2f}s ->\n{r1}")

# Start Ray before the timed section so its start-up cost is not measured.
if not ray.is_initialized():
    ray.init(ignore_reinit_error=True)

# Imported here so this cell stays self-contained.
from modin.utils import execute

# Modin run. Modin on Ray schedules work asynchronously, so the calls below
# can return before the computation has finished. execute() waits for the
# result to be materialized; without it the measured time may exclude the
# actual read/groupby work.
t2 = time.perf_counter()
dfm = mpd.read_csv(fn)
r2 = dfm.groupby("level")["salary"].mean()
execute(r2)
t3 = time.perf_counter()
print(f"Modin(Ray): {t3-t2:.2f}s ->\n{r2}")


### Mini preprocessing distribuito (esempio)

In [None]:

import modin.pandas as mpd
# Load the metadata table through Modin, then add a "label_normalized" column
# holding each "label" value divided by the column's maximum.
meta = mpd.read_csv("../data/fashion_metadata.csv")
labels = meta["label"]
label_max = labels.max()
meta["label_normalized"] = labels / label_max
print(meta)
