In [1]:
from google.colab import drive
import os

# 1. Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# 2. Create the MLops project folder under MyDrive.
#    exist_ok=True makes this a no-op when the folder is already there.
mlops_path = '/content/drive/MyDrive/MLops'
os.makedirs(mlops_path, exist_ok=True)

# Message reads: "Created or confirmed existing directory: <path>".
print(f"已创建或确认存在目录：{mlops_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
已创建或确认存在目录：/content/drive/MyDrive/MLops


In [None]:
# Install a pinned, mutually compatible stack: tensorflow-privacy 0.9.0 together
# with tensorflow-probability 0.22.1 and tensorflow 2.15.0 (tensorflow-estimator,
# keras and numpy are pinned alongside them).
!pip install --upgrade tensorflow==2.15.0 tensorflow-estimator==2.15.0 keras==2.15.0 tensorflow-privacy==0.9.0 tensorflow-probability==0.22.1 numpy==1.26.4

# IMPORTANT: This will automatically restart the Colab runtime after installation.
# The "Session crashed" message is expected and necessary.
import os
print("\n✅ Final installation of all specific versions complete. The runtime will now restart.")
# Send signal 9 (SIGKILL on Linux) to this very process; nothing placed after
# this line in the cell will run.
os.kill(os.getpid(), 9)



In [1]:

from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy

In [2]:
# 1. 切到项目根
%cd /content/drive/MyDrive/MLops

# 2. 初始化 Git & 安装 Git LFS
!git init -q
!apt-get update -y -qq && apt-get install git-lfs -y -qq
!git lfs install

# 3. 初始化 DVC（无 SCM 模式，用本地文件夹当 remote）
!pip install -q dvc
!dvc init --no-scm
!mkdir -p dvc_store
!dvc remote add -d local_remote /content/drive/MyDrive/MLops/dvc_store

# 4. v1：版本管理原始数据（Git LFS + DVC）
!mkdir -p data/raw data/clean models metrics
!mv athletes.csv data/raw/athletes.csv
!git lfs track "data/raw/athletes.csv"
!git add .gitattributes
!dvc add data/raw/athletes.csv
!git add data/raw/athletes.csv.dvc
!git commit -m "v1: track raw athletes.csv with Git LFS + DVC"

# 5. v2：清洗并版本管理清洗后数据（DVC）
!python clean.py data/raw/athletes.csv data/clean/athletes_clean.csv
!dvc add data/clean/athletes_clean.csv
!git add data/clean/athletes_clean.csv.dvc
!git commit -m "v2: track cleaned athletes.csv with DVC"

# 6. 拆分：v1/v2 版本下各自拆分 train/test
!python split.py data/raw/athletes.csv  data/raw/train.csv  data/raw/test.csv
!dvc add data/raw/train.csv data/raw/test.csv
!git add data/raw/train.csv.dvc data/raw/test.csv.dvc
!git commit -m "v1: split raw into train/test"

!python split.py data/clean/athletes_clean.csv data/clean/train.csv data/clean/test.csv
!dvc add data/clean/train.csv data/clean/test.csv
!git add data/clean/train.csv.dvc data/clean/test.csv.dvc
!git commit -m "v2: split clean into train/test"

# 7. 推送 DVC 存储
!dvc push

# 8. EDA、基线&DP模型
!python eda.py  data/raw/athletes.csv    v1
!python train.py data/raw/train.csv  data/raw/test.csv  models/rf_v1.pkl metrics/metrics_v1.json

!python eda.py  data/clean/athletes_clean.csv v2
!python train.py data/clean/train.csv data/clean/test.csv models/rf_v2.pkl metrics/metrics_v2.json

!pip install -q tensorflow-privacy
!python dp_train.py data/clean/train.csv data/clean/test.csv \
    metrics/dp_metrics_v2.json metrics/epsilon_v2.txt

# 9. 验证输出
!ls models
!ls metrics
!ls dvc_store

/content/drive/MyDrive/MLops
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Updated git hooks.
Git LFS initialized.
[31mERROR[39m: failed to initiate DVC - '.dvc' exists. Use `-f` to force.
[0mSetting 'local_remote' as a default remote.
[31mERROR[39m: configuration error - config file error: remote 'local_remote' already exists. Use `-f|--force` to overwrite it.
[0mmv: cannot stat 'athletes.csv': No such file or directory
"data/raw/athletes.csv" already supported
[?25l[32m⠋[0m Checking graph
Adding...:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
          |0.00 [00:00,     ?file/s][A
                                    [A
![A
  0% |          |0/? [00:00<?,    ?files/s][A
                                           [A
  0% 0/1 [00:00<?, ?files/s][A
  0% 0/1 [00:00<?, ?files/s{'info': ''}][A
100% 1/1 [00:00<00:00,  4.17files/s{'info

In [9]:

!pip install -q opacus==1.4.0 torch==2.2.2  # can be skipped if already installed

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from opacus import PrivacyEngine

class CsvRegressionDataset(Dataset):
    """Regression dataset read from a CSV file.

    Features are the ``age``, ``weight``, ``height`` and ``backsq`` columns,
    passed through ``scaler.transform``. The target is the ``total_lift``
    column; when the CSV lacks it, it is derived as the row-wise sum of
    ``deadlift``, ``candj``, ``snatch`` and ``backsq``.

    NOTE(review): ``backsq`` is both an input feature and one of the addends
    of the derived target — confirm this is intended.

    Args:
        path: CSV file to load.
        scaler: Already-fitted object exposing ``transform(X)``. Required
            unless ``fit_scaler`` is true.
        fit_scaler: When true, fit a fresh ``StandardScaler`` on this file's
            features and use it (``scaler`` is then ignored).

    Raises:
        ValueError: If ``fit_scaler`` is false and no ``scaler`` is supplied.
    """

    def __init__(self, path, scaler=None, fit_scaler=False):
        # Fail fast with a clear message instead of crashing later on
        # ``None.transform`` after the CSV has already been read.
        if not fit_scaler and scaler is None:
            raise ValueError(
                "CsvRegressionDataset needs either fit_scaler=True or a fitted scaler"
            )

        df = pd.read_csv(path)

        if "total_lift" not in df.columns:
            df["total_lift"] = df[["deadlift", "candj", "snatch", "backsq"]].sum(axis=1)

        X = df[["age", "weight", "height", "backsq"]].astype(np.float32).values
        y = df["total_lift"].astype(np.float32).values.reshape(-1, 1)

        if fit_scaler:
            self.scaler = StandardScaler().fit(X)
        else:
            self.scaler = scaler
        # Force float32 so the feature tensors match the target dtype
        # regardless of what dtype the scaler returns.
        self.X = np.asarray(self.scaler.transform(X), dtype=np.float32)
        self.y = y

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(features, target)`` tensors for row ``idx``."""
        return torch.tensor(self.X[idx]), torch.tensor(self.y[idx])

# Locations of the cleaned train/test splits on Drive.
TRAIN_CSV = "/content/drive/MyDrive/MLops/data/clean/train.csv"
TEST_CSV  = "/content/drive/MyDrive/MLops/data/clean/test.csv"

# Mini-batch size shared by both loaders.
batch_size = 128

# The scaler is fitted on the training split and reused for the test split.
train_ds = CsvRegressionDataset(path=TRAIN_CSV, fit_scaler=True)
test_ds  = CsvRegressionDataset(path=TEST_CSV, scaler=train_ds.scaler)

# Training batches are shuffled and the last incomplete batch is dropped;
# test batches keep the file order.
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
test_loader = DataLoader(
    dataset=test_ds,
    batch_size=batch_size,
    shuffle=False,
)

class MLP(nn.Module):
    """Small feed-forward regressor: Linear(in_dim, 64) -> ReLU -> Linear(64, 1)."""

    def __init__(self, in_dim):
        super().__init__()
        hidden_width = 64
        # Layer order is kept so the sub-module indices (net.0, net.1, net.2)
        # and the parameter-initialisation order are unchanged.
        layers = [
            nn.Linear(in_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        out = self.net(x)
        return out

# ---------- Model, optimiser and loss ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLP(4).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.05)
criterion = nn.MSELoss(reduction="mean")

# ---------- Differential-privacy wrapping ----------
noise_multiplier = 1.1   # noise multiplier passed to make_private
max_grad_norm    = 1.0   # max_grad_norm (clipping bound) passed to make_private

privacy_engine = PrivacyEngine()
# make_private returns DP-enabled replacements for the model, optimiser and
# training loader; the returned loader is the one iterated below.
model, optimizer, train_loader = privacy_engine.make_private(
    module          = model,
    optimizer       = optimizer,
    data_loader     = train_loader,
    noise_multiplier= noise_multiplier,
    max_grad_norm   = max_grad_norm,
)

print(f" Model is now DP.  Noise={noise_multiplier}, Clip={max_grad_norm}")

# ---------- 5. Training ----------
epochs = 10
for epoch in range(1, epochs+1):
    model.train()
    cum_loss = 0.0
    n_seen = 0  # samples actually drawn this epoch
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss  = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        batch_n = xb.size(0)
        # With Opacus's default Poisson sampling the batch size varies and can
        # even be zero; an empty batch has no defined mean loss, so it is left
        # out of the running average (the optimiser step above still happens).
        if batch_n:
            cum_loss += loss.item() * batch_n
            n_seen += batch_n
    # Normalise by the samples really seen rather than len(train_ds): the DP
    # loader does not visit exactly len(train_ds) samples per epoch.
    print(f"Epoch {epoch}/{epochs}  |  train-MSE = {cum_loss/max(n_seen, 1):.2f}")

# ---------- Evaluation on the test split ----------
model.eval()
with torch.no_grad():
    y_true, y_pred = [], []
    for xb, yb in test_loader:
        xb = xb.to(device)
        y_pred.append(model(xb).cpu())
        y_true.append(yb)
    # reshape(-1) keeps the arrays 1-D even for a single-sample test set,
    # where squeeze() would collapse them to 0-d.
    y_true = torch.vstack(y_true).reshape(-1).numpy()
    y_pred = torch.vstack(y_pred).reshape(-1).numpy()

mae = mean_absolute_error(y_true, y_pred)
r2  = r2_score(y_true, y_pred)
print(f"\n  Test MAE = {mae:.2f} | R² = {r2:.3f}")


# ---------- Privacy accounting ----------
# NOTE(review): delta is set to exactly 1/N; a common recommendation is to keep
# delta strictly below 1/N — confirm the intended value.
delta    = 1 / len(train_ds)
epsilon  = privacy_engine.get_epsilon(delta)
print(f"\n  DP guarantee: ε = {epsilon:.3f}  (δ = {delta:.2e})")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.8/224.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.6/755.6 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

  >>> model = MyCustomModel()


 Model is now DP.  Noise=1.1, Clip=1.0
Epoch 1/10  |  train-MSE = 272453.96
Epoch 2/10  |  train-MSE = 4839.33
Epoch 3/10  |  train-MSE = 4421.95
Epoch 4/10  |  train-MSE = 4452.00
Epoch 5/10  |  train-MSE = 4421.70
Epoch 6/10  |  train-MSE = 4300.91
Epoch 7/10  |  train-MSE = 4319.62
Epoch 8/10  |  train-MSE = 4385.31
Epoch 9/10  |  train-MSE = 4480.29
Epoch 10/10  |  train-MSE = 4399.36

🔎  Test MAE = 49.68 | R² = 0.943

  DP guarantee: ε = 0.947  (δ = 4.16e-05)
