In [2]:
#@title 🔍 Environment Validation (run before starting lab)
import os
import torch

print("---- Environment Validation ----")

# GPU: report availability and, when a device is present, the name of device 0.
gpu_ready = torch.cuda.is_available()
print("GPU available:", gpu_ready)
if gpu_ready:
    print("GPU name:", torch.cuda.get_device_name(0))

# Hugging Face settings read from the environment.
# Only presence is reported; the values themselves are never printed.
for var in ("HF_TOKEN", "HF_USER", "SPACE_NAME"):
    status = " SET" if os.environ.get(var) else "NOT SET"
    print(f"{var}:", status)

# The W&B key is optional at this point (login is handled separately).
if os.environ.get("WANDB_API_KEY"):
    print("WANDB_API_KEY:", " SET")
else:
    print("WANDB_API_KEY:", "⚠️ Not set (you'll log in interactively)")

# Third-party packages: any failure while importing them is reported rather
# than raised, so the rest of the summary is still printed.
try:
    import wandb, gradio, huggingface_hub
    print("Dependencies:  wandb, gradio, huggingface_hub imported successfully")
except Exception as e:
    print("Dependencies: missing - please run install cell first")
    print(e)

print("---------------------------------")


---- Environment Validation ----
GPU available: False
HF_TOKEN: NOT SET
HF_USER: NOT SET
SPACE_NAME: NOT SET
WANDB_API_KEY: ⚠️ Not set (you'll log in interactively)
Dependencies:  wandb, gradio, huggingface_hub imported successfully
---------------------------------


In [2]:
# %pip (instead of !pip) installs into the environment of the running kernel,
# not into whichever `pip` the shell happens to resolve first.
%pip install -q torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
# NOTE(review): the PyPI package `git-lfs` is presumably not the Git LFS
# command-line tool that `git lfs install` needs; it is kept here to preserve
# the original install set - confirm whether it is required.
%pip install -q wandb gradio huggingface_hub git-lfs

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.6/68.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.9/19.9 MB[0m [31m159.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.4/325.4 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m123.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.1/108.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.2/208.2 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
#@title 3) Securely set your Hugging Face token, username, and desired Space name
from getpass import getpass
import os

print("Paste your Hugging Face token when prompted. It will be hidden.")
# Strip stray whitespace that can come along when a token is pasted.
hf_token = getpass("Hugging Face token: ").strip()

# Edit these values (do NOT put the token here)
hf_user = input("Enter your Hugging Face username (e.g. 'alice'): ").strip()
space_name = input("Enter desired Space name (e.g. 'cifar100-demo-space'): ").strip()

# Fail fast on blank answers: storing an empty string would still print the
# success message below while leaving the variables effectively unset.
missing = [
    label
    for label, value in (
        ("Hugging Face token", hf_token),
        ("Hugging Face username", hf_user),
        ("Space name", space_name),
    )
    if not value
]
if missing:
    raise ValueError("Missing required value(s): " + ", ".join(missing))

# Export for later cells; subprocess-based cells inherit these variables.
os.environ['HF_TOKEN'] = hf_token
os.environ['HF_USER'] = hf_user
os.environ['SPACE_NAME'] = space_name

print("HF_TOKEN stored in runtime (hidden). HF_USER and SPACE_NAME saved in environment variables.")

Paste your Hugging Face token when prompted. It will be hidden.
Hugging Face token: ··········
Enter your Hugging Face username (e.g. 'alice'): Umamahesh1226
Enter desired Space name (e.g. 'cifar100-demo-space'): week_10
HF_TOKEN stored in runtime (hidden). HF_USER and SPACE_NAME saved in environment variables.


In [4]:
#@title 4) Authenticate Weights & Biases (W&B)
import wandb

# Tell the user what to expect before the interactive prompt appears.
login_hint = "Follow the prompt to authenticate W&B (this opens an input box)."
print(login_hint)

# Kept as the cell's final expression so the login result is displayed.
wandb.login()


Follow the prompt to authenticate W&B (this opens an input box).


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33muma_mahesh_iitpkd[0m ([33muma_mahesh_iitpkd-indian-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
%%bash
cat > train.py <<'PY'
import argparse
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import wandb
from torch.utils.data import DataLoader
from torchvision.models import resnet18

def parse_args():
    """Parse the training script's command-line options.

    Returns:
        argparse.Namespace with ``project`` (str), ``entity`` (str or None),
        ``epochs`` (int), ``batch_size`` (int) and ``lr`` (float).
    """
    # (flag, type, default) for every supported option.
    option_table = (
        ("--project", str, "cifar100-hf-demo"),
        ("--entity", str, None),
        ("--epochs", int, 5),
        ("--batch-size", int, 128),
        ("--lr", float, 0.01),
    )
    parser = argparse.ArgumentParser()
    for flag, value_type, default in option_table:
        parser.add_argument(flag, type=value_type, default=default)
    return parser.parse_args()

def get_dataloaders(batch_size):
    """Build the CIFAR-100 training and test data loaders.

    Both splits are stored under ``./data`` and requested with
    ``download=True``. Training images get a padded random crop and a random
    horizontal flip; both splits are then converted to tensors and normalized
    with the same per-channel constants.

    Args:
        batch_size: Batch size used for both loaders.

    Returns:
        ``(trainloader, testloader)``; only the training loader shuffles.
    """
    channel_mean = (0.5071, 0.4865, 0.4409)
    channel_std = (0.2673, 0.2564, 0.2762)

    def tensor_steps():
        # Steps shared by the training and test pipelines.
        return [transforms.ToTensor(), transforms.Normalize(channel_mean, channel_std)]

    augmentation = [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
    ]
    train_pipeline = transforms.Compose(augmentation + tensor_steps())
    test_pipeline = transforms.Compose(tensor_steps())

    dataset_kwargs = dict(root="./data", download=True)
    train_split = torchvision.datasets.CIFAR100(train=True, transform=train_pipeline, **dataset_kwargs)
    test_split = torchvision.datasets.CIFAR100(train=False, transform=test_pipeline, **dataset_kwargs)

    loader_kwargs = dict(batch_size=batch_size, num_workers=2)
    return (
        DataLoader(train_split, shuffle=True, **loader_kwargs),
        DataLoader(test_split, shuffle=False, **loader_kwargs),
    )

def train_one_epoch(model, device, loader, optimizer, criterion):
    """Run one optimization pass over ``loader``.

    Args:
        model: Network to train; switched to training mode.
        device: Device that each batch is moved to.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable applied to ``(outputs, targets)``.

    Returns:
        ``(loss, accuracy)`` where ``loss`` is the batch-size-weighted sum of
        the per-batch loss values divided by the number of samples seen, and
        ``accuracy`` is the percentage of correct top-1 predictions.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # Weight each batch's loss by its size before the final division
        # (assumes the criterion returns a per-batch mean - confirm).
        loss_sum += batch_loss.item() * inputs.size(0)
        predicted = logits.max(1)[1]
        n_seen += targets.size(0)
        n_correct += predicted.eq(targets).sum().item()
    return loss_sum / n_seen, 100. * n_correct / n_seen

def evaluate(model, device, loader, criterion):
    """Measure loss and top-1 accuracy over ``loader`` without gradients.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        device: Device that each batch is moved to.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable applied to ``(outputs, targets)``.

    Returns:
        ``(loss, accuracy)`` where ``loss`` is the batch-size-weighted sum of
        the per-batch loss values divided by the number of samples seen, and
        ``accuracy`` is the percentage of correct top-1 predictions.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs)
            batch_loss = criterion(logits, targets)
            loss_sum += batch_loss.item() * inputs.size(0)
            predicted = logits.max(1)[1]
            n_seen += targets.size(0)
            n_correct += predicted.eq(targets).sum().item()
    return loss_sum / n_seen, 100. * n_correct / n_seen

def main():
    """Train ResNet-18 on CIFAR-100 and track the run in Weights & Biases.

    Hyperparameters come from the command line and are read back through
    ``wandb.config``. After every epoch the metrics are logged; whenever the
    test accuracy exceeds the best value so far, the weights are written to
    ``outputs/model.pt`` and logged as the ``resnet18-cifar100`` artifact.
    """
    args = parse_args()
    wandb.init(project=args.project, entity=args.entity, config=vars(args))
    cfg = wandb.config

    device = "cuda" if torch.cuda.is_available() else "cpu"
    trainloader, testloader = get_dataloaders(cfg.batch_size)

    model = resnet18(num_classes=100).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=cfg.lr, momentum=0.9, weight_decay=5e-4)

    checkpoint_path = "outputs/model.pt"
    best_acc = 0.0
    for epoch_number in range(1, cfg.epochs + 1):
        train_loss, train_acc = train_one_epoch(model, device, trainloader, optimizer, criterion)
        test_loss, test_acc = evaluate(model, device, testloader, criterion)

        metrics = {
            "epoch": epoch_number,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "test_loss": test_loss,
            "test_acc": test_acc,
        }
        wandb.log(metrics)
        print(f"Epoch {epoch_number}: train_acc={train_acc:.2f} test_acc={test_acc:.2f}")

        if test_acc > best_acc:
            # New best: save the weights and log them as a model artifact.
            best_acc = test_acc
            os.makedirs("outputs", exist_ok=True)
            torch.save(model.state_dict(), checkpoint_path)
            artifact = wandb.Artifact("resnet18-cifar100", type="model", metadata={"test_acc": best_acc})
            artifact.add_file(checkpoint_path)
            wandb.log_artifact(artifact)
    print("Best test acc:", best_acc)


if __name__ == "__main__":
    main()
PY


In [12]:
# Run the generated train.py as a subprocess for 3 epochs at batch size 128.
# --lr is not passed, so the script's own default learning rate applies.
!python train.py --project cifar100-umamahesh --entity uma_mahesh_iitpkd-indian-institute-of-technology --epochs 3 --batch-size 128


[34m[1mwandb[0m: Currently logged in as: [33muma_mahesh_iitpkd[0m ([33muma_mahesh_iitpkd-indian-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m⢿[0m Waiting for wandb.init()...
[34m[1mwandb[0m: [38;5;178m⣻[0m Waiting for wandb.init()...
[34m[1mwandb[0m: [38;5;178m⣽[0m Waiting for wandb.init()...
[34m[1mwandb[0m: [38;5;178m⣾[0m setting up run gv9yj6au (0.3s)
[34m[1mwandb[0m: [38;5;178m⣷[0m setting up run gv9yj6au (0.3s)
[34m[1mwandb[0m: Tracking run with wandb version 0.22.2
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20251022_153343-gv9yj6au[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mpleasant-paper-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/uma_mahesh_iitpkd-indian-institute-of-technology/cifar100-umamahesh[0m
[34m[1mwandb[0m: 🚀 

In [5]:
%%bash
python - <<'PY'
import wandb, os, sys
# Read the entity from the WANDB_ENTITY environment variable and fall back to
# the hard-coded value when it is unset or empty. (The previous lookup used the
# entity value itself as the variable name, so it could never find anything.)
ENTITY = os.environ.get("WANDB_ENTITY") or "uma_mahesh_iitpkd-indian-institute-of-technology"   # <-- edit if not set
PROJECT = "cifar100-umamahesh"
ARTIFACT = "resnet18-cifar100:latest"
try:
    # Creating the API client is inside the try block so that a failure there
    # is reported the same way as a failed download.
    api = wandb.Api()
    artifact = api.artifact(f"{ENTITY}/{PROJECT}/{ARTIFACT}")
    artifact.download(root="outputs")
    print("Downloaded artifact to outputs/")
except Exception as e:
    print("Failed to download artifact:", e)
    # Non-zero exit status so the calling cell is reported as failed.
    sys.exit(1)
PY


Downloaded artifact to outputs/


wandb:   1 of 1 files downloaded.  


In [16]:
%%bash
cat > app.py <<'PY'
import os, time, io
from PIL import Image
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet18
import gradio as gr

MODEL_PATH = "outputs/model.pt"

# If model not present, try download via W&B (requires WANDB_API_KEY secret in Space or env)
if not os.path.exists(MODEL_PATH):
    try:
        import wandb
        wandb_api_key = os.environ.get("WANDB_API_KEY")
        if wandb_api_key:
            wandb.login(key=wandb_api_key)
            api = wandb.Api()
            # WANDB_ARTIFACT may override the default artifact path.
            artifact = api.artifact(os.environ.get("WANDB_ARTIFACT", "uma_mahesh_iitpkd-indian-institute-of-technology/cifar100-umamahesh/resnet18-cifar100:latest"))
            artifact.download(root="outputs")
            print("Downloaded model via W&B artifact.")
        else:
            print("WANDB_API_KEY not set; cannot download artifact.")
    except Exception as e:
        print("Error downloading artifact via W&B:", e)

# Stop with an actionable message instead of letting torch.load fail on a
# missing file after a skipped or failed download.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(
        f"Model weights not found at {MODEL_PATH!r}. Provide the file, or set "
        "WANDB_API_KEY (and optionally WANDB_ARTIFACT) so it can be downloaded."
    )

device = "cuda" if torch.cuda.is_available() else "cpu"
model = resnet18(num_classes=100)
# The checkpoint may have been downloaded, so restrict unpickling to tensors
# and plain containers rather than arbitrary Python objects.
state_dict = torch.load(MODEL_PATH, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

# Preprocessing applied to every uploaded image: resize to 32x32, convert to a
# tensor, then normalize each channel with the constants below.
_CHANNEL_MEAN = (0.5071, 0.4865, 0.4409)
_CHANNEL_STD = (0.2673, 0.2564, 0.2762)

_preprocessing_steps = [
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
]
transform = transforms.Compose(_preprocessing_steps)

def predict_image(img):
    """Classify a single image with the loaded model.

    Args:
        img: PIL image supplied by the Gradio input, or ``None`` when the
            request was submitted without an image.

    Returns:
        dict with ``class_idx`` (int index of the top class), ``confidence``
        (its softmax probability, rounded to 4 decimals) and ``latency_ms``
        (time spent in this function in milliseconds, rounded to 2 decimals).

    Raises:
        gr.Error: If no image was provided.
    """
    if img is None:
        # Show a readable message in the UI instead of an attribute error.
        raise gr.Error("Please provide an image before submitting.")

    # perf_counter is monotonic, so the measured latency cannot be distorted
    # by wall-clock adjustments.
    start = time.perf_counter()
    # The normalization uses three channel statistics, so force RGB to accept
    # grayscale or RGBA inputs as well.
    x = transform(img.convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        out = model(x)
        probs = torch.nn.functional.softmax(out, dim=1)
        conf, idx = probs.max(1)
        class_idx = int(idx.item())
        conf_val = float(conf.item())
    latency = (time.perf_counter() - start) * 1000.0
    return {"class_idx": class_idx, "confidence": round(conf_val,4), "latency_ms": round(latency,2)}

# Gradio UI: one PIL image in, the prediction dict rendered as JSON out.
iface = gr.Interface(
    fn=predict_image,
    inputs=gr.Image(type="pil"),
    outputs="json",
    title="CIFAR-100 demo",
)

if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    iface.launch(server_name="0.0.0.0", server_port=7860)
PY


In [17]:
%%bash
# Runtime dependencies for app.py. The PyPI package `git-lfs` was dropped:
# app.py never imports it, and it is not the Git LFS command-line tool.
cat > requirements.txt <<'REQ'
torch
torchvision
gradio
Pillow
wandb
huggingface_hub
REQ


In [18]:
# Replace the Space name entered earlier; later cells read SPACE_NAME from the
# environment.
os.environ["SPACE_NAME"] = "mlops_week10"

In [19]:
%%bash
# Abort the whole cell as soon as any command fails.
set -e
# prepare local repo
# Start from an empty staging directory so files from earlier runs are not reused.
rm -rf hf_space || true
mkdir hf_space
# Only the app and its dependency list are staged; no model file is copied.
cp app.py requirements.txt hf_space/
cd hf_space

# Local git setup (repo-scoped identity, Git LFS hooks).
# NOTE(review): the upload further down goes through HfApi.upload_folder, so
# this git/LFS setup may be unnecessary - confirm before removing.
git init
git config user.email "142502018@smail.iitpkd.ac.in"
git config user.name "UmaMaheswarReddy-IIT-Pkd"
git lfs install

python - <<'PY'
# Only HfApi is needed. The unused `Repository` import was removed: it is
# deprecated and makes this script fail with ImportError on huggingface_hub
# releases that no longer ship it.
from huggingface_hub import HfApi
import os, sys

# Credentials and target come from the environment set up in earlier cells.
token = os.environ.get("HF_TOKEN")
user = os.environ.get("HF_USER")
space = os.environ.get("SPACE_NAME")
if not token or not user or not space:
    print("HF_TOKEN, HF_USER or SPACE_NAME not set. Aborting.")
    sys.exit(1)
api = HfApi(token=token)

repo_id = f"{user}/{space}"
# exist_ok=True lets the cell be re-run against an already existing Space.
repo_url = api.create_repo(repo_id=repo_id, repo_type="space",
            space_sdk="gradio",
            exist_ok=True)
print("Repo URL:", repo_url)

# Upload the current directory (the staged app.py and requirements.txt) as a
# single commit. No model file is included; app.py downloads it at startup.
api.upload_folder(
    folder_path=".",
    repo_id=repo_id,
    repo_type="space",
    commit_message="Initial commit: CIFAR-100 Gradio app (no model)"
)

print("Pushed to:", repo_url)
PY


Initialized empty Git repository in /content/hf_space/.git/
Updated git hooks.
Git LFS initialized.
Repo URL: https://huggingface.co/spaces/Umamahesh1226/mlops_week10
Pushed to: https://huggingface.co/spaces/Umamahesh1226/mlops_week10


hint: Using 'master' as the name for the initial branch. This default branch name
hint: is subject to change. To configure the initial branch name to use in all
hint: 
hint: 	git config --global init.defaultBranch <name>
hint: 
hint: Names commonly chosen instead of 'master' are 'main', 'trunk' and
hint: 'development'. The just-created branch can be renamed via this command:
hint: 
hint: 	git branch -m <name>
