In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!unzip -o /content/drive/MyDrive/data.zip -d /content/data


Archive:  /content/drive/MyDrive/data.zip
  inflating: /content/data/data/images/10000.jpg  
  inflating: /content/data/data/images/10001.jpg  
  inflating: /content/data/data/images/10002.jpg  
  inflating: /content/data/data/images/10003.jpg  
  inflating: /content/data/data/images/10004.jpg  
  inflating: /content/data/data/images/10005.jpg  
  inflating: /content/data/data/images/10006.jpg  
  inflating: /content/data/data/images/10007.jpg  
  inflating: /content/data/data/images/10008.jpg  
  inflating: /content/data/data/images/10009.jpg  
  inflating: /content/data/data/images/10010.jpg  
  inflating: /content/data/data/images/10011.jpg  
  inflating: /content/data/data/images/10012.jpg  
  inflating: /content/data/data/images/10013.jpg  
  inflating: /content/data/data/images/10014.jpg  
  inflating: /content/data/data/images/10015.jpg  
  inflating: /content/data/data/images/10016.jpg  
  inflating: /content/data/data/images/10017.jpg  
  inflating: /content/data/data/images/1

In [26]:
import os
import pandas as pd
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim

# Dataset class
class RefundDataset(Dataset):
    def __init__(self, csv_file, image_folder, transform=None):
        self.data = pd.read_csv(csv_file)
        self.image_folder = image_folder
        self.transform = transform
        self.label_encoder = LabelEncoder()
        self.data['label'] = self.label_encoder.fit_transform(self.data['articleType'])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_id = str(int(self.data.iloc[idx]['id'])) + ".jpg"
        img_path = os.path.join(self.image_folder, img_id)
        image = Image.open(img_path).convert("RGB")
        label = self.data.iloc[idx]['label']
        if self.transform:
            image = self.transform(image)
        return image, label

# Basic augmentations only
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),  # <== REMOVE jitter, flip, rotation
])

# Full dataset
csv_path = "/content/data/data/refund_full_set/styles.csv"
image_path = "/content/data/data/refund_full_set/images"

dataset = RefundDataset(csv_path, image_path, transform)
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)
train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
val_loader = DataLoader(val_data, batch_size=8)

# Model setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
model.fc = nn.Linear(model.fc.in_features, len(dataset.label_encoder.classes_))
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 15

# Train
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Loss: {running_loss / len(train_loader):.4f}")

# Validate
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.2f}")

# Save
torch.save(model.state_dict(), "/content/refund_classifier_final.pt")
print("Model saved as refund_classifier_final.pt")


Epoch 1/15
Loss: 1.7653
Epoch 2/15
Loss: 1.0903
Epoch 3/15
Loss: 0.9376
Epoch 4/15
Loss: 0.7326
Epoch 5/15
Loss: 0.5604
Epoch 6/15
Loss: 0.5078
Epoch 7/15
Loss: 0.4027
Epoch 8/15
Loss: 0.3861
Epoch 9/15
Loss: 0.3218
Epoch 10/15
Loss: 0.2497
Epoch 11/15
Loss: 0.1757
Epoch 12/15
Loss: 0.1925
Epoch 13/15
Loss: 0.1740
Epoch 14/15
Loss: 0.0818
Epoch 15/15
Loss: 0.0559
Validation Accuracy: 0.86
Model saved as refund_classifier_final.pt


In [39]:
import os
import pandas as pd
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim

# Dataset class
class RefundDataset(Dataset):
    def __init__(self, csv_file, image_folder, transform=None):
        self.data = pd.read_csv(csv_file)
        self.image_folder = image_folder
        self.transform = transform
        self.label_encoder = LabelEncoder()
        self.data['label'] = self.label_encoder.fit_transform(self.data['articleType'])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_id = str(int(self.data.iloc[idx]['id'])) + ".jpg"
        img_path = os.path.join(self.image_folder, img_id)
        image = Image.open(img_path).convert("RGB")
        label = self.data.iloc[idx]['label']
        if self.transform:
            image = self.transform(image)
        return image, label

# Transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
])

# Paths for full dataset
csv_path = "/content/data/data/refund_full_set/styles.csv"
image_path = "/content/data/data/refund_full_set/images"

# Load dataset
dataset = RefundDataset(csv_path, image_path, transform)
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
val_loader = DataLoader(val_data, batch_size=8)

# Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
model.fc = nn.Linear(model.fc.in_features, len(dataset.label_encoder.classes_))
model.to(device)

# Training settings
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 15

# Training loop
for epoch in range(epochs):
    print(f"Starting Epoch {epoch+1}/{epochs}...")
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# Validation — FIXED to print only once
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.2f}")

# Save model
torch.save(model.state_dict(), "/content/refund_classifier_full.pt")
print("Model saved as refund_classifier_full.pt")


Starting Epoch 1/15...
Epoch 1/15, Loss: 1.8483
Starting Epoch 2/15...
Epoch 2/15, Loss: 1.2875
Starting Epoch 3/15...
Epoch 3/15, Loss: 1.0435
Starting Epoch 4/15...
Epoch 4/15, Loss: 0.8329
Starting Epoch 5/15...
Epoch 5/15, Loss: 0.6787
Starting Epoch 6/15...
Epoch 6/15, Loss: 0.5810
Starting Epoch 7/15...
Epoch 7/15, Loss: 0.4642
Starting Epoch 8/15...
Epoch 8/15, Loss: 0.3709
Starting Epoch 9/15...
Epoch 9/15, Loss: 0.3323
Starting Epoch 10/15...
Epoch 10/15, Loss: 0.2805
Starting Epoch 11/15...
Epoch 11/15, Loss: 0.2083
Starting Epoch 12/15...
Epoch 12/15, Loss: 0.1738
Starting Epoch 13/15...
Epoch 13/15, Loss: 0.1491
Starting Epoch 14/15...
Epoch 14/15, Loss: 0.2381
Starting Epoch 15/15...
Epoch 15/15, Loss: 0.1285
Validation Accuracy: 0.82
Model saved as refund_classifier_full.pt


In [5]:
import os
import pandas as pd
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim

# Force errors to show up exactly where they occur
%env CUDA_LAUNCH_BLOCKING=1

# Paths
csv_path = "/content/data/data/refund_full_set/styles.csv"
image_path = "/content/data/data/refund_full_set/images"

# Load and filter styles
df = pd.read_csv(csv_path)
df = df.dropna(subset=["id", "articleType"])

# Remove entries with missing images
def image_exists(row):
    return os.path.exists(os.path.join(image_path, f"{int(row['id'])}.jpg"))
df = df[df.apply(image_exists, axis=1)]

# Remove rare classes (fewer than 2 samples)
class_counts = df['articleType'].value_counts()
valid_classes = class_counts[class_counts >= 2].index
df = df[df['articleType'].isin(valid_classes)].reset_index(drop=True)

# Rebuild label encoder on clean dataset
label_encoder = LabelEncoder()
df["encoded_label"] = label_encoder.fit_transform(df["articleType"])

# Transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
])

# Stratified split
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(df, df["encoded_label"]))

train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)

# 🔍 Manually check every image + label before training
bad_samples = []
max_allowed_label = len(label_encoder.classes_) - 1

for i in range(len(train_df)):
    try:
        row = train_df.iloc[i]
        img_path = os.path.join(image_path, f"{int(row['id'])}.jpg")
        label = int(row["encoded_label"])
        assert 0 <= label <= max_allowed_label, f"Invalid label {label}"
        img = Image.open(img_path).convert("RGB")
        img = transform(img)
    except Exception as e:
        bad_samples.append((i, str(e)))

print(f"✅ Checked {len(train_df)} training samples")
print(f"❌ Found {len(bad_samples)} corrupt samples")

if bad_samples:
    for i, msg in bad_samples[:5]:
        print(f" - Row {i}: {msg}")
if bad_samples:
    bad_indices = [i for i, _ in bad_samples]
    train_df = train_df.drop(train_df.index[bad_indices]).reset_index(drop=True)

# Dataset class
class RefundDataset(Dataset):
    def __init__(self, dataframe, image_folder, transform=None):
        self.data = dataframe
        self.image_folder = image_folder
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_id = str(int(self.data.iloc[idx]["id"])) + ".jpg"
        img_path = os.path.join(self.image_folder, img_id)
        image = Image.open(img_path).convert("RGB")
        label = self.data.iloc[idx]["encoded_label"]
        if self.transform:
            image = self.transform(image)
        return image, label


# Dataloaders
train_dataset = RefundDataset(train_df, image_path, transform)
val_dataset = RefundDataset(val_df, image_path, transform)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Model setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

# Freeze backbone
for name, param in model.named_parameters():
    if "layer4" in name or "fc" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

# Update classifier head
num_classes = len(label_encoder.classes_)
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Final CUDA-safe label check
max_label = max(df["encoded_label"])
assert max_label < num_classes, "Label index exceeds number of model outputs"

# Move model to device
model = model.to(device)

# Training config
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
epochs = 30

# Training
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
    print(f"Loss: {running_loss / len(train_loader):.4f}")

# Validation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.2f}")

# Save model
torch.save(model.state_dict(), "/content/refund_classifier_final.pt")
print("✅ Model saved as refund_classifier_final.pt")

env: CUDA_LAUNCH_BLOCKING=1
✅ Checked 1714 training samples
❌ Found 0 corrupt samples
Epoch 1/30
Loss: 1.9935
Epoch 2/30
Loss: 1.2668
Epoch 3/30
Loss: 1.0235
Epoch 4/30
Loss: 0.9017
Epoch 5/30
Loss: 0.8275
Epoch 6/30
Loss: 0.7304
Epoch 7/30
Loss: 0.6954
Epoch 8/30
Loss: 0.6323
Epoch 9/30
Loss: 0.6158
Epoch 10/30
Loss: 0.5664
Epoch 11/30
Loss: 0.5653
Epoch 12/30
Loss: 0.5302
Epoch 13/30
Loss: 0.5166
Epoch 14/30
Loss: 0.4985
Epoch 15/30
Loss: 0.5003
Epoch 16/30
Loss: 0.4508
Epoch 17/30
Loss: 0.4531
Epoch 18/30
Loss: 0.4126
Epoch 19/30
Loss: 0.3972
Epoch 20/30
Loss: 0.4403
Epoch 21/30
Loss: 0.4000
Epoch 22/30
Loss: 0.4074
Epoch 23/30
Loss: 0.3919
Epoch 24/30
Loss: 0.3837
Epoch 25/30
Loss: 0.3786
Epoch 26/30
Loss: 0.3739
Epoch 27/30
Loss: 0.3692
Epoch 28/30
Loss: 0.3579
Epoch 29/30
Loss: 0.3789
Epoch 30/30
Loss: 0.3494
Validation Accuracy: 0.84
✅ Model saved as refund_classifier_final.pt


In [6]:
# ------------------ IMPORTS ------------------
import os
import pandas as pd
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim

%env CUDA_LAUNCH_BLOCKING=1

# ------------------ PATHS ------------------
csv_path = "/content/data/data/refund_full_set/styles.csv"
image_path = "/content/data/data/refund_full_set/images"

# ------------------ LOAD & CLEAN DATA ------------------
df = pd.read_csv(csv_path)
df = df.dropna(subset=["id", "articleType"])

# Remove missing images
def image_exists(row):
    return os.path.exists(os.path.join(image_path, f"{int(row['id'])}.jpg"))
df = df[df.apply(image_exists, axis=1)]

# Remove rare classes (fewer than 2 samples)
class_counts = df['articleType'].value_counts()
valid_classes = class_counts[class_counts >= 2].index
df = df[df['articleType'].isin(valid_classes)].reset_index(drop=True)

# Encode labels
label_encoder = LabelEncoder()
df["encoded_label"] = label_encoder.fit_transform(df["articleType"])

# Stratified split
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(df, df["encoded_label"]))
train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)

# ------------------ DATASET ------------------
class RefundDataset(Dataset):
    def __init__(self, dataframe, image_folder, transform=None):
        self.data = dataframe
        self.image_folder = image_folder
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_id = str(int(self.data.iloc[idx]["id"])) + ".jpg"
        img_path = os.path.join(self.image_folder, img_id)
        image = Image.open(img_path).convert("RGB")
        label = self.data.iloc[idx]["encoded_label"]
        if self.transform:
            image = self.transform(image)
        return image, label

# ------------------ TRANSFORMS ------------------
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
])

# ------------------ LOADERS ------------------
train_dataset = RefundDataset(train_df, image_path, transform)
val_dataset = RefundDataset(val_df, image_path, transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# ------------------ MODEL: RESNET50 ------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

# Unfreeze layer4 and fc
for name, param in model.named_parameters():
    if "layer4" in name or "fc" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

# Replace FC layer with dropout + classifier
num_classes = len(label_encoder.classes_)
model.fc = nn.Sequential(
    nn.Dropout(0.4),
    nn.Linear(model.fc.in_features, num_classes)
)
model.to(device)

# ------------------ LOSS FUNCTION WITH CLASS WEIGHTS ------------------
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(df["encoded_label"]),
    y=df["encoded_label"]
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# ------------------ OPTIMIZER + SCHEDULER ------------------
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
epochs = 30

# ------------------ TRAINING ------------------
import mlflow
import mlflow.pytorch

# Start MLflow run
mlflow.set_tracking_uri("file:///content/mlruns")  # local path for Colab
mlflow.set_experiment("RefundClassifierResNet50")

with mlflow.start_run():
    mlflow.log_params({
        "model": "resnet50",
        "batch_size": 16,
        "epochs": epochs,
        "learning_rate": 0.01,
        "optimizer": "SGD",
        "scheduler_step": 5,
        "dropout": 0.4,
        "augment": "flip, rotate, colorjitter",
        "num_classes": num_classes
    })

    # Training loop
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        model.train()
        running_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        scheduler.step()
        avg_loss = running_loss / len(train_loader)
        mlflow.log_metric("train_loss", avg_loss, step=epoch)
        print(f"Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    mlflow.log_metric("val_accuracy", accuracy)
    print(f"Validation Accuracy: {accuracy:.2f}")

    # Save model in MLflow
    mlflow.pytorch.log_model(model, "model")
    torch.save(model.state_dict(), "refund_classifier_final.pt")
    print("Model saved as refund_classifier_final.pt")

# ------------------ VALIDATION ------------------
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"✅ Final Validation Accuracy: {accuracy:.2f}")

# ------------------ SAVE MODEL ------------------
torch.save(model.state_dict(), "/content/refund_classifier_final.pt")
print("✅ Model saved as refund_classifier_final.pt")

env: CUDA_LAUNCH_BLOCKING=1
Epoch 1/30
Loss: 2.9622
Epoch 2/30
Loss: 1.7788
Epoch 3/30
Loss: 1.2387
Epoch 4/30
Loss: 0.7934
Epoch 5/30
Loss: 0.5209
Epoch 6/30
Loss: 0.3900
Epoch 7/30
Loss: 0.3441
Epoch 8/30
Loss: 0.3140
Epoch 9/30
Loss: 0.2394
Epoch 10/30
Loss: 0.1640
Epoch 11/30
Loss: 0.1493
Epoch 12/30
Loss: 0.1435
Epoch 13/30
Loss: 0.1156
Epoch 14/30
Loss: 0.1365
Epoch 15/30
Loss: 0.1423
Epoch 16/30
Loss: 0.1021
Epoch 17/30
Loss: 0.0950
Epoch 18/30
Loss: 0.0950
Epoch 19/30
Loss: 0.0830
Epoch 20/30
Loss: 0.0929
Epoch 21/30
Loss: 0.0918
Epoch 22/30
Loss: 0.0691
Epoch 23/30
Loss: 0.0701
Epoch 24/30
Loss: 0.0841
Epoch 25/30
Loss: 0.0733
Epoch 26/30
Loss: 0.1057
Epoch 27/30
Loss: 0.0585
Epoch 28/30
Loss: 0.0749
Epoch 29/30
Loss: 0.0657
Epoch 30/30
Loss: 0.0726
✅ Final Validation Accuracy: 0.86
✅ Model saved as refund_classifier_final.pt


In [8]:
%run train_model_full_set.py

Exception: File `'train_model_full_set.py'` not found.

In [56]:
%env CUDA_LAUNCH_BLOCKING=1

env: CUDA_LAUNCH_BLOCKING=1


In [7]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading databricks_sdk-0.58.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading opentelemetry_api-1.35.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading opentele

In [20]:
from google.colab import files
uploaded = files.upload()

Saving data.zip to data (2).zip


In [21]:
import zipfile
with zipfile.ZipFile("/content/data.zip", 'r') as zip_ref:
    zip_ref.extractall("/content")

In [22]:
import os

# List uploaded files and show sizes
for f in uploaded.keys():
    if os.path.exists(f):
        size = os.path.getsize(f)
        print(f"{f} uploaded — {size / 1024:.1f} KB")
    else:
        print(f"{f} not found")

data (2).zip uploaded — 17256.8 KB


In [23]:
!ls -R /content

/content:
 data		'data (2).zip'	 sample_data
'data (1).zip'	 data.zip	 train_model_full_set.py

/content/data:
data  images  refund_full_set  refund_subset  styles.csv

/content/data/data:
images	refund_full_set  refund_subset	styles.csv

/content/data/data/images:
10000.jpg  10343.jpg  10731.jpg  11161.jpg  11519.jpg  11857.jpg  12178.jpg
10001.jpg  10344.jpg  10732.jpg  11162.jpg  11520.jpg  11858.jpg  12179.jpg
10002.jpg  10345.jpg  10733.jpg  11163.jpg  11521.jpg  11859.jpg  12180.jpg
10003.jpg  10346.jpg  10734.jpg  11164.jpg  11522.jpg  11860.jpg  12181.jpg
10004.jpg  10347.jpg  10735.jpg  11165.jpg  11523.jpg  11861.jpg  12182.jpg
10005.jpg  10348.jpg  10736.jpg  11166.jpg  11524.jpg  11862.jpg  12183.jpg
10006.jpg  10350.jpg  10737.jpg  11167.jpg  11525.jpg  11863.jpg  12184.jpg
10007.jpg  10351.jpg  10738.jpg  11168.jpg  11526.jpg  11864.jpg  12185.jpg
10008.jpg  10352.jpg  10739.jpg  11169.jpg  11527.jpg  11865.jpg  12186.jpg
10009.jpg  10354.jpg  10740.jpg  11170.jpg  11529.

In [24]:
!python3 train_model_full_set.py

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100% 97.8M/97.8M [00:00<00:00, 211MB/s]
Epoch 1/30
Loss: 2.8921
Epoch 2/30
Loss: 1.7625
Epoch 3/30
Loss: 1.2199
Epoch 4/30
Loss: 1.0870
Epoch 5/30
Loss: 0.7440
Epoch 6/30
Loss: 0.5276
Epoch 7/30
Loss: 0.3391
Epoch 8/30
Loss: 0.3610
Epoch 9/30
Loss: 0.2646
Epoch 10/30
Loss: 0.2630
Epoch 11/30
Loss: 0.1728
Epoch 12/30
Loss: 0.1847
Epoch 13/30
Loss: 0.1597
Epoch 14/30
Loss: 0.1534
Epoch 15/30
Loss: 0.1743
Epoch 16/30
Loss: 0.1209
Epoch 17/30
Loss: 0.1374
Epoch 18/30
Loss: 0.1182
Epoch 19/30
Loss: 0.1016
Epoch 20/30
Loss: 0.1111
Epoch 21/30
Loss: 0.0921
Epoch 22/30
Loss: 0.0923
Epoch 23/30
Loss: 0.1093
Epoch 24/30
Loss: 0.0949
Epoch 25/30
Loss: 0.0914
Epoch 26/30
Loss: 0.0982
Epoch 27/30
Loss: 0.1165
Epoch 28/30
Loss: 0.0961
Epoch 29/30
Loss: 0.1060
Epoch 30/30
Loss: 0.1072
Validation Accuracy: 0.86
Model saved as refund_classifier_final.pt


In [26]:
from google.colab import files
files.download("refund_classifier_final.pt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import json
from datetime import datetime

# Your project folder in Drive (adjust if needed)
project_path = "/content/drive/MyDrive/Colab Notebooks"
output_file = os.path.join(project_path, "project_metadata.json")

metadata = []

for root, _, files in os.walk(project_path):
    for file in files:
        if file.endswith((".py", ".ipynb", ".pt", ".json", ".txt")):
            full_path = os.path.join(root, file)
            # Try to preview the start of the file if it's text-based
            try:
                with open(full_path, "r", encoding="utf-8") as f:
                    preview = f.read(400)  # Get first ~15 lines
            except:
                preview = "[Binary or unreadable file]"

            metadata.append({
                "file": file,
                "relative_path": os.path.relpath(full_path, project_path),
                "size_kb": round(os.path.getsize(full_path) / 1024, 2),
                "last_modified": datetime.fromtimestamp(os.path.getmtime(full_path)).isoformat(),
                "preview": preview
            })

# Save metadata to Drive
with open(output_file, "w") as f:
    json.dump(metadata, f, indent=2)

print("Full metadata with previews saved to:")
print(output_file)


✅ Full metadata with previews saved to:
/content/drive/MyDrive/Colab Notebooks/project_metadata.json
