In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.multiprocessing as mp
from FPN import Features, FPNetwork , classificationhead , bboxhead
from Loss import Lossfunction
from datasets import load_dataset
import torch.optim as optim
from torch.cuda.amp import  GradScaler
from torch.amp import autocast
import gc
from dataset_convert import AnchorGenerator, FaceDetectionDataset
# device = torch.device("mps")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
mp.set_start_method('spawn', force=True)

In [3]:
model = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=True)
model = model.features.to(device)

Using cache found in /Users/vipulagarwal/.cache/torch/hub/pytorch_vision_v0.10.0


In [4]:
device

device(type='cpu')

In [5]:
dataset = load_dataset("CUHK-CSE/wider_face")
train_dataset = dataset['train'].with_format("torch")
val_dataset = dataset['validation'].with_format("torch")

In [23]:
extractor = Features(model,['3','6', '13','18'])
topdown = FPNetwork(out_channels=256)
classifier = classificationhead(channels=256, num_anchors= 12, num_of_classes= 1)
bboxregression = bboxhead(channels= 256 , num_anchors= 12)
loss =Lossfunction(lambd=10)
anchors = AnchorGenerator()
data = FaceDetectionDataset(train_dataset,anchors)
vali_data = FaceDetectionDataset(val_dataset,anchors)

In [6]:
torch.cuda.empty_cache()
print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

Allocated: 0.03 GB
Cached: 0.05 GB


In [8]:
print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

Allocated: 0.03 GB
Cached: 0.05 GB


In [24]:
# batch_size = 16
# epochs = 20
# learning_rate = 1e-3
def forward(p):
    features = extractor.extract(p)
    newfeatures = topdown(features)
    output = {}
    for key in list(newfeatures.keys()):
        temp = {}
        temp["bbox"] = bboxregression(newfeatures[key])
        temp["cls"] = classifier(newfeatures[key])
        output[key] = temp
    return output

def train(epochs:int,training_data , validation_data=None):
    extractor.train()
    topdown.train()
    classifier.train()
    bboxregression.train()
    optimizer = optim.Adam(list(extractor.parameters())+
                           list(topdown.parameters())+
                           list(classifier.parameters())+
                           list(bboxregression.parameters()), lr=learning_rate)
    loSS = {}
    for i in range (0,epochs):
        train_loss= 0
        for key, (image,bbox) in enumerate(training_data):
            epoch_loss = {}
            model_pred = forward(image)
            optimizer.zero_grad()
            ll = loss(model_pred, bbox)
            train_loss+=ll
            if (key % 10 == 0):
                epoch_loss[key] = train_loss/10
                print(f"The avg loss for {key}th data is {train_loss/10}")
                train_loss = 0    
            ll.backward()
            optimizer.step()
        loSS[i] = epoch_loss
    return loSS

# train(epochs=epochs,training_data=training_data)


In [10]:
total_params=sum(p.numel() for p in extractor.parameters())
total_params+= sum(p.numel() for p in topdown.parameters())
total_params+= sum(p.numel() for p in classifier.parameters())
total_params+= sum(p.numel() for p in bboxregression.parameters())

print(total_params)

7454644


In [8]:
scaler = GradScaler()

  scaler = GradScaler()


In [25]:
def train_with_accumulation(epochs: int, training_data, validation_data=None, accumulation_steps=4):
    # Set models to training mode
    extractor.train()
    topdown.train()
    classifier.train()
    bboxregression.train()
    
    # Enable gradient checkpointing
    if hasattr(extractor, 'gradient_checkpointing_enable'):
        extractor.gradient_checkpointing_enable()
    if hasattr(topdown, 'gradient_checkpointing_enable'):
        topdown.gradient_checkpointing_enable()
    
    optimizer = optim.Adam(
        list(extractor.parameters()) +
        list(topdown.parameters()) +
        list(classifier.parameters()) +
        list(bboxregression.parameters()), 
        lr=learning_rate
    )
    
    loss_history = {}
    
    for epoch in range(epochs):
        epoch_loss = 0.0
        batch_count = 0
        running_loss = 0.0
        
        print(f"\nEpoch {epoch + 1}/{epochs}")
        print("-" * 30)
        try:
            for batch_idx, (image, bbox) in enumerate(training_data):
                # Mixed precision forward pass
                optimizer.zero_grad()
                
                with autocast('cuda'):
                    model_pred = forward(image.cuda())
                    ll = loss(model_pred, bbox) / accumulation_steps  # Scale loss
                del image , model_pred , bbox
                # Backward pass
                scaler.scale(ll).backward()
                
                # Step optimizer every accumulation_steps
                if (batch_idx + 1) % accumulation_steps == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                
                # Accumulate loss
                batch_loss = ll.item() * accumulation_steps  # Unscale for logging
                epoch_loss += batch_loss
                running_loss += batch_loss
                batch_count += 1
                
                # Print progress
                if (batch_idx + 1) % 10 == 0:
                    avg_running_loss = running_loss / 10
                    print("-" * 30)
                    print(f"Batch {batch_idx + 1}: Avg Loss = {avg_running_loss:.6f}")
                    print("-" * 30)
                    running_loss = 0.0
                
                # Memory 
                del ll
                torch.cuda.empty_cache()
                gc.collect()
        except Exception as e :
            print (e)
            print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
            print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
            torch.cuda.empty_cache()
            print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
            print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
            break

        except KeyboardInterrupt:
            print("INTERRUPTED!!")
            return loss_history   

                
        if batch_count % accumulation_steps != 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        
        avg_epoch_loss = epoch_loss / batch_count if batch_count > 0 else 0
        loss_history[epoch] = avg_epoch_loss
        
        print(f"Epoch {epoch + 1} Average Loss: {avg_epoch_loss:.6f}")
        
        # Cleanup at epoch end
        gc.collect()
        torch.cuda.empty_cache()
    
    return loss_history


In [26]:
torch.cuda.empty_cache()
batch_size = 8
epochs = 5
learning_rate = 1e-3
training_data = DataLoader(
    data,
    batch_size=batch_size, 
    num_workers=4,     # Reduce if GPU memory is full
    pin_memory=True,         # Faster CPU->GPU transfer
    persistent_workers=True, # Reuse workers across epochs
    prefetch_factor=4,       # Prefetch more batches
    drop_last=True,
    shuffle=True
)
validation_data = DataLoader(vali_data,
    batch_size=batch_size, 
    num_workers=4,     # Reduce if GPU memory is full
    pin_memory=True,         # Faster CPU->GPU transfer
    persistent_workers=True, # Reuse workers across epochs
    prefetch_factor=4,       # Prefetch more batches
    drop_last=True,
    shuffle=True
)

In [27]:
lossdata = train_with_accumulation(epochs = 1, training_data = training_data)


Epoch 1/1
------------------------------
box loss:3.5145514011383057
cls loss:3659.02685546875
box loss:2.7041258811950684
cls loss:2951.58447265625
box loss:2.1685640811920166
cls loss:13842.23046875
box loss:2.8785951137542725
cls loss:3645.6484375
box loss:2.2008285522460938
cls loss:2288.5556640625
box loss:6.365060806274414
cls loss:8771.6748046875
box loss:3.5769691467285156
cls loss:5418.03125
box loss:2.6229751110076904
cls loss:1269.349365234375
box loss:1.5934503078460693
cls loss:1104.1712646484375
box loss:1.130946397781372
cls loss:2296.52392578125
------------------------------
Batch 10: Avg Loss = 4553.435669
------------------------------
box loss:0.7891912460327148
cls loss:3107.992431640625
box loss:1.3655290603637695
cls loss:2027.667236328125
box loss:0.9878318905830383
cls loss:1096.649169921875
box loss:0.784155011177063
cls loss:4046.7470703125
box loss:4.194526195526123
cls loss:5035.4296875
box loss:1.036190152168274
cls loss:1054.8594970703125
box loss:1.1812

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f04cc302e10>>
Traceback (most recent call last):
  File "/home/faces2.0/.conda/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


box loss:0.662362277507782
cls loss:2507.304931640625
box loss:0.586776614189148
cls loss:1486.951904296875
box loss:0.4254346489906311
cls loss:1295.425537109375
box loss:1.0042507648468018
cls loss:2928.48291015625
------------------------------
Batch 140: Avg Loss = 2341.504181
------------------------------
box loss:0.4239314794540405
cls loss:1207.3135986328125
box loss:0.44571420550346375
cls loss:5920.3251953125
box loss:2.1272997856140137
cls loss:2449.819580078125
box loss:0.4631236791610718
cls loss:1984.474609375
DataLoader worker (pid(s) 3123, 3124, 3125, 3126) exited unexpectedly
Allocated: 1.85 GB
Cached: 2.57 GB
Allocated: 1.85 GB
Cached: 2.57 GB


In [40]:
loss = Lossfunction()
extractor.eval()
topdown.eval()
classifier.eval()
bboxregression.eval()
accumulation_steps = 4
for batch_idx, (image, bbox) in enumerate(validation_data):

    with autocast('cuda'):
        model_pred = forward(image.cuda())
        ll = loss(model_pred, bbox) / accumulation_steps  # Scale loss
    del image , model_pred , bbox
    print(ll)

tensor(0.2135, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2135, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2264, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2173, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2080, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2256, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2224, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2076, device='cuda:0', grad_fn=<DivBackward0>)


KeyboardInterrupt: 

In [19]:
torch.cuda.empty_cache()
print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

Allocated: 2.11 GB
Cached: 2.21 GB


In [20]:
data[1000][0].unsqueeze(0).shape

torch.Size([1, 3, 640, 640])

In [28]:
s=forward(data[100][0].unsqueeze(0).cuda())

In [29]:
t= data[100][1]

In [31]:
t[1]['bbox_targets'].shape

torch.Size([19200, 4])

In [19]:
(s['3']['bbox'].cpu()-t[3]['bbox_targets']).sum()


tensor(111445.5469, grad_fn=<SumBackward0>)

In [28]:
(s['3']['cls'].cpu()>0.55).sum()

tensor(613)

In [31]:
s['3']['cls'].sum()

tensor(163553.8438, device='cuda:0', grad_fn=<SumBackward0>)

In [30]:
t[0]['cls_targets'].sum()

tensor(0)

In [30]:
ls = loss(s,t)

box loss:0.19899043440818787
cls loss:78787.1875


In [35]:
ls

tensor(0.9902, device='cuda:0', grad_fn=<AddBackward0>)

In [26]:
s['18']['bbox'].squeeze(0).cpu()


tensor([[0.4933, 0.5469, 0.4821, 0.4534],
        [0.4507, 0.4666, 0.4542, 0.4803],
        [0.4810, 0.4796, 0.4658, 0.4742],
        ...,
        [0.4389, 0.0000, 0.4425, 0.0434],
        [0.4311, 0.0000, 0.4414, 0.0056],
        [0.4147, 0.0000, 0.3418, 0.0000]], grad_fn=<ToCopyBackward0>)

In [None]:

(data[1000][1][0]['bbox_targets'] - s['18']['bbox'].squeeze(0).cpu())


tensor(-0.1751, grad_fn=<MeanBackward0>)

In [80]:
model3 = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=True)
model3 = model3.features.to(device)

Using cache found in /home/.cache/torch/hub/pytorch_vision_v0.10.0


In [None]:
torch.all(list(extractor.parameters()) == list(model3.parameters()))

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [19]:
torch.save(extractor.state_dict(),'/home/faces2.0/models/extractor.pt')
torch.save(topdown.state_dict(),'/home/faces2.0/models/topdown.pt')
torch.save(classifier.state_dict(),'/home/faces2.0/models/classifier.pt')
torch.save(bboxregression.state_dict(),'/home/faces2.0/models/bboxregression.pt')

