In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.multiprocessing as mp
from FPN import Features, FPNetwork , classificationhead , bboxhead
from Loss import Lossfunction
from datasets import load_dataset
import torch.optim as optim
from torch.cuda.amp import  GradScaler
from torch.amp import autocast
import gc
from dataset_convert import AnchorGenerator, FaceDetectionDataset
# device = torch.device("mps")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
mp.set_start_method('spawn', force=True)

In [3]:
model = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=True)
model = model.features.to(device)

Using cache found in /home/.cache/torch/hub/pytorch_vision_v0.10.0


In [28]:
device

device(type='cuda')

In [4]:
dataset = load_dataset("CUHK-CSE/wider_face")
train_dataset = dataset['train'].with_format("torch")
val_dataset = dataset['validation'].with_format("torch")

In [5]:
extractor = Features(model,['3','6', '13','18'])
topdown = FPNetwork(out_channels=256)
classifier = classificationhead(channels=256, num_anchors= 12, num_of_classes= 1)
bboxregression = bboxhead(channels= 256 , num_anchors= 12)
loss =Lossfunction()
anchors = AnchorGenerator()
data = FaceDetectionDataset(train_dataset,anchors)
vali_data = FaceDetectionDataset(val_dataset,anchors)

In [31]:
torch.cuda.empty_cache()
print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

Allocated: 0.03 GB
Cached: 0.05 GB


In [32]:
print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

Allocated: 0.03 GB
Cached: 0.05 GB


In [6]:
# batch_size = 16
# epochs = 20
# learning_rate = 1e-3
def forward(p):
    features = extractor.extract(p)
    newfeatures = topdown(features)
    output = {}
    for key in list(newfeatures.keys()):
        temp = {}
        temp["bbox"] = bboxregression(newfeatures[key])
        temp["cls"] = classifier(newfeatures[key])
        output[key] = temp
    return output

def train(epochs:int,training_data , validation_data=None):
    extractor.train()
    topdown.train()
    classifier.train()
    bboxregression.train()
    optimizer = optim.Adam(list(extractor.parameters())+
                           list(topdown.parameters())+
                           list(classifier.parameters())+
                           list(bboxregression.parameters()), lr=learning_rate)
    loSS = {}
    for i in range (0,epochs):
        train_loss= 0
        for key, (image,bbox) in enumerate(training_data):
            epoch_loss = {}
            model_pred = forward(image)
            optimizer.zero_grad()
            ll = loss(model_pred, bbox)
            train_loss+=ll
            if (key % 10 == 0):
                epoch_loss[key] = train_loss/10
                print(f"The avg loss for {key}th data is {train_loss/10}")
                train_loss = 0    
            ll.backward()
            optimizer.step()
        loSS[i] = epoch_loss
    return loSS

# train(epochs=epochs,training_data=training_data)


In [7]:
total_params=sum(p.numel() for p in extractor.parameters())
total_params+= sum(p.numel() for p in topdown.parameters())
total_params+= sum(p.numel() for p in classifier.parameters())
total_params+= sum(p.numel() for p in bboxregression.parameters())

print(total_params)

7452536


In [11]:
scaler = GradScaler()

  scaler = GradScaler()


In [8]:
def train_with_accumulation(epochs: int, training_data, validation_data=None, accumulation_steps=4):
    # Set models to training mode
    extractor.train()
    topdown.train()
    classifier.train()
    bboxregression.train()
    
    # Enable gradient checkpointing
    if hasattr(extractor, 'gradient_checkpointing_enable'):
        extractor.gradient_checkpointing_enable()
    if hasattr(topdown, 'gradient_checkpointing_enable'):
        topdown.gradient_checkpointing_enable()
    
    optimizer = optim.Adam(
        list(extractor.parameters()) +
        list(topdown.parameters()) +
        list(classifier.parameters()) +
        list(bboxregression.parameters()), 
        lr=learning_rate
    )
    
    loss_history = {}
    
    for epoch in range(epochs):
        epoch_loss = 0.0
        batch_count = 0
        running_loss = 0.0
        
        print(f"\nEpoch {epoch + 1}/{epochs}")
        print("-" * 30)
        try:
            for batch_idx, (image, bbox) in enumerate(training_data):
                # Mixed precision forward pass
                optimizer.zero_grad()
                
                with autocast('cuda'):
                    model_pred = forward(image.cuda())
                    ll = loss(model_pred, bbox) / accumulation_steps  # Scale loss
                del image , model_pred , bbox
                # Backward pass
                scaler.scale(ll).backward()
                
                # Step optimizer every accumulation_steps
                if (batch_idx + 1) % accumulation_steps == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                
                # Accumulate loss
                batch_loss = ll.item() * accumulation_steps  # Unscale for logging
                epoch_loss += batch_loss
                running_loss += batch_loss
                batch_count += 1
                
                # Print progress
                if (batch_idx + 1) % 10 == 0:
                    avg_running_loss = running_loss / 10
                    print(f"Batch {batch_idx + 1}: Avg Loss = {avg_running_loss:.6f}")
                    running_loss = 0.0
                
                # Memory 
                del ll
                torch.cuda.empty_cache()
                gc.collect()
        except Exception as e :
            print (e)
            print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
            print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
            torch.cuda.empty_cache()
            print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
            print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
            break

        except KeyboardInterrupt:
            print("INTERRUPTED!!")
            return loss_history   

                
        if batch_count % accumulation_steps != 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        
        avg_epoch_loss = epoch_loss / batch_count if batch_count > 0 else 0
        loss_history[epoch] = avg_epoch_loss
        
        print(f"Epoch {epoch + 1} Average Loss: {avg_epoch_loss:.6f}")
        
        # Cleanup at epoch end
        gc.collect()
        torch.cuda.empty_cache()
    
    return loss_history


In [9]:
torch.cuda.empty_cache()
batch_size = 8
epochs = 5
learning_rate = 1e-3
training_data = DataLoader(
    data,
    batch_size=batch_size, 
    num_workers=4,     # Reduce if GPU memory is full
    pin_memory=True,         # Faster CPU->GPU transfer
    persistent_workers=True, # Reuse workers across epochs
    prefetch_factor=4,       # Prefetch more batches
    drop_last=True,
    shuffle=True
)
validation_data = DataLoader(vali_data,
    batch_size=batch_size, 
    num_workers=4,     # Reduce if GPU memory is full
    pin_memory=True,         # Faster CPU->GPU transfer
    persistent_workers=True, # Reuse workers across epochs
    prefetch_factor=4,       # Prefetch more batches
    drop_last=True,
    shuffle=True
)

In [None]:
for batch_idx, (image, bbox) in enumerate(training_data):
    print(batch_idx)

In [12]:
lossdata = train_with_accumulation(epochs = 1, training_data = training_data)


Epoch 1/1
------------------------------


Batch 10: Avg Loss = 4.635370
Batch 20: Avg Loss = 3.847629
Batch 30: Avg Loss = 3.065225
Batch 40: Avg Loss = 2.512710
Batch 50: Avg Loss = 2.583414
Batch 60: Avg Loss = 2.599735
Batch 70: Avg Loss = 2.237441
Batch 80: Avg Loss = 1.873564
Batch 90: Avg Loss = 1.842454
Batch 100: Avg Loss = 1.843941
Batch 110: Avg Loss = 1.896948
Batch 120: Avg Loss = 1.933510
Batch 130: Avg Loss = 1.836959
Batch 140: Avg Loss = 1.669875
Batch 150: Avg Loss = 1.826366
Batch 160: Avg Loss = 1.541771
Batch 170: Avg Loss = 1.756321
Batch 180: Avg Loss = 1.539908
Batch 190: Avg Loss = 1.407302
Batch 200: Avg Loss = 1.472674
Batch 210: Avg Loss = 1.366209
Batch 220: Avg Loss = 1.345408
Batch 230: Avg Loss = 1.308168
Batch 240: Avg Loss = 1.286006
Batch 250: Avg Loss = 1.251661
Batch 260: Avg Loss = 1.213987
Batch 270: Avg Loss = 1.206796
INTERRUPTED!!


In [40]:
loss = Lossfunction()
extractor.eval()
topdown.eval()
classifier.eval()
bboxregression.eval()
accumulation_steps = 4
for batch_idx, (image, bbox) in enumerate(validation_data):

    with autocast('cuda'):
        model_pred = forward(image.cuda())
        ll = loss(model_pred, bbox) / accumulation_steps  # Scale loss
    del image , model_pred , bbox
    print(ll)

tensor(0.2135, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2135, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2264, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2173, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2080, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2256, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2224, device='cuda:0', grad_fn=<DivBackward0>)
tensor(0.2076, device='cuda:0', grad_fn=<DivBackward0>)


KeyboardInterrupt: 

In [19]:
torch.cuda.empty_cache()
print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

Allocated: 2.11 GB
Cached: 2.21 GB


In [20]:
data[1000][0].unsqueeze(0).shape

torch.Size([1, 3, 640, 640])

In [24]:
p =(s['18']['cls'].squeeze(0).squeeze(0).T>0.5).cpu().sum()

In [25]:
p

tensor(4036)

In [13]:
s=forward(data[100][0].unsqueeze(0).cuda())

In [20]:
t= vali_data[1200][1]

In [None]:
data[100][1][2]['bbox']

{'cls_targets': tensor([[0],
         [0],
         [0],
         ...,
         [0],
         [0],
         [0]]),
 'bbox_targets': tensor([[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]),
 'bbox_weights': tensor([0., 0., 0.,  ..., 0., 0., 0.])}

In [24]:
(s['13']['bbox'])


tensor([[[0.2843, 0.0000, 0.2334, 0.1365],
         [0.2856, 0.0000, 0.2586, 0.0759],
         [0.2784, 0.0000, 0.2265, 0.0926],
         ...,
         [0.0000, 0.2042, 0.2086, 0.2275],
         [0.0000, 0.2196, 0.2200, 0.2427],
         [0.0000, 0.1941, 0.1840, 0.2735]]], device='cuda:0',
       grad_fn=<ViewBackward0>)

In [30]:
t[0]['cls_targets'].sum()

tensor(0)

In [21]:
ls = loss(s,t)

In [22]:
ls

tensor(0.5198, device='cuda:0', grad_fn=<AddBackward0>)

In [26]:
s['18']['bbox'].squeeze(0).cpu()


tensor([[0.4933, 0.5469, 0.4821, 0.4534],
        [0.4507, 0.4666, 0.4542, 0.4803],
        [0.4810, 0.4796, 0.4658, 0.4742],
        ...,
        [0.4389, 0.0000, 0.4425, 0.0434],
        [0.4311, 0.0000, 0.4414, 0.0056],
        [0.4147, 0.0000, 0.3418, 0.0000]], grad_fn=<ToCopyBackward0>)

In [None]:

(data[1000][1][0]['bbox_targets'] - s['18']['bbox'].squeeze(0).cpu())


tensor(-0.1751, grad_fn=<MeanBackward0>)

In [80]:
model3 = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=True)
model3 = model3.features.to(device)

Using cache found in /home/.cache/torch/hub/pytorch_vision_v0.10.0


In [None]:
torch.all(list(extractor.parameters()) == list(model3.parameters()))

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [75]:
torch.save(extractor.state_dict(),'/home/faces2.0/models/extractor.pt')
torch.save(topdown.state_dict(),'/home/faces2.0/models/topdown.pt')
torch.save(classifier.state_dict(),'/home/faces2.0/models/classifier.pt')
torch.save(bboxregression.state_dict(),'/home/faces2.0/models/bboxregression.pt')

