In [None]:
import subprocess
import os
import signal
import re

def get_gpu_processes():
    """Return the PIDs that ``nvidia-smi`` reports as GPU compute applications.

    Runs ``nvidia-smi --query-compute-apps=pid --format=csv,noheader`` and
    reads one PID per output line.

    Returns:
        list[int]: Unique PIDs in the order they were first reported. The list
        is empty when ``nvidia-smi`` is not installed or when its output
        contains no numeric lines (for example an error banner).
    """
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-compute-apps=pid', '--format=csv,noheader'],
            stdout=subprocess.PIPE,
        )
    except FileNotFoundError:
        # No NVIDIA tooling on this machine: nothing can be queried.
        print("nvidia-smi was not found; assuming no GPU processes.")
        return []

    output = result.stdout.decode('utf-8', errors='replace')

    pids = []
    for line in output.splitlines():
        token = line.strip()
        # Skip blank lines and any diagnostic text; only bare decimal tokens
        # are PIDs. This also rejects signed values, which os.kill would
        # interpret as process groups.
        if not token.isdecimal():
            continue
        pid = int(token)
        # The same PID is listed once per GPU it uses; report it only once.
        if pid not in pids:
            pids.append(pid)
    return pids

def kill_processes(pids):
    """Send SIGKILL to every PID in ``pids`` and print the outcome for each.

    A failure for one PID is reported on stdout and does not stop the
    remaining PIDs from being processed.
    """
    for target in pids:
        try:
            os.kill(target, signal.SIGKILL)
            outcome = f"Process {target} has been killed."
        except Exception as err:
            outcome = f"Could not kill process {target}: {err}"
        print(outcome)

if __name__ == "__main__":
    # Free the GPUs before training: list every process nvidia-smi reports
    # as a compute application and force-kill each one.
    # NOTE(review): if this kernel's own PID is in the list it is killed too.
    gpu_pids = get_gpu_processes()
    if not gpu_pids:
        print("No GPU processes found.")
    else:
        print(f"Found processes running on GPU: {gpu_pids}")
        kill_processes(gpu_pids)


In [None]:
import os
import tensorflow as tf
from object_detection.utils import config_util
from object_detection.builders import model_builder
from object_detection.utils import label_map_util
from object_detection.data_decoders.tf_example_decoder import TfExampleDecoder
from requests.adapters import HTTPAdapter
from requests.sessions import Session
import tensorflow as tf
from object_detection import model_lib_v2
import neptune.new as neptune
from collections import defaultdict

# Turn on memory growth for every GPU TensorFlow can see. `gpus` and `gpu`
# stay defined as notebook-level names after this cell runs.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

def run_training_and_evaluation(run, config_file, model_dir,
                                config_dir="/home/rk42218/Building_Detection/Config_v2/",
                                num_steps=2000, checkpoint_every_n=200):
    """Train one model with ``model_lib_v2.train_loop`` under a MirroredStrategy.

    Despite its name, this function only trains; it performs no evaluation.

    Args:
        run: Neptune run object, forwarded to ``train_loop`` as ``neptune_run``.
        config_file: Pipeline config file name, joined onto ``config_dir``.
            An absolute path is also accepted: ``os.path.join`` discards
            ``config_dir`` when its second argument is absolute.
        model_dir: Directory handed to ``train_loop`` as ``model_dir``.
        config_dir: Directory that holds the pipeline config files.
        num_steps: Value forwarded to ``train_loop`` as ``num_steps``.
        checkpoint_every_n: Value forwarded to ``train_loop`` as
            ``checkpoint_every_n``.
    """
    pipeline_config_path = os.path.join(config_dir, config_file)
    print(pipeline_config_path)

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        print("Starting training...")
        # NOTE(review): the stock TF Object Detection API names the step
        # budget `train_steps` and has no `neptune_run` argument; confirm the
        # local model_lib_v2 honours `num_steps` and `neptune_run`.
        model_lib_v2.train_loop(
            pipeline_config_path=pipeline_config_path,
            model_dir=model_dir,
            use_tpu=False,
            num_steps=num_steps,
            checkpoint_every_n=checkpoint_every_n,
            record_summaries=True,
            neptune_run=run,
        )

def _f1_score(precision, recall):
    """Return the harmonic mean of precision and recall, or 0.0 if both are 0."""
    total = precision + recall
    return 2 * precision * recall / total if total > 0 else 0.0


def _average_precision(recalls, precisions):
    """Integrate the precision envelope over recall.

    ``recalls`` must be non-decreasing. The curve is padded with (0, 0) and
    (1, 0), precision is made monotonically non-increasing from right to
    left, and the area under the resulting step function is returned.
    """
    mrec = [0.0] + list(recalls) + [1.0]
    mpre = [0.0] + list(precisions) + [0.0]
    for i in range(len(mpre) - 2, -1, -1):
        mpre[i] = max(mpre[i], mpre[i + 1])
    return sum((mrec[i] - mrec[i - 1]) * mpre[i] for i in range(1, len(mrec)))


def _ranked_pr_curve(scored_matches, total_gt):
    """Build cumulative (recalls, precisions) from (score, is_true_positive) pairs.

    Detections are ranked by descending score; ``total_gt`` must be > 0.
    """
    recalls, precisions = [], []
    hits = 0
    ranked = sorted(scored_matches, key=lambda match: match[0], reverse=True)
    for rank, (_, is_tp) in enumerate(ranked, start=1):
        hits += bool(is_tp)
        recalls.append(hits / total_gt)
        precisions.append(hits / rank)
    return recalls, precisions


def _match_detections(iou_rows, iou_threshold):
    """Greedily assign detections to ground-truth boxes.

    ``iou_rows[i][j]`` is the IoU of detection ``i`` with ground-truth box
    ``j``; rows must already be ordered by descending detection score. Each
    ground-truth box can be claimed by at most one detection.

    Returns:
        list[bool]: one flag per detection, True when it is a true positive.
    """
    claimed = set()
    flags = []
    for row in iou_rows:
        best_idx, best_iou = None, iou_threshold
        for gt_idx, value in enumerate(row):
            if gt_idx not in claimed and value >= best_iou:
                best_idx, best_iou = gt_idx, value
        if best_idx is None:
            flags.append(False)
        else:
            claimed.add(best_idx)
            flags.append(True)
    return flags


def _pairwise_iou(boxes_a, boxes_b):
    """Return the [A, B] IoU matrix for boxes_a [A, 4] and boxes_b [B, 4].

    Both inputs use [ymin, xmin, ymax, xmax] ordering. Pairs whose union area
    is zero yield 0 instead of NaN.
    """
    ymin_a, xmin_a, ymax_a, xmax_a = tf.split(boxes_a, 4, axis=-1)  # each [A, 1]
    ymin_b, xmin_b, ymax_b, xmax_b = [
        tf.transpose(edge) for edge in tf.split(boxes_b, 4, axis=-1)  # each [1, B]
    ]
    inter_h = tf.maximum(tf.minimum(ymax_a, ymax_b) - tf.maximum(ymin_a, ymin_b), 0.0)
    inter_w = tf.maximum(tf.minimum(xmax_a, xmax_b) - tf.maximum(xmin_a, xmin_b), 0.0)
    inter_area = inter_h * inter_w
    area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a)
    area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b)
    return tf.math.divide_no_nan(inter_area, area_a + area_b - inter_area)


def _checkpoint_names(ckpt_dir):
    """List checkpoint prefixes (``name-<step>``) in ``ckpt_dir`` by ascending step.

    Only ``*.index`` files whose name ends in ``-<digits>`` are considered.
    """
    found = []
    for file_name in os.listdir(ckpt_dir):
        if not file_name.endswith('.index'):
            continue
        name = file_name[:-len('.index')]
        step = name.rsplit('-', 1)[-1]
        if '-' in name and step.isdecimal():
            found.append((int(step), name))
    return [name for _, name in sorted(found)]


def run_evaluation(run, pipeline_config_path, model_dir,
                   test_record_path='/home/rk42218/DATA_SET_1024/data_split/val/val.record',
                   score_threshold=0.01, iou_threshold=0.1):
    """Evaluate every checkpoint in ``model_dir`` on a TFRecord and log to Neptune.

    For each checkpoint the model is restored and run on every record, one
    image per batch. Detections scoring above ``score_threshold`` are matched
    greedily, highest score first, to ground-truth boxes; a match needs
    IoU >= ``iou_threshold`` and each ground-truth box is matched at most once.

    Logged per image, only when the image has both detections and
    ground-truth boxes:
        metrics/<ckpt>/precision, recall, f1_score, and
        metrics/<ckpt>/iou_scores (best IoU achieved for each ground-truth box).
    Logged per checkpoint, when at least one image was scored:
        metrics/mean_iou (mean of the per-box best IoUs) and
        metrics/average_precision (area under the score-ranked
        precision/recall curve built from all images).

    Args:
        run: Neptune run used for logging.
        pipeline_config_path: Pipeline config from which the model is built.
        model_dir: Directory containing the ``*.index`` checkpoint files.
        test_record_path: TFRecord file holding the evaluation examples.
        score_threshold: Detections at or below this score are ignored.
        iou_threshold: Minimum IoU for a detection to count as a true positive.
    """
    print("Starting Evaluation...")

    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    detection_model = model_builder.build(model_config=configs['model'], is_training=False)

    # Only the features that are actually read are parsed.
    feature_description = {
        'image/encoded': tf.io.FixedLenFeature([], tf.string),
        'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
        'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
    }

    def parse_tfrecord_fn(example_proto):
        example = tf.io.parse_single_example(example_proto, feature_description)
        image = tf.image.decode_jpeg(example['image/encoded'], channels=3)
        # Keep pixel values in [0, 255]: the model's own preprocess() does the
        # normalisation, so the image must not be rescaled to [0, 1] here.
        image = tf.cast(image, tf.float32)
        image = tf.image.resize(image, [1024, 1024])
        # Assemble the boxes here, in one place, so the coordinate order
        # cannot drift between the parser and its consumer.
        gt_boxes = tf.stack(
            [
                tf.sparse.to_dense(example['image/object/bbox/ymin']),
                tf.sparse.to_dense(example['image/object/bbox/xmin']),
                tf.sparse.to_dense(example['image/object/bbox/ymax']),
                tf.sparse.to_dense(example['image/object/bbox/xmax']),
            ],
            axis=-1,
        )
        return image, gt_boxes

    test_dataset = tf.data.TFRecordDataset(test_record_path)
    test_dataset = test_dataset.map(parse_tfrecord_fn)
    test_dataset = test_dataset.batch(1)

    @tf.function
    def detect_fn(image_tensor):
        # predict() must receive the preprocessed tensor, not the raw input.
        preprocessed, shapes = detection_model.preprocess(image_tensor)
        prediction_dict = detection_model.predict(preprocessed, shapes)
        return detection_model.postprocess(prediction_dict, shapes)

    mean_iou_metric_path = "metrics/mean_iou"
    average_precision_metric_path = "metrics/average_precision"

    for ckpt_name in _checkpoint_names(model_dir):
        print("Evaluating checkpoint:", ckpt_name + '.index')
        ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
        ckpt.restore(os.path.join(model_dir, ckpt_name)).expect_partial()

        run["metrics/checkpoint_marker"].log("Checkpoint: " + ckpt_name)

        best_ious = []       # best IoU per ground-truth box, over scored images
        scored_matches = []  # (score, is_true_positive) for every kept detection
        total_gt = 0         # ground-truth boxes seen across all images

        batch_count = 0
        for image, gt_batch in test_dataset:
            detections = detect_fn(image)
            scores = detections['detection_scores'][0]
            boxes = detections['detection_boxes'][0]

            confident = scores > score_threshold
            scores = tf.boolean_mask(scores, confident)
            boxes = tf.boolean_mask(boxes, confident)
            ranking = tf.argsort(scores, direction='DESCENDING')
            pred_scores = tf.gather(scores, ranking).numpy().tolist()
            pred_boxes = tf.gather(boxes, ranking)  # [K, 4]

            # NOTE(review): assumes detection_boxes and the TFRecord boxes share
            # the same normalised [ymin, xmin, ymax, xmax] convention — confirm.
            gt_boxes = gt_batch[0]  # drop the batch dimension -> [N, 4]
            num_gt = int(gt_boxes.shape[0])
            total_gt += num_gt

            if pred_scores and num_gt:
                iou_rows = _pairwise_iou(pred_boxes, gt_boxes).numpy().tolist()
                is_tp = _match_detections(iou_rows, iou_threshold)
                true_positives = sum(is_tp)
                precision = true_positives / len(is_tp)
                recall = true_positives / num_gt
                f1_score = _f1_score(precision, recall)
                best_iou_per_gt = [max(column) for column in zip(*iou_rows)]
                best_ious.extend(best_iou_per_gt)

                run[f"metrics/{ckpt_name}/precision"].log(precision)
                run[f"metrics/{ckpt_name}/recall"].log(recall)
                run[f"metrics/{ckpt_name}/f1_score"].log(f1_score)
                run[f"metrics/{ckpt_name}/iou_scores"].log(best_iou_per_gt)
            else:
                # With no ground truth every detection is a false positive;
                # with no detections there is nothing to record.
                is_tp = [False] * len(pred_scores)

            scored_matches.extend(zip(pred_scores, is_tp))

            batch_count += 1
            print(f"Processed batch {batch_count} for checkpoint {ckpt_name} for {pipeline_config_path}")

        print(f"Total batches processed for checkpoint {ckpt_name}: {batch_count} for {pipeline_config_path}")

        if best_ious:
            mean_iou = sum(best_ious) / len(best_ious)
            recalls, precisions = _ranked_pr_curve(scored_matches, total_gt)
            average_precision = _average_precision(recalls, precisions)
            run[mean_iou_metric_path].log(mean_iou)
            run[average_precision_metric_path].log(average_precision)

    run["model/config"].log(str(pipeline_config_path))
        
def main():
    """Train every ``*.config`` pipeline in the config directory, one Neptune run each.

    Configs listed in ``config_to_run_id`` resume the corresponding existing
    Neptune run; any other config starts a new run tagged with its name. Each
    run is stopped even if training raises.

    Raises:
        RuntimeError: If the ``NEPTUNE_API_TOKEN`` environment variable is unset.
    """
    neptune_project = "ruthikkale27/Building-Detection-Train-Eval01"
    # Credentials must not live in the notebook; read the token from the
    # environment. Any token previously committed here should be revoked.
    api_token = os.environ.get("NEPTUNE_API_TOKEN")
    if not api_token:
        raise RuntimeError(
            "Set the NEPTUNE_API_TOKEN environment variable before running."
        )
    config_dir = "/home/rk42218/Building_Detection/Config_v2/"
    model_dir_path = "/scratch/rk42218/Building_Detection_scratch/training_output_new_Config_v2/"

    # Config file name -> id of the existing Neptune run to resume.
    config_to_run_id = {
        "4000_32_5e6_v2.config": "TRAIN-85",
        "4000_32_5e7_v2.config": "TRAIN-84",
        "2000_32_2e6_v2.config": "TRAIN-83",
        "2000_32_2e7_v2.config": "TRAIN-82",
        "2000_32_5e7_v1.config": "TRAIN-81",
        "2000_32_5e6_v1.config": "TRAIN-80",
        "4000_32_2e7_v1.config": "TRAIN-79",
        "4000_32_2e6_v1.config": "TRAIN-78",
        "4000_32_2e6_v2.config": "TRAIN-77",
        "4000_32_2e7_v2.config": "TRAIN-76",
        "2000_32_5e6_v2.config": "TRAIN-75",
        "2000_32_5e7_v2.config": "TRAIN-74",
        "2000_32_2e7_v1.config": "TRAIN-73",
        "2000_32_2e6_v1.config": "TRAIN-72",
        "4000_32_5e7_v1.config": "TRAIN-71",
        "4000_32_5e6_v1.config": "TRAIN-70"
    }

    # os.listdir order is arbitrary; sort so repeated executions process the
    # configs in the same order.
    for config_file in sorted(os.listdir(config_dir)):
        if not config_file.endswith('.config'):
            continue

        config_name = config_file[:-len('.config')]
        run_id = config_to_run_id.get(config_file)
        print(f"Run ID for {config_file}: {run_id}")  # Debug output

        if run_id:
            run = neptune.init_run(
                project=neptune_project,
                api_token=api_token,
                with_id=run_id  # Resume an existing run
            )
        else:
            run = neptune.init_run(
                project=neptune_project,
                api_token=api_token,
                tags=['new_run', config_name]  # Tagging new run with config name
            )
            print(f"Starting new Neptune run for {config_file}")

        try:
            pipeline_config_path = os.path.join(config_dir, config_file)
            model_dir = os.path.join(model_dir_path, config_name)
            print("Processing config:", config_file)
            print("Model directory:", model_dir)

            # The full path is passed on purpose: it is absolute, so it stays
            # valid even though the callee joins it onto its own config dir.
            run_training_and_evaluation(run, pipeline_config_path, model_dir)
            # Evaluation is currently disabled; to re-enable it call:
            # run_evaluation(run, pipeline_config_path, model_dir)
        finally:
            # Always close the Neptune run, including when training fails.
            run.stop()

# Entry point: run the whole training sweep when this cell/script is executed.
if __name__ == "__main__":
    main()