In [None]:
# Data links
#https://www.kaggle.com/competitions/rsna-2024-lumbar-spine-degenerative-classification
#https://www.kaggle.com/datasets/namgalielei/ldsc-metadata
#https://www.kaggle.com/code/namgalielei/lsdc-fold-split

In [None]:
%%writefile scs_yolo_generate.py
import os
import pandas as pd
import numpy as np
import pydicom
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import cv2
import glob

# Root of the DICOM files; slices are read from IMG_DIR/<study_id>/<series_id>/<instance_number>.dcm.
IMG_DIR = "/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images"

# Fold indices for which a dataset directory (data_fold<k>) is generated.
FOLDS = [0,1]
# Box size: every YOLO box gets the normalised side STD_BOX_SIZE / OD_INPUT_SIZE.
OD_INPUT_SIZE = 768
STD_BOX_SIZE = 56
# Optional number of train.csv rows to subsample; a falsy value (None, 0) keeps all rows.
SAMPLE = None
# Conditions kept for this detector; with LEVELS and SEVERITIES they define the class list.
CONDITIONS = ['Spinal Canal Stenosis']
SEVERITIES = ['Normal/Mild', 'Moderate', 'Severe']
LEVELS = ['l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1']

# rm -rf val_fold0

# train.csv: study_id first, then one severity column per condition/level (see the unpivot below).
train_val_df = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train.csv')
# Point annotations (x, y) keyed by study_id, series_id, instance_number, condition and level.
train_xy = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_label_coordinates.csv')
# Per-series metadata, joined to train_xy on (study_id, series_id).
train_des = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_series_descriptions.csv')

if SAMPLE:
    # Fixed random_state keeps the subsample reproducible.
    train_val_df = train_val_df.sample(SAMPLE, random_state=2698)

# Fold assignment per study; joined on study_id and later filtered through its `fold` column.
fold_df = pd.read_csv('/kaggle/input/lsdc-fold-split/5folds.csv')

# Notebook leftover: the result is discarded when this file runs as a script.
train_xy.head(3)

def get_level(text):
    """Extract the spinal level from a train.csv column name.

    Args:
        text: Column name containing one of the level tokens, e.g.
            'spinal_canal_stenosis_l1_l2'.

    Returns:
        The level in 'L1/L2' form (both parts capitalised, joined by '/').

    Raises:
        ValueError: If no known level token occurs in ``text``. The message names
            the offending text rather than the last level that was tried.
    """
    for lev in ('l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1'):
        if lev in text:
            upper, lower = lev.split('_')
            return f'{upper.capitalize()}/{lower.capitalize()}'
    raise ValueError(f'Level not found in {text!r}')
    
def get_condition(text):
    """Turn a train.csv column name into a condition name.

    The name is split on '_', the last two pieces (the level, e.g. 'l1', 'l2') are
    dropped, and the remaining words are capitalised and joined with spaces, e.g.
    'spinal_canal_stenosis_l1_l2' -> 'Spinal Canal Stenosis'.
    """
    condition_words = text.split('_')[:-2]
    return ' '.join(word.capitalize() for word in condition_words)
#     raise ValueError('Condition not found '+ lev)

train_xy['condition'].unique()

# Unpivot train.csv: one record per (study, label column), with the column name
# decoded into a level and a condition that match the spelling used in train_xy.
label_records = [
    {
        'study_id': study_row['study_id'],
        'level': get_level(col_name),
        'condition': get_condition(col_name),
        'label': severity,
    }
    for _, study_row in train_val_df.iterrows()
    for col_name, severity in study_row.iloc[1:].items()
]
label_df = pd.DataFrame(label_records, columns=['study_id', 'condition', 'level', 'label'])

# Attach the fold of each study.
label_df = label_df.merge(fold_df, on='study_id')

# Attach series metadata to the coordinates, then the coordinates to the labels;
# the inner joins drop labels that have no annotated point.
train_xy = train_xy.merge(train_des, how='inner', on=['study_id', 'series_id'])
label_df = label_df.merge(train_xy, how='inner', on=['study_id', 'condition', 'level'])

# label_df[label_df.series_id.isna()]

# cnt = train_xy.groupby(['study_id', 'series_id', 'instance_number'])['condition'].nunique()

# cnt[cnt>1]

def query_train_xy_row(study_id, series_id=None, instance_num=None):
    """Return the rows of the module-level ``label_df`` matching the given keys.

    Args:
        study_id: Study to select (always applied).
        series_id: Optional series filter; ignored when None.
        instance_num: Optional instance_number filter; ignored when None.

    Returns:
        The matching subset of ``label_df``.

    Every mask is built from ``label_df`` itself. The series-only case previously
    took its study mask from ``train_xy``, a different frame with a different index.
    """
    mask = label_df.study_id == study_id
    if series_id is not None:
        mask &= label_df.series_id == series_id
    if instance_num is not None:
        mask &= label_df.instance_number == instance_num
    return label_df[mask]

# import os

# def count_dcm_files(directory):
#     dcm_count = 0
#     for root, dirs, files in os.walk(directory):
#         for file in files:
#             if file.endswith('.dcm'):
#                 dcm_count += 1
#     return dcm_count

# dcm_files_count = count_dcm_files(IMG_DIR)

# print(f"Number of .dcm files: {dcm_files_count}")

def read_dcm(src_path):
    """Read a DICOM file and return it as a uint8 image with three identical channels.

    The pixel data is min-max scaled to the 0-255 range (the 1e-6 term in the
    denominator avoids a division by zero on constant images) and replicated along
    a new last axis before the cast to uint8.
    """
    pixels = pydicom.dcmread(src_path).pixel_array
    lo, hi = pixels.min(), pixels.max()
    scaled = (pixels - lo) / (hi - lo + 1e-6) * 255
    return np.stack([scaled, scaled, scaled], axis=-1).astype('uint8')

def get_accronym(text):
    """Return the initials of the whitespace-separated words in ``text``.

    Example: 'Spinal Canal Stenosis' -> 'SCS'. Splitting on arbitrary whitespace
    means empty input or repeated spaces yield no empty words, so indexing the first
    character cannot fail.
    """
    return ''.join(word[0] for word in text.split())

# study_id = 4003253 
# series_id = 2448190387
# instance_num = 28

# Sanity check: pick one random annotated slice and draw its labelled points.
ex = label_df.sample(1).iloc[0]
study_id, series_id, instance_num = ex.study_id, ex.series_id, ex.instance_number

WIDTH = 10  # half side length (pixels) of the square drawn around each point

path = os.path.join(IMG_DIR, str(study_id), str(series_id), f'{instance_num}.dcm')
img = read_dcm(path)

# Text rendering settings shared by all annotations.
fontFace = cv2.FONT_HERSHEY_SIMPLEX
fontScale = 0.5
thickness = 1

# Box/text colour per severity; any other label value (e.g. NaN) maps to None.
severity_colors = {
    'Normal/Mild': (0, 255, 0),
    'Moderate': (255, 255, 0),
    'Severe': (255, 0, 0),
}

tmp_df = query_train_xy_row(study_id, series_id, instance_num)
for _, row in tmp_df.iterrows():
    lbl = f"{get_accronym(row['condition'])}_{row['level']}"
    x1, y1 = int(row['x'] - WIDTH), int(row['y'] - WIDTH)
    x2, y2 = int(row['x'] + WIDTH), int(row['y'] + WIDTH)
    color = severity_colors.get(row['label'])
    cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
    cv2.putText(img, lbl, (x1, y1), fontFace, fontScale, color, thickness, cv2.LINE_AA)

tmp_df

plt.imshow(img)
plt.show()

# label_df[['study_id', 'series_id']].drop_duplicates()

# NOTE(review): second definition of read_dcm in this script. Its body matches the
# earlier definition, so rebinding the name here does not change behaviour.
def read_dcm(src_path):
    """Load one DICOM file as a uint8 image with three identical channels."""
    dicom_data = pydicom.dcmread(src_path)
    image = dicom_data.pixel_array
    # Min-max scale to 0..255; the 1e-6 avoids dividing by zero on constant images.
    image = (image - image.min()) / (image.max() - image.min() +1e-6) * 255
    # Replicate the single channel three times on a new last axis, then cast to 8-bit.
    image = np.stack([image]*3, axis=-1).astype('uint8')
    return image

# Keep only the annotations of the conditions this detector is trained on.
filtered_df = label_df[label_df.condition.isin(CONDITIONS)]

# One class per (condition, level, severity), numbered in nested-loop order.
class_names = [
    f"{cond.lower().replace(' ', '_')}_{level}_{severity.lower()}"
    for cond in CONDITIONS
    for level in LEVELS
    for severity in SEVERITIES
]
id2label = dict(enumerate(class_names))
label2id = {cls_name: cls_id for cls_id, cls_name in id2label.items()}

id2label

def gen_yolo_format(ann_df, phase='train'):
    """Export the annotated slices in ``ann_df`` as a YOLO image/label dataset.

    For every (study_id, series_id, instance_number) group one JPEG is written to
    ``OUT_DIR/images/<phase>`` and one text file with the same stem to
    ``OUT_DIR/labels/<phase>``. Each label line is
    ``class_id x_center y_center width height`` with values normalised by the
    image size.

    Args:
        ann_df: DataFrame with the columns study_id, series_id, instance_number,
            condition, level, label, x and y.
        phase: Name of the split sub-directory to write (e.g. 'train' or 'val').

    Reads the module-level names OUT_DIR, IMG_DIR, OD_INPUT_SIZE, STD_BOX_SIZE and
    label2id.
    """
    img_dir = os.path.join(OUT_DIR, 'images', phase)
    ann_dir = os.path.join(OUT_DIR, 'labels', phase)
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(ann_dir, exist_ok=True)

    for name, group in tqdm(ann_df.groupby(['study_id', 'series_id', 'instance_number'])):
        study_id, series_id, instance_num = name

        # When a slice has any Severe annotation, only its Severe rows are exported.
        if "Severe" in group["label"].tolist():
            group = group[group["label"] == "Severe"].reset_index(drop=True)

        # A missing severity cannot be mapped to a class (NaN has no .lower()), so the
        # slice is skipped entirely instead of crashing the export.
        if group["label"].isnull().any():
            continue

        path = f'{IMG_DIR}/{study_id}/{series_id}/{instance_num}.dcm'
        img = read_dcm(path)
        H, W = img.shape[:2]

        stem = f'{study_id}_{series_id}_{instance_num}'
        cv2.imwrite(os.path.join(img_dir, f'{stem}.jpg'), img)

        # Same box for every point on the slice: STD_BOX_SIZE pixels at an
        # OD_INPUT_SIZE-pixel input, expressed as a fraction of the image side.
        width = W / OD_INPUT_SIZE * STD_BOX_SIZE / W
        height = H / OD_INPUT_SIZE * STD_BOX_SIZE / H

        with open(os.path.join(ann_dir, f'{stem}.txt'), 'w') as f:
            for _, row in group.iterrows():
                cond = row['condition']
                level = row['level']
                severity = row['label']
                class_label = f"{cond.lower().replace(' ', '_')}_{level.lower().replace('/', '_')}_{severity.lower()}"
                class_id = label2id[class_label]
                x_center = row['x'] / W
                y_center = row['y'] / H
                f.write(f'{class_id} {x_center} {y_center} {width} {height}\n')

#         break

# gen_yolo_format reads the module-level OUT_DIR, so it is rebound for every fold.
for FOLD in FOLDS:
    print('Gen data fold', FOLD)
    OUT_DIR = f'data_fold{FOLD}'
    os.makedirs(OUT_DIR, exist_ok=True)

    # Studies of the current fold form the validation split, all others the training split.
    is_val = filtered_df.fold == FOLD
    train_df, val_df = filtered_df[~is_val], filtered_df[is_val]

    for phase, phase_df in (('train', train_df), ('val', val_df)):
        gen_yolo_format(phase_df, phase=phase)

In [None]:
# Run the SCS generator; it writes data_fold0/ and data_fold1/ relative to the current directory.
!python scs_yolo_generate.py

In [None]:
# Archive each fold and delete its directory: the other generator scripts write to the
# same data_fold0/data_fold1 names, so leftovers would end up in their archives.
!zip -r -q scs_data_fold0.zip data_fold0

!rm -rf data_fold0
!zip -r -q scs_data_fold1.zip data_fold1

!rm -rf data_fold1

In [None]:
%%writefile ss_yolo_generate.py
import os
import pandas as pd
import numpy as np
import pydicom
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import cv2
import glob

# Root of the DICOM files; slices are read from IMG_DIR/<study_id>/<series_id>/<instance_number>.dcm.
IMG_DIR = "/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images"

# Fold indices for which a dataset directory (data_fold<k>) is generated.
FOLDS = [0,1]
# Box size: every YOLO box gets the normalised side STD_BOX_SIZE / OD_INPUT_SIZE.
OD_INPUT_SIZE = 768
STD_BOX_SIZE = 56
# Optional number of train.csv rows to subsample; a falsy value (None, 0) keeps all rows.
SAMPLE = None
# Conditions kept for this detector; with LEVELS and SEVERITIES they define the class list.
CONDITIONS = ['Left Subarticular Stenosis', 'Right Subarticular Stenosis']
SEVERITIES = ['Normal/Mild', 'Moderate', 'Severe']
LEVELS = ['l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1']


# rm -rf val_fold0

# train.csv: study_id first, then one severity column per condition/level (see the unpivot below).
train_val_df = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train.csv')
# Point annotations (x, y) keyed by study_id, series_id, instance_number, condition and level.
train_xy = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_label_coordinates.csv')
# Per-series metadata, joined to train_xy on (study_id, series_id).
train_des = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_series_descriptions.csv')

if SAMPLE:
    # Fixed random_state keeps the subsample reproducible.
    train_val_df = train_val_df.sample(SAMPLE, random_state=2698)

# Fold assignment per study; joined on study_id and later filtered through its `fold` column.
fold_df = pd.read_csv('/kaggle/input/lsdc-fold-split/5folds.csv')

# Notebook leftover: the result is discarded when this file runs as a script.
train_xy.head(3)

def get_level(text):
    """Extract the spinal level from a train.csv column name.

    Args:
        text: Column name containing one of the level tokens, e.g.
            'left_subarticular_stenosis_l1_l2'.

    Returns:
        The level in 'L1/L2' form (both parts capitalised, joined by '/').

    Raises:
        ValueError: If no known level token occurs in ``text``. The message names
            the offending text rather than the last level that was tried.
    """
    for lev in ('l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1'):
        if lev in text:
            upper, lower = lev.split('_')
            return f'{upper.capitalize()}/{lower.capitalize()}'
    raise ValueError(f'Level not found in {text!r}')
    
def get_condition(text):
    """Turn a train.csv column name into a condition name.

    The name is split on '_', the last two pieces (the level, e.g. 'l1', 'l2') are
    dropped, and the remaining words are capitalised and joined with spaces, e.g.
    'left_subarticular_stenosis_l1_l2' -> 'Left Subarticular Stenosis'.
    """
    condition_words = text.split('_')[:-2]
    return ' '.join(word.capitalize() for word in condition_words)
#     raise ValueError('Condition not found '+ lev)

train_xy['condition'].unique()

# Unpivot train.csv: one record per (study, label column), with the column name
# decoded into a level and a condition that match the spelling used in train_xy.
label_records = [
    {
        'study_id': study_row['study_id'],
        'level': get_level(col_name),
        'condition': get_condition(col_name),
        'label': severity,
    }
    for _, study_row in train_val_df.iterrows()
    for col_name, severity in study_row.iloc[1:].items()
]
label_df = pd.DataFrame(label_records, columns=['study_id', 'condition', 'level', 'label'])

# Attach the fold of each study.
label_df = label_df.merge(fold_df, on='study_id')

# Attach series metadata to the coordinates, then the coordinates to the labels;
# the inner joins drop labels that have no annotated point.
train_xy = train_xy.merge(train_des, how='inner', on=['study_id', 'series_id'])
label_df = label_df.merge(train_xy, how='inner', on=['study_id', 'condition', 'level'])

# cnt[cnt>1]

def query_train_xy_row(study_id, series_id=None, instance_num=None):
    """Return the rows of the module-level ``label_df`` matching the given keys.

    Args:
        study_id: Study to select (always applied).
        series_id: Optional series filter; ignored when None.
        instance_num: Optional instance_number filter; ignored when None.

    Returns:
        The matching subset of ``label_df``.

    Every mask is built from ``label_df`` itself. The series-only case previously
    took its study mask from ``train_xy``, a different frame with a different index.
    """
    mask = label_df.study_id == study_id
    if series_id is not None:
        mask &= label_df.series_id == series_id
    if instance_num is not None:
        mask &= label_df.instance_number == instance_num
    return label_df[mask]

# import os

# def count_dcm_files(directory):
#     dcm_count = 0
#     for root, dirs, files in os.walk(directory):
#         for file in files:
#             if file.endswith('.dcm'):
#                 dcm_count += 1
#     return dcm_count

# dcm_files_count = count_dcm_files(IMG_DIR)

# print(f"Number of .dcm files: {dcm_files_count}")

def read_dcm(src_path):
    """Read a DICOM file and return it as a uint8 image with three identical channels.

    The pixel data is min-max scaled to the 0-255 range (the 1e-6 term in the
    denominator avoids a division by zero on constant images) and replicated along
    a new last axis before the cast to uint8.
    """
    pixels = pydicom.dcmread(src_path).pixel_array
    lo, hi = pixels.min(), pixels.max()
    scaled = (pixels - lo) / (hi - lo + 1e-6) * 255
    return np.stack([scaled, scaled, scaled], axis=-1).astype('uint8')

def get_accronym(text):
    """Return the initials of the whitespace-separated words in ``text``.

    Example: 'Left Subarticular Stenosis' -> 'LSS'. Splitting on arbitrary whitespace
    means empty input or repeated spaces yield no empty words, so indexing the first
    character cannot fail.
    """
    return ''.join(word[0] for word in text.split())

# study_id = 4003253 
# series_id = 2448190387
# instance_num = 28

# Sanity check: pick one random annotated slice and draw its labelled points.
ex = label_df.sample(1).iloc[0]
study_id, series_id, instance_num = ex.study_id, ex.series_id, ex.instance_number

WIDTH = 10  # half side length (pixels) of the square drawn around each point

path = os.path.join(IMG_DIR, str(study_id), str(series_id), f'{instance_num}.dcm')
img = read_dcm(path)

# Text rendering settings shared by all annotations.
fontFace = cv2.FONT_HERSHEY_SIMPLEX
fontScale = 0.5
thickness = 1

# Box/text colour per severity; any other label value (e.g. NaN) maps to None.
severity_colors = {
    'Normal/Mild': (0, 255, 0),
    'Moderate': (255, 255, 0),
    'Severe': (255, 0, 0),
}

tmp_df = query_train_xy_row(study_id, series_id, instance_num)
for _, row in tmp_df.iterrows():
    lbl = f"{get_accronym(row['condition'])}_{row['level']}"
    x1, y1 = int(row['x'] - WIDTH), int(row['y'] - WIDTH)
    x2, y2 = int(row['x'] + WIDTH), int(row['y'] + WIDTH)
    color = severity_colors.get(row['label'])
    cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
    cv2.putText(img, lbl, (x1, y1), fontFace, fontScale, color, thickness, cv2.LINE_AA)

tmp_df

plt.imshow(img)
plt.show()

# label_df[['study_id', 'series_id']].drop_duplicates()

# NOTE(review): second definition of read_dcm in this script. Its body matches the
# earlier definition, so rebinding the name here does not change behaviour.
def read_dcm(src_path):
    """Load one DICOM file as a uint8 image with three identical channels."""
    dicom_data = pydicom.dcmread(src_path)
    image = dicom_data.pixel_array
    # Min-max scale to 0..255; the 1e-6 avoids dividing by zero on constant images.
    image = (image - image.min()) / (image.max() - image.min() +1e-6) * 255
    # Replicate the single channel three times on a new last axis, then cast to 8-bit.
    image = np.stack([image]*3, axis=-1).astype('uint8')
    return image

# Keep only the annotations of the conditions this detector is trained on.
filtered_df = label_df[label_df.condition.isin(CONDITIONS)]

# One class per (condition, level, severity), numbered in nested-loop order.
class_names = [
    f"{cond.lower().replace(' ', '_')}_{level}_{severity.lower()}"
    for cond in CONDITIONS
    for level in LEVELS
    for severity in SEVERITIES
]
id2label = dict(enumerate(class_names))
label2id = {cls_name: cls_id for cls_id, cls_name in id2label.items()}

id2label

def gen_yolo_format(ann_df, phase='train'):
    """Export the annotated slices in ``ann_df`` as a YOLO image/label dataset.

    For every (study_id, series_id, instance_number) group one JPEG is written to
    ``OUT_DIR/images/<phase>`` and one text file with the same stem to
    ``OUT_DIR/labels/<phase>``. Each label line is
    ``class_id x_center y_center width height`` with values normalised by the
    image size.

    Args:
        ann_df: DataFrame with the columns study_id, series_id, instance_number,
            condition, level, label, x and y.
        phase: Name of the split sub-directory to write (e.g. 'train' or 'val').

    Reads the module-level names OUT_DIR, IMG_DIR, OD_INPUT_SIZE, STD_BOX_SIZE and
    label2id.
    """
    img_dir = os.path.join(OUT_DIR, 'images', phase)
    ann_dir = os.path.join(OUT_DIR, 'labels', phase)
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(ann_dir, exist_ok=True)

    for name, group in tqdm(ann_df.groupby(['study_id', 'series_id', 'instance_number'])):
        study_id, series_id, instance_num = name

        # When a slice has any Severe annotation, only its Severe rows are exported.
        if "Severe" in group["label"].tolist():
            group = group[group["label"] == "Severe"].reset_index(drop=True)

        # Slices with a missing severity are skipped before anything is written, so the
        # null guard really keeps them out of the dataset: the image used to be saved
        # ahead of the check, next to a truncated label file.
        if group["label"].isnull().any():
            continue

        path = f'{IMG_DIR}/{study_id}/{series_id}/{instance_num}.dcm'
        img = read_dcm(path)
        H, W = img.shape[:2]

        stem = f'{study_id}_{series_id}_{instance_num}'
        cv2.imwrite(os.path.join(img_dir, f'{stem}.jpg'), img)

        # Same box for every point on the slice: STD_BOX_SIZE pixels at an
        # OD_INPUT_SIZE-pixel input, expressed as a fraction of the image side.
        width = W / OD_INPUT_SIZE * STD_BOX_SIZE / W
        height = H / OD_INPUT_SIZE * STD_BOX_SIZE / H

        with open(os.path.join(ann_dir, f'{stem}.txt'), 'w') as f:
            for _, row in group.iterrows():
                cond = row['condition']
                level = row['level']
                severity = row['label']
                class_label = f"{cond.lower().replace(' ', '_')}_{level.lower().replace('/', '_')}_{severity.lower()}"
                class_id = label2id[class_label]
                x_center = row['x'] / W
                y_center = row['y'] / H
                f.write(f'{class_id} {x_center} {y_center} {width} {height}\n')
#         break

# gen_yolo_format reads the module-level OUT_DIR, so it is rebound for every fold.
for FOLD in FOLDS:
    print('Gen data fold', FOLD)
    OUT_DIR = f'data_fold{FOLD}'
    os.makedirs(OUT_DIR, exist_ok=True)

    # Studies of the current fold form the validation split, all others the training split.
    is_val = filtered_df.fold == FOLD
    train_df, val_df = filtered_df[~is_val], filtered_df[is_val]

    for phase, phase_df in (('train', train_df), ('val', val_df)):
        gen_yolo_format(phase_df, phase=phase)



In [None]:
# Run the SS generator; it writes data_fold0/ and data_fold1/ relative to the current directory.
!python ss_yolo_generate.py

In [None]:
# Archive each fold and delete its directory: the other generator scripts write to the
# same data_fold0/data_fold1 names, so leftovers would end up in their archives.
!zip -r -q ss_data_fold0.zip data_fold0
!rm -rf data_fold0
!zip -r -q ss_data_fold1.zip data_fold1
!rm -rf data_fold1

In [None]:
%%writefile nfn_yolo_generate.py
import os
import pandas as pd
import numpy as np
import pydicom
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import cv2
import glob

# Root of the DICOM files; slices are read from IMG_DIR/<study_id>/<series_id>/<instance_number>.dcm.
IMG_DIR = "/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images"

# Fold indices for which a dataset directory (data_fold<k>) is generated.
FOLDS = [0,1]
# Box size: every YOLO box gets the normalised side STD_BOX_SIZE / OD_INPUT_SIZE.
OD_INPUT_SIZE = 768
STD_BOX_SIZE = 56
# Optional number of train.csv rows to subsample; a falsy value (None, 0) keeps all rows.
SAMPLE = None
# Conditions kept for this detector; with LEVELS and SEVERITIES they define the class list.
CONDITIONS = ['Left Neural Foraminal Narrowing', 'Right Neural Foraminal Narrowing']
SEVERITIES = ['Normal/Mild', 'Moderate', 'Severe']
LEVELS = ['l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1']

# rm -rf val_fold0

# train.csv: study_id first, then one severity column per condition/level (see the unpivot below).
train_val_df = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train.csv')
# Point annotations (x, y) keyed by study_id, series_id, instance_number, condition and level.
train_xy = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_label_coordinates.csv')
# Per-series metadata, joined to train_xy on (study_id, series_id).
train_des = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_series_descriptions.csv')

if SAMPLE:
    # Fixed random_state keeps the subsample reproducible.
    train_val_df = train_val_df.sample(SAMPLE, random_state=2698)

# Fold assignment per study; joined on study_id and later filtered through its `fold` column.
fold_df = pd.read_csv('/kaggle/input/lsdc-fold-split/5folds.csv')

# Notebook leftover: the result is discarded when this file runs as a script.
train_xy.head(3)

def get_level(text):
    """Extract the spinal level from a train.csv column name.

    Args:
        text: Column name containing one of the level tokens, e.g.
            'left_neural_foraminal_narrowing_l1_l2'.

    Returns:
        The level in 'L1/L2' form (both parts capitalised, joined by '/').

    Raises:
        ValueError: If no known level token occurs in ``text``. The message names
            the offending text rather than the last level that was tried.
    """
    for lev in ('l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1'):
        if lev in text:
            upper, lower = lev.split('_')
            return f'{upper.capitalize()}/{lower.capitalize()}'
    raise ValueError(f'Level not found in {text!r}')
    
def get_condition(text):
    """Turn a train.csv column name into a condition name.

    The name is split on '_', the last two pieces (the level, e.g. 'l1', 'l2') are
    dropped, and the remaining words are capitalised and joined with spaces, e.g.
    'left_neural_foraminal_narrowing_l1_l2' -> 'Left Neural Foraminal Narrowing'.
    """
    condition_words = text.split('_')[:-2]
    return ' '.join(word.capitalize() for word in condition_words)
#     raise ValueError('Condition not found '+ lev)

train_xy['condition'].unique()

# Unpivot train.csv: one record per (study, label column), with the column name
# decoded into a level and a condition that match the spelling used in train_xy.
label_records = [
    {
        'study_id': study_row['study_id'],
        'level': get_level(col_name),
        'condition': get_condition(col_name),
        'label': severity,
    }
    for _, study_row in train_val_df.iterrows()
    for col_name, severity in study_row.iloc[1:].items()
]
label_df = pd.DataFrame(label_records, columns=['study_id', 'condition', 'level', 'label'])

# Attach the fold of each study.
label_df = label_df.merge(fold_df, on='study_id')

# Attach series metadata to the coordinates, then the coordinates to the labels;
# the inner joins drop labels that have no annotated point.
train_xy = train_xy.merge(train_des, how='inner', on=['study_id', 'series_id'])
label_df = label_df.merge(train_xy, how='inner', on=['study_id', 'condition', 'level'])

# label_df[label_df.series_id.isna()]

# cnt = train_xy.groupby(['study_id', 'series_id', 'instance_number'])['condition'].nunique()

# cnt[cnt>1]

def query_train_xy_row(study_id, series_id=None, instance_num=None):
    """Return the rows of the module-level ``label_df`` matching the given keys.

    Args:
        study_id: Study to select (always applied).
        series_id: Optional series filter; ignored when None.
        instance_num: Optional instance_number filter; ignored when None.

    Returns:
        The matching subset of ``label_df``.

    Every mask is built from ``label_df`` itself. The series-only case previously
    took its study mask from ``train_xy``, a different frame with a different index.
    """
    mask = label_df.study_id == study_id
    if series_id is not None:
        mask &= label_df.series_id == series_id
    if instance_num is not None:
        mask &= label_df.instance_number == instance_num
    return label_df[mask]

# import os

# def count_dcm_files(directory):
#     dcm_count = 0
#     for root, dirs, files in os.walk(directory):
#         for file in files:
#             if file.endswith('.dcm'):
#                 dcm_count += 1
#     return dcm_count

# dcm_files_count = count_dcm_files(IMG_DIR)

# print(f"Number of .dcm files: {dcm_files_count}")

def read_dcm(src_path):
    """Read a DICOM file and return it as a uint8 image with three identical channels.

    The pixel data is min-max scaled to the 0-255 range (the 1e-6 term in the
    denominator avoids a division by zero on constant images) and replicated along
    a new last axis before the cast to uint8.
    """
    pixels = pydicom.dcmread(src_path).pixel_array
    lo, hi = pixels.min(), pixels.max()
    scaled = (pixels - lo) / (hi - lo + 1e-6) * 255
    return np.stack([scaled, scaled, scaled], axis=-1).astype('uint8')

def get_accronym(text):
    """Return the initials of the whitespace-separated words in ``text``.

    Example: 'Left Neural Foraminal Narrowing' -> 'LNFN'. Splitting on arbitrary
    whitespace means empty input or repeated spaces yield no empty words, so indexing
    the first character cannot fail.
    """
    return ''.join(word[0] for word in text.split())

# study_id = 4003253 
# series_id = 2448190387
# instance_num = 28

# Sanity check: pick one random annotated slice and draw its labelled points.
ex = label_df.sample(1).iloc[0]
study_id, series_id, instance_num = ex.study_id, ex.series_id, ex.instance_number

WIDTH = 10  # half side length (pixels) of the square drawn around each point

path = os.path.join(IMG_DIR, str(study_id), str(series_id), f'{instance_num}.dcm')
img = read_dcm(path)

# Text rendering settings shared by all annotations.
fontFace = cv2.FONT_HERSHEY_SIMPLEX
fontScale = 0.5
thickness = 1

# Box/text colour per severity; any other label value (e.g. NaN) maps to None.
severity_colors = {
    'Normal/Mild': (0, 255, 0),
    'Moderate': (255, 255, 0),
    'Severe': (255, 0, 0),
}

tmp_df = query_train_xy_row(study_id, series_id, instance_num)
for _, row in tmp_df.iterrows():
    lbl = f"{get_accronym(row['condition'])}_{row['level']}"
    x1, y1 = int(row['x'] - WIDTH), int(row['y'] - WIDTH)
    x2, y2 = int(row['x'] + WIDTH), int(row['y'] + WIDTH)
    color = severity_colors.get(row['label'])
    cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
    cv2.putText(img, lbl, (x1, y1), fontFace, fontScale, color, thickness, cv2.LINE_AA)

tmp_df

plt.imshow(img)
plt.show()

# label_df[['study_id', 'series_id']].drop_duplicates()

# NOTE(review): second definition of read_dcm in this script. Its body matches the
# earlier definition, so rebinding the name here does not change behaviour.
def read_dcm(src_path):
    """Load one DICOM file as a uint8 image with three identical channels."""
    dicom_data = pydicom.dcmread(src_path)
    image = dicom_data.pixel_array
    # Min-max scale to 0..255; the 1e-6 avoids dividing by zero on constant images.
    image = (image - image.min()) / (image.max() - image.min() +1e-6) * 255
    # Replicate the single channel three times on a new last axis, then cast to 8-bit.
    image = np.stack([image]*3, axis=-1).astype('uint8')
    return image

# Keep only the annotations of the conditions this detector is trained on.
filtered_df = label_df[label_df.condition.isin(CONDITIONS)]

# One class per (condition, level, severity), numbered in nested-loop order.
class_names = [
    f"{cond.lower().replace(' ', '_')}_{level}_{severity.lower()}"
    for cond in CONDITIONS
    for level in LEVELS
    for severity in SEVERITIES
]
id2label = dict(enumerate(class_names))
label2id = {cls_name: cls_id for cls_id, cls_name in id2label.items()}

id2label

def gen_yolo_format(ann_df, phase='train'):
    """Export the annotated slices in ``ann_df`` as a YOLO image/label dataset.

    For every (study_id, series_id, instance_number) group one JPEG is written to
    ``OUT_DIR/images/<phase>`` and one text file with the same stem to
    ``OUT_DIR/labels/<phase>``. Each label line is
    ``class_id x_center y_center width height`` with values normalised by the
    image size.

    Args:
        ann_df: DataFrame with the columns study_id, series_id, instance_number,
            condition, level, label, x and y.
        phase: Name of the split sub-directory to write (e.g. 'train' or 'val').

    Reads the module-level names OUT_DIR, IMG_DIR, OD_INPUT_SIZE, STD_BOX_SIZE and
    label2id.
    """
    img_dir = os.path.join(OUT_DIR, 'images', phase)
    ann_dir = os.path.join(OUT_DIR, 'labels', phase)
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(ann_dir, exist_ok=True)

    for name, group in tqdm(ann_df.groupby(['study_id', 'series_id', 'instance_number'])):
        study_id, series_id, instance_num = name

        # When a slice has any Severe annotation, only its Severe rows are exported.
        if "Severe" in group["label"].tolist():
            group = group[group["label"] == "Severe"].reset_index(drop=True)

        # Slices with a missing severity are skipped before anything is written, so the
        # null guard really keeps them out of the dataset: the image used to be saved
        # ahead of the check, next to a truncated label file.
        if group["label"].isnull().any():
            continue

        path = f'{IMG_DIR}/{study_id}/{series_id}/{instance_num}.dcm'
        img = read_dcm(path)
        H, W = img.shape[:2]

        stem = f'{study_id}_{series_id}_{instance_num}'
        cv2.imwrite(os.path.join(img_dir, f'{stem}.jpg'), img)

        # Same box for every point on the slice: STD_BOX_SIZE pixels at an
        # OD_INPUT_SIZE-pixel input, expressed as a fraction of the image side.
        width = W / OD_INPUT_SIZE * STD_BOX_SIZE / W
        height = H / OD_INPUT_SIZE * STD_BOX_SIZE / H

        with open(os.path.join(ann_dir, f'{stem}.txt'), 'w') as f:
            for _, row in group.iterrows():
                cond = row['condition']
                level = row['level']
                severity = row['label']
                class_label = f"{cond.lower().replace(' ', '_')}_{level.lower().replace('/', '_')}_{severity.lower()}"
                class_id = label2id[class_label]
                x_center = row['x'] / W
                y_center = row['y'] / H
                f.write(f'{class_id} {x_center} {y_center} {width} {height}\n')
#         break

# gen_yolo_format reads the module-level OUT_DIR, so it is rebound for every fold.
for FOLD in FOLDS:
    print('Gen data fold', FOLD)
    OUT_DIR = f'data_fold{FOLD}'
    os.makedirs(OUT_DIR, exist_ok=True)

    # Studies of the current fold form the validation split, all others the training split.
    is_val = filtered_df.fold == FOLD
    train_df, val_df = filtered_df[~is_val], filtered_df[is_val]

    for phase, phase_df in (('train', train_df), ('val', val_df)):
        gen_yolo_format(phase_df, phase=phase)



In [None]:
# Run the NFN generator; it writes data_fold0/ and data_fold1/ relative to the current directory.
!python nfn_yolo_generate.py

In [None]:
# Archive each fold's output, then delete the directory once it is zipped.
!zip -r -q nfn_data_fold0.zip data_fold0
!rm -rf data_fold0
!zip -r -q nfn_data_fold1.zip data_fold1
!rm -rf data_fold1

In [None]:
# Delete every .py file in /kaggle/working (presumably where the %%writefile cells
# placed the generator scripts; confirm the notebook's working directory).
!rm -r /kaggle/working/*.py