In [70]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce

In [71]:
import warnings
warnings.filterwarnings('ignore')

In [72]:
# Normalise Windows path separators so the list looks the same on every OS.
replace_text = lambda x: x.replace('\\','/')
# glob() yields matches in arbitrary, OS-dependent order; sorting makes the
# file list (and everything derived from it) stable between runs.
xmlfiles = sorted(map(replace_text, glob('./data_images/*.xml')))

In [73]:
xmlfiles


['./data_images/12.xml',
 './data_images/16.xml',
 './data_images/17.xml',
 './data_images/18.xml',
 './data_images/19.xml',
 './data_images/20.xml',
 './data_images/21.xml',
 './data_images/22.xml',
 './data_images/23.xml',
 './data_images/24.xml',
 './data_images/25.xml',
 './data_images/a.xml',
 './data_images/b.xml',
 './data_images/c.xml',
 './data_images/d.xml',
 './data_images/e.xml',
 './data_images/ff.xml',
 './data_images/g.xml',
 './data_images/h.xml',
 './data_images/i.xml',
 './data_images/j.xml',
 './data_images/k.xml',
 './data_images/l.xml']

In [104]:
def extract_text(filename):
    """Parse one XML annotation file into a list of bounding-box rows.

    Parameters
    ----------
    filename : path or file object accepted by ``ElementTree.parse``.

    Returns
    -------
    list of lists, one per ``<object>`` element, each laid out as
    ``[image_name, width, height, name, xmin, xmax, ymin, ymax]``.
    Every value is the raw element text (a string); the list is empty
    when the file contains no ``<object>`` elements.
    """
    annotation = et.parse(filename).getroot()

    # Image-level fields are read once and repeated on every row.
    image_name = annotation.find('filename').text
    size = annotation.find('size')
    width = size.find('width').text
    height = size.find('height').text

    rows = []
    for obj in annotation.findall('object'):
        label = obj.find('name').text
        box = obj.find('bndbox')
        # Same column order as the original: xmin, xmax, ymin, ymax.
        coords = [box.find(tag).text for tag in ('xmin', 'xmax', 'ymin', 'ymax')]
        rows.append([image_name, width, height, label, *coords])
    return rows

In [108]:
# One list of rows per annotation file, in the same order as xmlfiles
parser_all = [extract_text(xml_path) for xml_path in xmlfiles]

In [110]:
# Flatten the per-file row lists into a single list of rows.
# The [] initial value makes an empty parser_all yield [] instead of
# raising "reduce() of empty iterable with no initial value".
data = reduce(lambda x, y: x + y, parser_all, [])

In [112]:
# One row per annotated object; the column order mirrors the rows built by extract_text
df = pd.DataFrame(
    data=data,
    columns=[
        'filename', 'width', 'height', 'name',
        'xmin', 'xmax', 'ymin', 'ymax',
    ],
)

In [113]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,12.jpg,1134,488,Doctor,493,598,113,277
1,12.jpg,1134,488,patient,162,238,266,351
2,12.jpg,1134,488,patient,608,679,194,271
3,12.jpg,1134,488,monitoring system,372,476,134,291
4,16.jpg,1500,1000,Doctor,811,1030,281,601


In [114]:
df.shape

(57, 8)

In [115]:
df['name'].value_counts()

name
patient              24
Doctor               15
monitoring system     8
Nurse                 5
family member         5
Name: count, dtype: int64

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  57 non-null     object
 1   width     57 non-null     object
 2   height    57 non-null     object
 3   name      57 non-null     object
 4   xmin      57 non-null     object
 5   xmax      57 non-null     object
 6   ymin      57 non-null     object
 7   ymax      57 non-null     object
dtypes: object(8)
memory usage: 3.7+ KB


In [117]:
# The XML values were read as text; cast the numeric columns to integers
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].apply(lambda column: column.astype(int))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  57 non-null     object
 1   width     57 non-null     int64 
 2   height    57 non-null     int64 
 3   name      57 non-null     object
 4   xmin      57 non-null     int64 
 5   xmax      57 non-null     int64 
 6   ymin      57 non-null     int64 
 7   ymax      57 non-null     int64 
dtypes: int64(6), object(2)
memory usage: 3.7+ KB


In [118]:
# Box centre, expressed as a fraction of the image width / height
df['center_x'] = (df['xmin'] + df['xmax']).div(2).div(df['width'])
df['center_y'] = (df['ymin'] + df['ymax']).div(2).div(df['height'])
# Box width as a fraction of the image width
df['w'] = df['xmax'].sub(df['xmin']).div(df['width'])
# Box height as a fraction of the image height
df['h'] = df['ymax'].sub(df['ymin']).div(df['height'])

In [119]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,12.jpg,1134,488,Doctor,493,598,113,277,0.481041,0.39959,0.092593,0.336066
1,12.jpg,1134,488,patient,162,238,266,351,0.176367,0.632172,0.067019,0.17418
2,12.jpg,1134,488,patient,608,679,194,271,0.56746,0.476434,0.06261,0.157787
3,12.jpg,1134,488,monitoring system,372,476,134,291,0.373898,0.435451,0.091711,0.321721
4,16.jpg,1500,1000,Doctor,811,1030,281,601,0.613667,0.441,0.146,0.32


In [120]:
# Distinct image filenames, in order of first appearance
images = pd.unique(df['filename'])

In [121]:
len(images)

23

In [122]:
img_df = pd.DataFrame(images,columns=['filename'])
# Shuffle and pick 80% of the images for training. The fixed random_state
# makes the split reproducible: without it every run selects different images.
img_train = tuple(img_df.sample(frac=0.8, random_state=42)['filename'])

In [124]:
# Take the remaining images (those not chosen for training). isin() compares
# the values directly instead of pasting the tuple's repr into a query string
# that pandas then has to parse and evaluate.
img_test = tuple(img_df.loc[~img_df['filename'].isin(img_train), 'filename'])

In [125]:
len(img_train), len(img_test)

(18, 5)

In [126]:
# Split the label rows by image membership. isin() avoids building a query
# string from the tuple repr, and .copy() gives each split its own frame so
# columns can be added later without chained-assignment warnings.
train_df = df[df['filename'].isin(img_train)].copy()
test_df = df[df['filename'].isin(img_test)].copy()

In [127]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,12.jpg,1134,488,Doctor,493,598,113,277,0.481041,0.39959,0.092593,0.336066
1,12.jpg,1134,488,patient,162,238,266,351,0.176367,0.632172,0.067019,0.17418
2,12.jpg,1134,488,patient,608,679,194,271,0.56746,0.476434,0.06261,0.157787
3,12.jpg,1134,488,monitoring system,372,476,134,291,0.373898,0.435451,0.091711,0.321721
4,16.jpg,1500,1000,Doctor,811,1030,281,601,0.613667,0.441,0.146,0.32


In [128]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
7,17.jpg,465,279,patient,126,205,155,248,0.355914,0.722222,0.169892,0.333333
9,19.jpg,1200,800,Nurse,658,842,162,463,0.625,0.390625,0.153333,0.37625
10,19.jpg,1200,800,patient,350,625,411,694,0.40625,0.690625,0.229167,0.35375
23,25.jpg,468,180,patient,173,236,30,123,0.436966,0.425,0.134615,0.516667
24,25.jpg,468,180,monitoring system,347,410,27,113,0.808761,0.388889,0.134615,0.477778


In [133]:
# Define label encoding function
# Class name -> integer id. Built once instead of on every call.
_LABEL_IDS = {'Doctor': 0, 'patient': 1, 'monitoring system': 2, 'Nurse': 3, 'family member': 4}


def label_encoding(x):
    """Return the integer class id for the class name ``x``.

    The lookup is exact (case-sensitive).

    Raises
    ------
    KeyError
        If ``x`` is not one of the known class names; the message lists
        the accepted names.
    """
    try:
        return _LABEL_IDS[x]
    except KeyError:
        raise KeyError(
            f"unknown class name {x!r}; expected one of {sorted(_LABEL_IDS)}"
        ) from None


In [134]:
# Apply label encoding to 'name' column
# assign() returns a new frame, so the id column is never written into a
# filtered slice of df (the source of the chained-assignment warning).
train_df = train_df.assign(id=train_df['name'].apply(label_encoding))
test_df = test_df.assign(id=test_df['name'].apply(label_encoding))

In [135]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,12.jpg,1134,488,Doctor,493,598,113,277,0.481041,0.39959,0.092593,0.336066,0
1,12.jpg,1134,488,patient,162,238,266,351,0.176367,0.632172,0.067019,0.17418,1
2,12.jpg,1134,488,patient,608,679,194,271,0.56746,0.476434,0.06261,0.157787,1
3,12.jpg,1134,488,monitoring system,372,476,134,291,0.373898,0.435451,0.091711,0.321721,2
4,16.jpg,1500,1000,Doctor,811,1030,281,601,0.613667,0.441,0.146,0.32,0
5,16.jpg,1500,1000,patient,619,911,608,855,0.51,0.7315,0.194667,0.247,1
6,16.jpg,1500,1000,Doctor,682,782,364,634,0.488,0.499,0.066667,0.27,0
8,18.jpg,780,585,patient,30,222,75,255,0.161538,0.282051,0.246154,0.307692,1
11,20.jpg,1300,1065,patient,415,557,315,533,0.373846,0.398122,0.109231,0.204695,1
12,20.jpg,1300,1065,monitoring system,803,1062,277,593,0.717308,0.408451,0.199231,0.296714,2


In [152]:
import os
from shutil import move

In [164]:
import os

train_folder = 'data_images/train'
test_folder = 'data_images/test'

# makedirs(exist_ok=True) keeps this cell re-runnable: os.mkdir raises
# FileExistsError on a second run and FileNotFoundError when the parent
# data_images/ folder is absent.
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)


In [165]:
# Keep only the columns written to the label files, grouped per image
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df.loc[:, cols].groupby(by='filename')
groupby_obj_test = test_df.loc[:, cols].groupby(by='filename')

In [184]:
#groupby_obj_train.get_group('12.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)

def save_data(filename, folder_path, group_obj, src_folder='data_images'):
    """Move one image into ``folder_path`` and write its label file beside it.

    Parameters
    ----------
    filename : name of the image file inside ``src_folder``.
    folder_path : destination folder for the image and its ``.txt`` labels.
    group_obj : pandas GroupBy keyed by filename; the group for ``filename``
        supplies the label rows.
    src_folder : folder the image is moved from (defaults to the previously
        hard-coded ``'data_images'``).

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``group_obj`` has no group for ``filename``.
    FileNotFoundError
        If the image does not exist in ``src_folder``.
    """
    src = os.path.join(src_folder, filename)
    dst = os.path.join(folder_path, filename)

    # Validate both inputs before touching the disk, so a failure never
    # leaves an image moved without its label file.
    labels = group_obj.get_group(filename)
    if not os.path.isfile(src):
        raise FileNotFoundError(f"image not found: {src}")

    move(src, dst)  # move image to the destination folder

    # Label file: same stem as the image, one space-separated row per object,
    # without the filename column, header or index.
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0] + '.txt')
    labels.drop(columns=['filename'], errors='ignore').to_csv(
        text_filename, sep=' ', index=False, header=False)
    

In [185]:
# Every training image that has a label group
filename_series = pd.Series(list(groupby_obj_train.groups))

In [187]:
# Run save_data once per training image
filename_series.apply(lambda name: save_data(name, train_folder, groupby_obj_train))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
dtype: object

In [188]:
# Every test image that has a label group, then run save_data once per image
filename_series_test = pd.Series(list(groupby_obj_test.groups))
filename_series_test.apply(lambda name: save_data(name, test_folder, groupby_obj_test))

0    None
1    None
2    None
3    None
4    None
dtype: object

In [51]:
import os
import pandas as pd
from shutil import move

# Sample DataFrame definitions for train and test data
train_df = pd.DataFrame(
    [
        ['file1.jpg', 1, 100, 200, 50, 80],
        ['file2.jpg', 2, 150, 250, 60, 90],
    ],
    columns=['filename', 'id', 'center_x', 'center_y', 'w', 'h'],
)

test_df = pd.DataFrame(
    [
        ['file3.jpg', 3, 120, 220, 55, 85],
        ['file4.jpg', 4, 160, 260, 65, 95],
    ],
    columns=['filename', 'id', 'center_x', 'center_y', 'w', 'h'],
)

# Output folders for the two splits
train_folder = 'data_images/train'
test_folder = 'data_images/test'

# Create each folder only when it is not already on disk
if not os.path.exists(train_folder):
    os.makedirs(train_folder)
if not os.path.exists(test_folder):
    os.makedirs(test_folder)

# One group of label rows per image file
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df.loc[:, cols].groupby(by='filename')
groupby_obj_test = test_df.loc[:, cols].groupby(by='filename')

# Filenames that save_data is applied to below
filename_series = pd.Series(['file1.jpg', 'file2.jpg'])

# Function to save data
def save_data(filename, folder_path, group_obj):
    """Move an image out of ``data_images`` and write its label file.

    Prints the source and destination paths. When the source image does not
    exist, a message is printed and ``None`` is returned without touching
    anything. Otherwise the image is moved into ``folder_path``; if
    ``group_obj`` has a group for ``filename``, one
    ``id center_x center_y w h`` line per row is written to a ``.txt`` file
    with the image's stem, else a "No labels found" message is printed.
    In both of those cases the string "Data saved successfully" is returned.
    """
    src = os.path.join('data_images', filename)
    dst = os.path.join(folder_path, filename)
    print(f"Source: {src}")
    print(f"Destination: {dst}")

    if os.path.exists(src):
        move(src, dst)  # move image to the destination folder

        stem, _ext = os.path.splitext(filename)
        text_filename = os.path.join(folder_path, stem + '.txt')
        if filename not in group_obj.groups:
            print(f"No labels found for {filename}")
        else:
            # The group is fetched here, before the label file is opened.
            label_lines = (
                f"{row['id']} {row['center_x']} {row['center_y']} {row['w']} {row['h']}\n"
                for _, row in group_obj.get_group(filename).iterrows()
            )
            with open(text_filename, 'w') as label_file:
                label_file.writelines(label_lines)
        return "Data saved successfully"

    print(f"Source file {src} not found.")
    return None

# Apply the function
result = filename_series.apply(
    lambda name: save_data(name, train_folder, groupby_obj_train)
)
print(result)


Source: data_images\file1.jpg
Destination: data_images/train\file1.jpg
Source file data_images\file1.jpg not found.
Source: data_images\file2.jpg
Destination: data_images/train\file2.jpg
Source file data_images\file2.jpg not found.
0    None
1    None
dtype: object


In [53]:
# Run save_data once per filename; the cell output is the Series of return values
filename_series.apply(lambda name: save_data(name, train_folder, groupby_obj_train))

Source: data_images\file1.jpg
Destination: data_images/train\file1.jpg
Source file data_images\file1.jpg not found.
Source: data_images\file2.jpg
Destination: data_images/train\file2.jpg
Source file data_images\file2.jpg not found.


0    None
1    None
dtype: object

In [40]:
import pandas as pd

# Example DataFrame definitions, replace with actual data loading code
train_df = pd.DataFrame(
    [
        ['file1.jpg', 1, 100, 200, 50, 80],
        ['file2.jpg', 2, 150, 250, 60, 90],
    ],
    columns=['filename', 'id', 'center_x', 'center_y', 'w', 'h'],
)

test_df = pd.DataFrame(
    [
        ['file3.jpg', 3, 120, 220, 55, 85],
        ['file4.jpg', 4, 160, 260, 65, 95],
    ],
    columns=['filename', 'id', 'center_x', 'center_y', 'w', 'h'],
)

# One group of label rows per image file
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df.loc[:, cols].groupby(by='filename')
groupby_obj_test = test_df.loc[:, cols].groupby(by='filename')

# Proceed with your code using groupby_obj_train and groupby_obj_test


In [38]:
def save_data(filename, folder_path, group_obj, src_folder='data_images'):
    """Move one image into ``folder_path`` and write its label file beside it.

    Parameters
    ----------
    filename : name of the image file inside ``src_folder``.
    folder_path : destination folder for the image and its ``.txt`` labels.
    group_obj : pandas GroupBy keyed by filename; the group for ``filename``
        supplies the label rows.
    src_folder : folder the image is moved from (defaults to the previously
        hard-coded ``'data_images'``).

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``group_obj`` has no group for ``filename``.
    FileNotFoundError
        If the image does not exist in ``src_folder``.
    """
    src = os.path.join(src_folder, filename)
    dst = os.path.join(folder_path, filename)

    # Validate both inputs before touching the disk, so a failure never
    # leaves an image moved without its label file.
    labels = group_obj.get_group(filename)
    if not os.path.isfile(src):
        raise FileNotFoundError(f"image not found: {src}")

    move(src, dst)  # move image to the destination folder

    # Label file: same stem as the image, one space-separated row per object,
    # without the filename column, header or index.
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0] + '.txt')
    labels.drop(columns=['filename'], errors='ignore').to_csv(
        text_filename, sep=' ', index=False, header=False)
    

In [43]:
import pandas as pd
import os

# Define the paths to the CSV files
train_file_path = 'path_to_train_file.csv'
test_file_path = 'path_to_test_file.csv'

# Echo the paths so a wrong location is obvious in the cell output
print(f"Train file path: {train_file_path}")
print(f"Test file path: {test_file_path}")

# Load each CSV only when it is actually on disk
if os.path.exists(train_file_path):
    train_df = pd.read_csv(train_file_path)
    print("Train file loaded successfully")
else:
    print(f"Train file not found at {train_file_path}")

if os.path.exists(test_file_path):
    test_df = pd.read_csv(test_file_path)
    print("Test file loaded successfully")
else:
    print(f"Test file not found at {test_file_path}")

# Group only when both names are defined.
# NOTE(review): this checks that the names exist, not that the loads above
# succeeded -- frames left over from earlier cells also satisfy it.
if {'train_df', 'test_df'}.issubset(locals()):
    cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
    groupby_obj_train = train_df.loc[:, cols].groupby(by='filename')
    groupby_obj_test = test_df.loc[:, cols].groupby(by='filename')

    # Rest of your code using groupby_obj_train and groupby_obj_test



Train file path: path_to_train_file.csv
Test file path: path_to_test_file.csv
Train file not found at path_to_train_file.csv
Test file not found at path_to_test_file.csv


In [44]:
# Every filename that has a group in groupby_obj_train
filename_series = pd.Series(list(groupby_obj_train.groups))

In [45]:
import os

def save_data(filename, train_folder, groupby_obj_train):
    """Report whether ``filename`` exists inside ``train_folder``.

    Prints the path being checked. Returns the string
    "Data saved successfully" when the path exists; otherwise prints a
    "not found" message and returns ``None``. ``groupby_obj_train`` is
    accepted but not used -- the actual save logic is still a placeholder.
    """
    filepath = os.path.join(train_folder, filename)
    print(f"Checking path: {filepath}")

    if os.path.exists(filepath):
        # Placeholder: the real load / process / save steps belong here.
        return "Data saved successfully"  # or any relevant return value

    print(f"Source file {filepath} not found.")
    return None

# Assuming filename_series is a pandas Series containing filenames
filename_series = pd.Series(['file1.jpg', 'file2.jpg'])

# Folder that save_data checks for each filename
train_folder = 'data_images/train'
# Placeholder: no groupby object is supplied in this cell
groupby_obj_train = None

result = filename_series.apply(
    lambda name: save_data(name, train_folder, groupby_obj_train)
)
print(result)


Checking path: data_images/train\file1.jpg
Source file data_images/train\file1.jpg not found.
Checking path: data_images/train\file2.jpg
Source file data_images/train\file2.jpg not found.
0    None
1    None
dtype: object


In [54]:
# Every filename that has a group in groupby_obj_test, then run save_data once per filename
filename_series_test = pd.Series(list(groupby_obj_test.groups))
filename_series_test.apply(lambda name: save_data(name, test_folder, groupby_obj_test))

Source: data_images\file3.jpg
Destination: data_images/test\file3.jpg
Source file data_images\file3.jpg not found.
Source: data_images\file4.jpg
Destination: data_images/test\file4.jpg
Source file data_images\file4.jpg not found.


0    None
1    None
dtype: object