In [10]:
import xml.etree.ElementTree as ET
import pandas as pd
import os  # Importing os to handle file paths

# Paths to the XML and CSV files
xml_path = r"C:\Users\rajve\Downloads\archive\annotations.xml"
csv_path = r"C:\Users\rajve\Downloads\archive\basketball_tracking.csv"

# Load and parse the XML file
tree = ET.parse(xml_path)
root = tree.getroot()

# Load the CSV file to map image paths
image_df = pd.read_csv(csv_path)

# We're going to store the extracted information in a list
data = []

# Now, we loop through the XML to grab the data we care about
for track in root.findall('track'):
    label = track.get('label')  # Get the label for the current track
    class_index = 0  # For now, we're setting the class index manually (you can adjust this later)

    # Loop through each 'box' tag inside 'track' to get the coordinates and frame info
    for box in track.findall('box'):
        frame = box.get('frame')
        xmin = int(float(box.get('xtl')))  # Top-left x-coordinate
        ymin = int(float(box.get('ytl')))  # Top-left y-coordinate
        xmax = int(float(box.get('xbr')))  # Bottom-right x-coordinate
        ymax = int(float(box.get('ybr')))  # Bottom-right y-coordinate

        # Try to find the image name from the CSV using the frame number
        image_name_row = image_df.loc[image_df['image_id'] == int(frame), 'image_name']
        
        if not image_name_row.empty:
            # If we find a matching image, we grab the name
            image_name = image_name_row.values[0]
        else:
            # If there's no match, we set the image name to None (or 'unknown', up to you)
            image_name = None  

        # Add everything we've extracted as a new row in our data list
        data.append([image_name, class_index, xmin, ymin, xmax, ymax])

# Turn the list into a DataFrame so we can work with it more easily
df = pd.DataFrame(data, columns=['path', 'class_index', 'xmin', 'ymin', 'xmax', 'ymax'])

# Remove any rows where 'path' is missing since we can't do much with those
df = df.dropna(subset=['path'])

# Extract the folder number from the 'path', but only if it's a valid string
df['folder'] = df['path'].apply(lambda x: int(x.split('/')[1]) if isinstance(x, str) and len(x.split('/')) > 1 else None)

# Get rid of rows where we couldn't extract a folder number
df = df.dropna(subset=['folder'])

# Convert the 'folder' column to integers now that we know it's safe
df['folder'] = df['folder'].astype(int)

# Split the data into training and testing sets based on folder numbers
# Here, folders 1-3 are for training, and 4-5 are for testing
train_df = df[df['folder'].isin([1, 2, 3])]  # Use .copy() to avoid the warning
test_df = df[df['folder'].isin([4, 5])]

# Define the base path
base_path = r"C:\Users\rajve\Downloads\archive"

# Modify the 'folder' column to include the base path using .loc to avoid SettingWithCopyWarning
train_df.loc[:, 'path'] = train_df['path'].apply(lambda s: os.path.join(base_path, str(s)))

# Let's take a quick look at the split data
print("Training data:")
print(train_df.head())

print("\nTesting data:")
print(test_df.head())


Training data:
                                               path  class_index  xmin  ymin  \
0   C:\Users\rajve\Downloads\archive\images/1/0.png            0   986   415   
1   C:\Users\rajve\Downloads\archive\images/1/1.png            0   966   408   
2  C:\Users\rajve\Downloads\archive\images/1/10.png            0   947   403   
3  C:\Users\rajve\Downloads\archive\images/1/11.png            0   927   400   
4  C:\Users\rajve\Downloads\archive\images/1/12.png            0   908   397   

   xmax  ymax  folder  
0  1007   436       1  
1   987   429       1  
2   968   424       1  
3   948   421       1  
4   929   418       1  

Testing data:
               path  class_index  xmin  ymin  xmax  ymax  folder
39   images/4/0.png            0   793   486   814   506       4
40   images/4/1.png            0   790   465   809   485       4
41  images/4/10.png            0   782   446   802   468       4
42  images/4/11.png            0   778   431   797   452       4
43  images/4/12.png 