In [22]:
import os
import pandas as pd
import numpy as np
import scipy.io
from pathlib import Path

# Define dataset directory
dataset_dir = "../datasets"

# Function to automatically detect delimiter in CSV files
def detect_delimiter(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        first_line = file.readline()
        if ',' in first_line:
            return ','
        elif ';' in first_line:
            return ';'
        elif '\t' in first_line:
            return '\t'
        else:
            return None  # Default to pandas auto-detection

# Load all CSV files in the datasets directory
csv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.csv')]
dataframes = {}

for csv_file in csv_files:
    file_path = os.path.join(dataset_dir, csv_file)
    delimiter = detect_delimiter(file_path)
    
    try:
        # Try with detected delimiter
        df = pd.read_csv(file_path, delimiter=delimiter)
        dataframes[csv_file] = df
        print(f"Successfully loaded {csv_file} with delimiter '{delimiter}'")
    except:
        # If it fails, let pandas try to figure it out
        df = pd.read_csv(file_path, sep=None, engine='python')
        dataframes[csv_file] = df
        print(f"Successfully loaded {csv_file} with auto-detected delimiter")

# Display information about each dataframe
for name, df in dataframes.items():
    print(f"\n=== {name} ===")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print("Sample data:")
    print(df.head(3))

Successfully loaded all-vehicles-model.csv with delimiter ';'


  df = pd.read_csv(file_path, delimiter=delimiter)


Successfully loaded fullspecs.csv with delimiter ','
Successfully loaded used-car-with-prices.csv with delimiter ','

=== all-vehicles-model.csv ===
Shape: (47523, 84)
Columns: ['Make', 'Model', 'Annual Petroleum Consumption For Fuel Type1', 'Annual Petroleum Consumption For Fuel Type2', 'Time to charge at 120V', 'Time to charge at 240V', 'City Mpg For Fuel Type1', 'Unrounded City Mpg For Fuel Type1 (2)', 'City Mpg For Fuel Type2', 'Unrounded City Mpg For Fuel Type2', 'City gasoline consumption', 'City electricity consumption', 'EPA city utility factor', 'Co2 Fuel Type1', 'Co2 Fuel Type2', 'Co2  Tailpipe For Fuel Type2', 'Co2  Tailpipe For Fuel Type1', 'Combined Mpg For Fuel Type1', 'Unrounded Combined Mpg For Fuel Type1', 'Combined Mpg For Fuel Type2', 'Unrounded Combined Mpg For Fuel Type2', 'Combined electricity consumption', 'Combined gasoline consumption', 'EPA combined utility factor', 'Cylinders', 'Engine displacement', 'Drive', 'EPA model type index', 'Engine descriptor', 'EPA 

Creating annotations

In [23]:
def create_car_annotations(dataframes):
    annotations = {}
    all_car_descriptions = []
    
    # Based on the used-car-with-prices.csv file which has Brand and Model columns
    if 'used-car-with-prices.csv' in dataframes:
        df = dataframes['used-car-with-prices.csv']
        
        # Get unique combinations of Brand, Model and Year
        unique_cars = df[['Brand', 'Model', 'Year']].drop_duplicates()
        
        # Create an array of car descriptions in the format "Make Model Year"
        car_descriptions = []
        for _, row in unique_cars.iterrows():
            # Extract the actual model name (removing the brand name if it appears at the beginning)
            model = row['Model']
            brand = row['Brand']
            if model.startswith(brand):
                model = model[len(brand):].strip()
                
            # Format: "Brand Model Year"
            description = f"{brand} {model} {row['Year']}"
            all_car_descriptions.append(description)
            
    
    if 'all-vehicles-model.csv' in dataframes:
        df = dataframes['all-vehicles-model.csv']
        
        unique_cars = df[['Make', 'Model', 'Year']].drop_duplicates()
        
        car_descriptions = []
        for _, row in unique_cars.iterrows():
            model = row['Model']
            brand = row['Make']
            if model.startswith(brand):
                model = model[len(brand):].strip()
                
            # Format: "Brand Model Year"
            description = f"{brand} {model} {row['Year']}"
            all_car_descriptions.append(description)

    
    # Process fullspecs.csv which has a different structure
    if 'fullspecs.csv' in dataframes:
        df = dataframes['fullspecs.csv']
        
        # extracting the strings correctly through regex
        import re
        
        for col in df.columns:
            # Match pattern like "2018 Acura RDX Specs: FWD w/Technology/AcuraWatch Plus Pkg"
            match = re.match(r'(\d{4})\s+([A-Za-z]+)\s+([A-Za-z0-9]+)', col)
            if match:
                year = match.group(1)
                brand = match.group(2)
                model = match.group(3)
                
                # Format: "Brand Model" (without Year)
                description = f"{brand} {model} {year}"
                all_car_descriptions.append(description)
    
    # Remove duplicates and create formatted annotations
    unique_descriptions = list(set(all_car_descriptions))
        
    # Create numpy arrays in the format shown in your example
    formatted_annotations = []
    if unique_descriptions:
        max_length = max(len(desc) for desc in unique_descriptions)
        
        for desc in unique_descriptions:
            formatted_annotations.append(np.array([desc], dtype=f'<U{max_length}'))
        
        annotations['car_annotations'] = np.array(formatted_annotations, dtype=object)
    
    return annotations

# Create annotations
car_annotations = create_car_annotations(dataframes)

# Sample output to verify format
print("\nSample of car annotations:")
if 'car_annotations' in car_annotations:
    for i in range(min(50, len(car_annotations['car_annotations']))):
        print(f"        {car_annotations['car_annotations'][i]},")


Sample of car annotations:
        ['Ford Escape Hybrid 4WD 2008'],
        ['BMW Z3 Coupe 1999'],
        ['Mercedes-Benz CLK320 (Cabriolet) 2001'],
        ['GMC Canyon 2WD 2007'],
        ['Chevrolet Van 15/25 2WD Conversion 2006'],
        ['Hyundai Grand i10 Magna 2015'],
        ['MINI John Cooper Works Hardtop 2 door 2024'],
        ['Mercedes-Benz AMG GLC63 S 4matic Plus Coupe 2020'],
        ['Ford Taurus Wagon 2001'],
        ['Nissan Rogue Hybrid  AWD 2017'],
        ['Infiniti QX60 AWD 2014'],
        ['Honda Prologue AWD Elite 2024'],
        ['MINI Cooper S Countryman 2011'],
        ['Acura MDX 4WD 2016'],
        ['Ford Mustang HO Convertible 2020'],
        ['Porsche 924 S 1987'],
        ['Eagle Vision 1993'],
        ['Mercury Milan 2007'],
        ['Volkswagen CC 4motion 2010'],
        ['Mazda CX-9 4WD 2023'],
        ['Infiniti QX56 4WD 2009'],
        ['BMW Z4 M40i 2022'],
        ['Honda Fit EV 2014'],
        ['Acura MDX 2019'],
        ['Toyota 4Runner 2013']

In [24]:
from scipy.io import loadmat
import torch
from torch.utils.data import Dataset
import os
from PIL import Image

class CarImageDataset(Dataset):
    def __init__(self, mat_file, image_dir, transform=None):
        # Load annotations from MAT file
        data = loadmat(mat_file)
        self.annotations = data['car_annotations']
        self.image_dir = image_dir
        self.transform = transform
        
        # Create mapping from car descriptions to image files
        self.image_mapping = self._map_descriptions_to_images()
        
    def _map_descriptions_to_images(self):
        """Map car descriptions to corresponding image files"""
        mapping = {}
        for filename in os.listdir(self.image_dir):
            if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                # Extract car info from filename - adjust based on your naming convention
                # Example: "honda_civic_2018.jpg" -> "Honda Civic 2018"
                parts = os.path.splitext(filename)[0].split('_')
                if len(parts) >= 3:
                    make = parts[0].capitalize()
                    model = parts[1].capitalize()
                    year = parts[2]
                    description = f"{make} {model} {year}"
                    mapping[description] = filename
        return mapping
        
    def __len__(self):
        return len(self.annotations)
        
    def __getitem__(self, idx):
        # Get car description
        description = self.annotations[idx][0][0]
        
        # Find matching image file
        if description in self.image_mapping:
            img_path = os.path.join(self.image_dir, self.image_mapping[description])
            image = Image.open(img_path).convert('RGB')
            
            if self.transform:
                image = self.transform(image)
                
            # Parse description components
            parts = description.split()
            if len(parts) >= 3:
                make = parts[0]
                model = ' '.join(parts[1:-1])
                year = parts[-1]
            else:
                make, model, year = 'Unknown', 'Unknown', 'Unknown'
                
            return image, make, model, year
        else:
            # Handle case where no matching image exists
            # Return placeholder or raise exception
            raise ValueError(f"No image found for description: {description}")

Saving the annotations

In [None]:
# Save annotations to .mat file
def save_mat_file(annotations, output_path):
    scipy.io.savemat(output_path, annotations)
    print(f"Annotations saved to {output_path}")

# Save the annotations
save_mat_file(car_annotations, 'cars_annotations_v2.mat')

Annotations saved to car_annotations.mat


In [28]:
# View contents of the saved MAT file
def inspect_mat_file(filepath):
    print(f"\n{'='*50}")
    print(f"Inspecting MAT file: {filepath}")
    
    # Load the MAT file
    mat_data = scipy.io.loadmat(filepath)
    
    # Show all non-system variables
    variables = [k for k in mat_data.keys() if not k.startswith('__')]
    print(f"Variables in file: {variables}")
    
    # Examine each variable
    for var_name in variables:
        data = mat_data[var_name]
        print(f"\nVariable: '{var_name}'")
        print(f"Type: {type(data)}")
        print(f"Shape: {data.shape}")
        
        # Show sample data (first 5 entries)
        print("Sample data:")
        for i in range(min(5, len(data))):
            print(f"  {i+1}. {data[i]}")

# Inspect the MAT file you created
inspect_mat_file('cars_annotations_v1.mat')


Inspecting MAT file: cars_annotations_v1.mat
Variables in file: ['annotations', 'class_names']

Variable: 'annotations'
Type: <class 'numpy.ndarray'>
Shape: (1, 16185)
Sample data:
  1. [(array(['car_ims/000001.jpg'], dtype='<U18'), array([[112]], dtype=uint8), array([[7]], dtype=uint8), array([[853]], dtype=uint16), array([[717]], dtype=uint16), array([[1]], dtype=uint8), array([[0]], dtype=uint8))
 (array(['car_ims/000002.jpg'], dtype='<U18'), array([[48]], dtype=uint8), array([[24]], dtype=uint8), array([[441]], dtype=uint16), array([[202]], dtype=uint8), array([[1]], dtype=uint8), array([[0]], dtype=uint8))
 (array(['car_ims/000003.jpg'], dtype='<U18'), array([[7]], dtype=uint8), array([[4]], dtype=uint8), array([[277]], dtype=uint16), array([[180]], dtype=uint8), array([[1]], dtype=uint8), array([[0]], dtype=uint8))
 ...
 (array(['car_ims/016183.jpg'], dtype='<U18'), array([[25]], dtype=uint8), array([[32]], dtype=uint8), array([[587]], dtype=uint16), array([[359]], dtype=uint16)