In [14]:
import pandas as pd

def clean_and_reorganize_data(file_path, output_file):
    """
    Clean and reorganize data into a tabular format with columns:
    'image_name', 'comment_number', 'comment'.

    Args:
        file_path (str): Path to the input CSV file.
        output_file (str): Path to save the cleaned CSV file.

    Returns:
        None
    """
    try:
        # Load the raw file into memory
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Clean up lines by stripping extra spaces and quotation marks
        cleaned_lines = []
        for line in lines:
            cleaned_line = line.replace('"', '').strip()  # Remove quotation marks
            cleaned_lines.append(cleaned_line)

        # Save the cleaned content temporarily
        temp_file = "temp_cleaned_file.csv"
        with open(temp_file, 'w', encoding='utf-8') as f:
            f.writelines("\n".join(cleaned_lines))

        # Read the cleaned file with a proper delimiter
        data = pd.read_csv(temp_file, delimiter='|', engine='python', skipinitialspace=True)

        # Rename columns and strip whitespace
        data.columns = ['image_name', 'comment_number', 'comment']
        data['image_name'] = data['image_name'].str.strip()
        data['comment_number'] = data['comment_number'].str.strip()
        data['comment'] = data['comment'].str.strip()

        # Debug: Print cleaned data for validation
        print("Cleaned Data:")
        print(data.head())

        # Save the cleaned data to a new CSV file
        data.to_csv(output_file, index=False)
        print(f"Cleaned data saved to {output_file}")
    except Exception as e:
        print(f"Error cleaning and reorganizing data: {e}")

# Example usage
file_path = r"C:\Users\nputta\Downloads\OMVK_MS_PROJECT\results.csv"  # Path to the raw CSV file
output_file = r"C:\Users\nputta\Downloads\OMVK_MS_PROJECT\cleaned_data.csv"  # Correct file path with .csv extension

clean_and_reorganize_data(file_path, output_file)



Cleaned Data:
       image_name comment_number  \
0  1000092795.jpg              0   
1  1000092795.jpg              1   
2  1000092795.jpg              2   
3  1000092795.jpg              3   
4  1000092795.jpg              4   

                                             comment  
0  Two young guys with shaggy hair look at their ...  
1  Two young , White males are outside near many ...  
2   Two men in green shirts are standing in a yard .  
3       A man in a blue shirt standing in a garden .  
4            Two friends enjoy time spent together .  
Cleaned data saved to C:\Users\nputta\Downloads\OMVK_MS_PROJECT\cleaned_data.csv


In [15]:
import pandas as pd

def clean_and_reorganize_data(file_path, output_file):
    """
    Clean and reorganize data into a tabular format with columns:
    'image_name', 'comment_number', 'comment'.

    Args:
        file_path (str): Path to the input CSV file.
        output_file (str): Path to save the cleaned CSV file.

    Returns:
        None
    """
    try:
        # Load the raw file into memory
        print(f"Reading the file: {file_path}")
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Debug: Check if lines are read
        if not lines:
            print("Error: Input file is empty or unreadable.")
            return
        
        # Clean up lines by stripping extra spaces and quotation marks
        cleaned_lines = []
        for line in lines:
            cleaned_line = line.replace('"', '').strip()  # Remove quotation marks
            cleaned_lines.append(cleaned_line)

        # Debug: Print first few cleaned lines
        print("Cleaned Lines (First 5):")
        print(cleaned_lines[:5])

        # Save the cleaned content temporarily
        temp_file = "temp_cleaned_file.csv"
        with open(temp_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(cleaned_lines))

        # Read the cleaned file with a proper delimiter
        print(f"Reading temporary cleaned file: {temp_file}")
        data = pd.read_csv(temp_file, delimiter='|', engine='python', skipinitialspace=True)

        # Debug: Check if data is loaded
        if data.empty:
            print("Error: No data loaded from the temporary file.")
            return
        
        # Rename columns and strip whitespace
        data.columns = ['image_name', 'comment_number', 'comment']
        data['image_name'] = data['image_name'].str.strip()
        data['comment_number'] = data['comment_number'].str.strip()
        data['comment'] = data['comment'].str.strip()

        # Debug: Print cleaned data for validation
        print("Cleaned Data (First 5 Rows):")
        print(data.head())

        # Save the cleaned data to a new CSV file
        print(f"Saving cleaned data to: {output_file}")
        data.to_csv(output_file, index=False)
        print(f"Cleaned data saved to {output_file}")
    except Exception as e:
        print(f"Error cleaning and reorganizing data: {e}")

# Example usage
file_path = r"C:\Users\nputta\Downloads\OMVK_MS_PROJECT\results.csv"  # Path to the raw CSV file
output_file = r"C:\Users\nputta\Downloads\OMVK_MS_PROJECT\cleaned_data.csv"  # Path to save the cleaned CSV file

clean_and_reorganize_data(file_path, output_file)


Reading the file: C:\Users\nputta\Downloads\OMVK_MS_PROJECT\results.csv
Cleaned Lines (First 5):
['image_name| comment_number| comment', '1000092795.jpg| 0| Two young guys with shaggy hair look at their hands while hanging out in the yard .', '1000092795.jpg| 1| Two young , White males are outside near many bushes .', '1000092795.jpg| 2| Two men in green shirts are standing in a yard .', '1000092795.jpg| 3| A man in a blue shirt standing in a garden .']
Reading temporary cleaned file: temp_cleaned_file.csv
Cleaned Data (First 5 Rows):
       image_name comment_number  \
0  1000092795.jpg              0   
1  1000092795.jpg              1   
2  1000092795.jpg              2   
3  1000092795.jpg              3   
4  1000092795.jpg              4   

                                             comment  
0  Two young guys with shaggy hair look at their ...  
1  Two young , White males are outside near many ...  
2   Two men in green shirts are standing in a yard .  
3       A man in a blu

In [16]:
import pandas as pd
import os
import json

def create_image_caption_json(csv_file_path, image_folder, output_json_file):
    """
    Create a JSON file mapping image filenames to their associated captions.

    Args:
        csv_file_path (str): Path to the cleaned CSV file.
        image_folder (str): Path to the folder containing image files.
        output_json_file (str): Path to save the resulting JSON file.

    Returns:
        None
    """
    try:
        # Load the CSV file into a DataFrame
        print(f"Loading CSV file: {csv_file_path}")
        data = pd.read_csv(csv_file_path)

        # Validate the required columns
        required_columns = ['image_name', 'comment_number', 'comment']
        for column in required_columns:
            if column not in data.columns:
                print(f"Error: Column '{column}' not found in CSV file.")
                return

        # Group captions by image_name
        grouped_data = data.groupby('image_name')['comment'].apply(list).to_dict()

        # Match image names with actual files in the image folder
        image_caption_mapping = {}
        for image_name, captions in grouped_data.items():
            image_path = os.path.join(image_folder, image_name)
            if os.path.exists(image_path):
                image_caption_mapping[image_name] = captions
            else:
                print(f"Warning: Image '{image_name}' not found in {image_folder}")

        # Debug: Print the generated mapping
        print("Generated Image-Caption Mapping (Preview):")
        print(json.dumps(image_caption_mapping, indent=4)[:500])  # Print first 500 characters

        # Save the result as a JSON file
        print(f"Saving JSON file to: {output_json_file}")
        with open(output_json_file, 'w') as json_file:
            json.dump(image_caption_mapping, json_file, indent=4)

        print(f"JSON file saved successfully to {output_json_file}")
    except Exception as e:
        print(f"Error creating image-caption JSON: {e}")

# Example usage
csv_file_path = r"C:\Users\nputta\Downloads\OMVK_MS_PROJECT\cleaned_data.csv"  # Path to the cleaned CSV file
image_folder = r"C:\Users\nputta\Downloads\OMVK_MS_PROJECT\flickr30k_images"  # Path to the folder containing images
output_json_file = r"C:\Users\nputta\OneDrive - California State University, Sacramento\De"  # Path to save JSON file

create_image_caption_json(csv_file_path, image_folder, output_json_file)


Loading CSV file: C:\Users\nputta\Downloads\OMVK_MS_PROJECT\cleaned_data.csv
Generated Image-Caption Mapping (Preview):
{
    "1000092795.jpg": [
        "Two young guys with shaggy hair look at their hands while hanging out in the yard .",
        "Two young , White males are outside near many bushes .",
        "Two men in green shirts are standing in a yard .",
        "A man in a blue shirt standing in a garden .",
        "Two friends enjoy time spent together ."
    ],
    "10002456.jpg": [
        "Several men in hard hats are operating a giant pulley system .",
        "Workers look down from up above on 
Saving JSON file to: C:\Users\nputta\OneDrive - California State University, Sacramento\De
JSON file saved successfully to C:\Users\nputta\OneDrive - California State University, Sacramento\De


In [17]:
print(output_json_file)

C:\Users\nputta\OneDrive - California State University, Sacramento\De


In [12]:
import json
import os

# Load JSON file
json_file_path = r"C:\Users\nputta\Downloads\OMVK_MS_PROJECT\Project_outputs.json.temp"
image_folder = r"C:\Users\nputta\Downloads\OMVK_MS_PROJECT\flickr30k_images" 

with open(json_file_path, 'r') as f:
    data = json.load(f)

# Map images to captions
image_paths = [os.path.join(image_folder, img) for img in data.keys()]
captions = list(data.values())
print(f"Total Images: {len(image_paths)}, Total Captions: {len(captions)}")


Total Images: 31783, Total Captions: 31783


In [13]:
from sklearn.model_selection import train_test_split

# Split into training and validation
train_image_paths, val_image_paths, train_captions, val_captions = train_test_split(
    image_paths, captions, test_size=0.2, random_state=42
)
