<a href="https://colab.research.google.com/github/VicDc/VIC_/blob/OPIT/8003_A3_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ------------------------------------------------------------------------------
# Multimodal Sentiment Analysis System
# --------------------------------------

In [None]:
!pip uninstall gensim -y # -y automatically answers "yes" to the uninstall prompt
!pip install gensim --no-binary :all:

Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Collecting gensim
  Downloading gensim-4.3.3.tar.gz (23.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.3/23.3 MB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [26]:

# Imports
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm  # Import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#import gensim
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, Conv2D, MaxPooling2D, Flatten, concatenate, Reshape
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
pip install gensim

In [14]:
# ------------------------------------------------------------------------------
# 1. Dataset Preparation
# ------------------------------------------------------------------------------

# Load the dataset
excel_file = "LabeledText.xlsx"
df = pd.read_excel(excel_file)

# Define image directories
image_base_path = "/content/drive/MyDrive/-Images"
image_folders = {
    "negative": os.path.join(image_base_path, "Negative"),
    "neutral": os.path.join(image_base_path, "Neutral"),
    "positive": os.path.join(image_base_path, "Positive"),
}

# Create a function to get the full image path
def get_image_path(filename, label):
    # Determine the folder based on the label
    folder_name = label.lower()
    if folder_name in image_folders:
        return os.path.join(image_folders[folder_name], filename)
    else:
        return None  # Handle cases where label doesn't match a folder

#  Create a new column 'image_path' by applying this function
df['image_path'] = df.apply(lambda row: get_image_path(row['File Name'], row['LABEL']), axis=1)


# Drop rows where image path is not found and remove rows with missing values
df.dropna(subset=['image_path', 'Caption', 'LABEL'], inplace=True)

In [20]:
# ------------------------------------------------------------------------------
# 2. NLP Component (Text Analysis)
# ------------------------------------------------------------------------------
# --- 2.1 Text Preprocessing ---
def preprocess_text(text):
    # Check if the input is a string
    if not isinstance(text, str):
      return ""  # Return empty string for non-string inputs

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove mentions and hashtags
    text = re.sub(r'\@\w+|\#','', text)
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)

df['processed_text'] = df['Caption'].apply(preprocess_text)


# --- 2.2 Feature Extraction (Word Embeddings) ---

# Tokenize the text (this part is good as is)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['processed_text'])
sequences = tokenizer.texts_to_sequences(df['processed_text'])

# Pad sequences (also good)
max_sequence_length = max(len(s) for s in sequences)  # Find the max sequence length *after* removing NaNs
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)


# --- Word2Vec (optimized) ---
sentences = [text.split() for text in df['processed_text']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Create the embedding matrix more efficiently
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))  # Initialize
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = word2vec_model.wv[word]  # Directly access vector
        embedding_matrix[i] = embedding_vector
    except KeyError:
        # Word not in Word2Vec vocabulary - leave embedding as zero
        pass  # Or handle differently (e.g., use a random vector)


# --- 2.3 Text Sentiment Classification Model ---

text_input = Input(shape=(max_sequence_length,), name='text_input')
embedding_layer = Embedding(len(tokenizer.word_index) + 1, 100, weights=[embedding_matrix], trainable=False)(text_input)
lstm_layer = LSTM(128)(embedding_layer)
text_output = Dense(128, activation='relu')(lstm_layer)  # Output for fusion

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# ------------------------------------------------------------------------------
# 3. Computer Vision Component (Image Analysis)
# ------------------------------------------------------------------------------

def preprocess_images(image_paths, target_size=(224, 224)):
    """
    Preprocesses images for a CNN.

    Args:
        image_paths (list): List of image file paths.
        target_size (tuple): Target image size (height, width).

    Returns:
        numpy.ndarray: Array of preprocessed images.
    """
    images = []
    for path in image_paths:
        img = load_img(path, target_size=target_size)
        img_array = img_to_array(img)
        img_array = preprocess_input(img_array)  # Use appropriate preprocess_input
        images.append(img_array)
    return np.array(images)


def create_cv_model(input_shape=(224, 224, 3)):
    """Creates a CNN model for sentiment classification."""
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    base_model.trainable = False  # Freeze pre-trained layers initially

    x = base_model.output
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = Dense(256, activation='relu')(x)  # Add some dense layers
    output = Dense(3, activation='softmax')(x)  # 3 output classes

    model = Model(inputs=base_model.input, outputs=output)
    return model



# --- Preprocess Image Data ---
train_images = preprocess_images(train_df['image_path'].tolist())
val_images = preprocess_images(val_df['image_path'].tolist())
test_images = preprocess_images(test_df['image_path'].tolist())

train_image_labels = train_df['LABEL'].values
val_image_labels = val_df['LABEL'].values
test_image_labels = test_df['LABEL'].values


# --- Create and Compile CV Model ---
cv_model = create_cv_model()
cv_model.compile(optimizer='adam',  # You can adjust the optimizer
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])

# --- Train CV Model ---
cv_history = cv_model.fit(
    train_images, train_image_labels,
    validation_data=(val_images, val_image_labels),
    epochs=10,  # Adjust as needed.  More epochs for image data.
    batch_size=16  # Adjust as needed
)




In [None]:
# ------------------------------------------------------------------------------
# 4. Fusion and Final Classification
# ------------------------------------------------------------------------------

def create_fusion_model(nlp_model, cv_model):
    """Creates the multimodal fusion model."""

    # Get NLP model's output (before the final classification layer)
    nlp_output = nlp_model.layers[-2].output

    # Get CV model's output (before the final classification layer)
    cv_output = cv_model.layers[-2].output

    # Concatenate the outputs
    concatenated = concatenate([nlp_output, cv_output])

    # Add a few fully connected layers for the final classification
    x = Dense(128, activation='relu')(concatenated)
    x = tf.keras.layers.Dropout(0.2)(x)  # Add dropout for regularization
    final_output = Dense(3, activation='softmax')(x)

    # Create the fusion model
    fusion_model = Model(inputs=[nlp_model.input, cv_model.input], outputs=final_output)

    return fusion_model

# --- Create Fusion Model ---
fusion_model = create_fusion_model(nlp_model, cv_model)

# --- Compile Fusion Model ---
fusion_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),  # Smaller learning rate
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

# --- Prepare Data for Fusion Model ---

# Train data
train_fusion_input = [train_text_inputs, train_images]
train_fusion_labels = train_image_labels  # Use image labels (they should be the same as text)

# Validation data
val_fusion_input = [val_text_inputs, val_images]
val_fusion_labels = val_image_labels

# Test data (for final evaluation later)
test_fusion_input = [test_text_inputs, test_images]
test_fusion_labels = test_image_labels

# --- Train Fusion Model ---
fusion_history = fusion_model.fit(
    train_fusion_input, train_fusion_labels,
    validation_data=(val_fusion_input, val_fusion_labels),
    epochs=10, # fine-tuning
    batch_size=16
)



In [None]:

# ------------------------------------------------------------------------------
# 5. Evaluation and Report
# ------------------------------------------------------------------------------

def evaluate_model(model, inputs, labels):
    """Evaluates the model and prints the results."""
    predictions = model.predict(inputs)
    predicted_labels = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predicted_labels)
    report = classification_report(labels, predicted_labels, target_names=['Negative', 'Neutral', 'Positive'])

    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(labels, predicted_labels, target_names=['Negative', 'Neutral', 'Positive']))

    return accuracy, report


# --- Evaluate NLP Model ---
print("\n--- NLP Model Evaluation ---")
nlp_accuracy, nlp_report = evaluate_model(nlp_model, test_text_inputs, test_text_labels)


# --- Evaluate CV Model ---
print("\n--- CV Model Evaluation ---")
cv_accuracy, cv_report = evaluate_model(cv_model, test_images, test_image_labels)


# --- Evaluate Fusion Model ---
print("\n--- Fusion Model Evaluation ---")
fusion_accuracy, fusion_report = evaluate_model(fusion_model, test_fusion_input, test_fusion_labels)


# --- Plot Training History (Optional) ---
def plot_training_history(history, title):
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title(f'{title} - Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title(f'{title} - Loss')
    plt.legend()

    plt.tight_layout()
    plt.show()

# Example Usage
plot_training_history(nlp_history, "NLP Model Training")
plot_training_history(cv_history, "CV Model Training")
plot_training_history(fusion_history, "Fusion Model Training")



# --- Save Models (Optional) ---

nlp_model.save("nlp_model.h5")
cv_model.save("cv_model.h5")
fusion_model.save("fusion_model.h5")



In [None]:

# --- Create Report (See below) ---
# Create a report.md or report.pdf file summarizing the project,
# including the information requested in the prompt.  The next
# cell provides a basic structure.



# ------------------------------------------------------------------------------
#  REPORT (Example Structure - Save this to a separate report.md or .pdf)
# ------------------------------------------------------------------------------

#  # Multimodal Sentiment Analysis Report

#  ## 1. Introduction

#  Briefly describe the project's goal and the dataset used.

#  ## 2. Data Preprocessing

#  ### 2.1 Text Data
#  - Describe the steps: tokenization, stopword removal (if any), lowercasing,
#    and the specific tokenizer used (BERT).
#  - Explain the `max_len` parameter and why it's important.
#  - Mention how special tokens ([CLS], [SEP]) are handled.

#  ### 2.2 Image Data
#  - Describe resizing (mention the target size).
#  - Explain normalization and the `preprocess_input` function used (specific to ResNet).

#  ## 3. Model Architectures

#  ### 3.1 NLP Model
#  - Describe the use of BERT (bert-base-uncased).
#  - Explain the input layers (input_ids, attention_mask, token_type_ids).
#  - Describe the output layer (Dense with softmax for 3 classes).
#  - Include a diagram of the model architecture if possible (you can use tools to generate this).

#  ### 3.2 Computer Vision Model
#  - Describe the use of ResNet50 (pretrained on ImageNet).
#  - Explain why you froze the base model's layers (at least initially).
#  - Describe the added layers (GlobalAveragePooling2D, Dense, and the final output layer).
#  - Include a diagram.

#  ### 3.3 Fusion Model
#  - Explain how you combined the NLP and CV models (concatenation).
#  - Describe the added layers (Dense, Dropout, and the final output layer).
#  - Include a diagram.

#  ## 4. Results and Observations

#  ### 4.1 Performance Metrics
#     present the results:
#      Accuracy, Precision, Recall, F1-score.

#  ### 4.2 Training Curves (optional)
#  - Include the training curves (accuracy and loss) for each model (NLP, CV, Fusion).

#  ### 4.3 Challenges
#  - Discuss any challenges you faced:
#    - Data imbalance.
#    - Overfitting/underfitting.
#    - Choosing hyperparameters (learning rate, batch size, epochs).
#    - Computational limitations.
#    - Difficulty in combining models.

#  ### 4.4  Discussion and Conclusion
#     briefly discuss about obtained results and suggest possible improvements.

#  ## 5. Conclusion

#  Summarize the project and its findings.  Suggest future improvements or extensions:
#    - Trying different model architectures (e.g., different CNNs, other Transformers).
#    - Using different fusion techniques (e.g., weighted averaging, attention mechanisms).
#    - Addressing data imbalance (e.g., oversampling, data augmentation).
#    - More hyperparameter tuning.
#    - Fine

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer  # Or use Lemmatization (WordNetLemmatizer)
import re

def preprocess_text(text):
    """Preprocesses text data."""
    if not isinstance(text, str):  # Handle potential missing values
        return ""

    # 1. Lowercasing
    text = text.lower()

    # 2. Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 3. Remove mentions and hashtags
    text = re.sub(r'@\w+|\#','', text)

    # 4. Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text) # Keep alphanumeric characters and spaces
    text = re.sub(r'\d+', '', text)

    # 5. Tokenization
    tokens = word_tokenize(text)

    # 6. Stopword Removal (ensure stopwords are downloaded: nltk.download('stopwords'))
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in tokens if not w in stop_words]

    # 7. Stemming (or Lemmatization)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(w) for w in filtered_tokens]
    # Alternative: Lemmatization (requires nltk.download('wordnet') and nltk.download('omw-1.4'))
    # from nltk.stem import WordNetLemmatizer
    # lemmatizer = WordNetLemmatizer()
    # lemmatized_tokens = [lemmatizer.lemmatize(w) for w in filtered_tokens]


    return " ".join(stemmed_tokens) # Return as a string, important for vectorization

def preprocess_text_data(csv_path):
    """Loads, preprocesses, and returns text data."""
    df = pd.read_csv(csv_path)
    df['processed_text'] = df['text'].apply(preprocess_text)
    return df

# Example Usage (you'll call this from train.py)
if __name__ == '__main__':
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    processed_df = preprocess_text_data("data/data.csv")  # Replace with your CSV path
    print(processed_df.head())
    processed_df.to_csv("data/processed_data.csv", index=False) # Save the preprocessed data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


FileNotFoundError: [Errno 2] No such file or directory: 'data/data.csv'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report

def train_text_model(processed_csv_path, model_save_path="text_model"):
    """Trains a BERT-based text sentiment classification model."""

    df = pd.read_csv(processed_csv_path)

    # Handle missing data explicitly
    df = df.dropna(subset=['processed_text', 'sentiment'])

    # Convert sentiment labels to numerical values
    sentiment_map = {'positive': 0, 'negative': 1, 'neutral': 2}
    df['label'] = df['sentiment'].map(sentiment_map)
    if df['label'].isnull().any():
        raise ValueError("Invalid sentiment labels found in the dataset.")

    # Split data into training and validation sets
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    # 1. Load Pre-trained BERT Tokenizer and Model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3) # 3 classes

    # 2. Tokenize and Prepare Data for BERT
    def prepare_data(df, tokenizer):
      input_ids = []
      attention_masks = []

      for text in df['processed_text']:
          encoded_dict = tokenizer.encode_plus(
              text,
              add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
              max_length=128,          # Pad & truncate all sentences.
              padding='max_length',
              truncation=True,
              return_attention_mask=True,   # Construct attn. masks.
              return_tensors='pt',     # Return pytorch tensors.
          )
          input_ids.append(encoded_dict['input_ids'])
          attention_masks.append(encoded_dict['attention_mask'])
      return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)


    train_inputs, train_masks = prepare_data(train_df, tokenizer)
    train_labels = torch.tensor(train_df['label'].values)
    val_inputs, val_masks = prepare_data(val_df, tokenizer)
    val_labels = torch.tensor(val_df['label'].values)

    # 3. Create DataLoaders
    batch_size = 32  # Adjust as needed

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    val_data = TensorDataset(val_inputs, val_masks, val_labels)
    val_dataloader = DataLoader(val_data, batch_size=batch_size)


    # 4. Set up Optimizer and Scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    epochs = 3  # Adjust as needed

    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


    # 5. Training Loop
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            inputs, masks, labels = batch

            model.zero_grad()
            outputs = model(inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}")

    # 6. Evaluation
    model.eval()
    predictions, true_labels = [], []

    for batch in val_dataloader:
      batch = tuple(t.to(device) for t in batch)
      inputs, masks, labels = batch

      with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)

      logits = outputs.logits
      predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
      true_labels.extend(labels.cpu().numpy())

    print(classification_report(true_labels, predictions, target_names=sentiment_map.keys()))
    print("Accuracy:", accuracy_score(true_labels, predictions))


    # 7. Save the Model
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

if __name__ == '__main__':
    train_text_model("data/processed_data.csv")

In [None]:
from PIL import Image
import os
import pandas as pd
import numpy as np

def preprocess_images(image_dir, csv_path, output_dir, target_size=(224, 224)):
    """Preprocesses images (resizing and normalizing)."""
    df = pd.read_csv(csv_path)
    os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist
    processed_image_paths = []

    for index, row in df.iterrows():
        image_path = os.path.join(image_dir, row['image_path'])
        try:
            img = Image.open(image_path).convert('RGB')  # Handle potential grayscale images
            img = img.resize(target_size)
            # Normalization (using ImageNet statistics - common practice)
            img_array = np.array(img) / 255.0
            mean = np.array([0.485, 0.456, 0.406])
            std = np.array([0.229, 0.224, 0.225])
            img_array = (img_array - mean) / std

            # Save the processed image
            processed_image_path = os.path.join(output_dir, f"processed_{index}.npy")
            np.save(processed_image_path, img_array) # Save as numpy array for efficiency
            processed_image_paths.append(processed_image_path)

        except (FileNotFoundError, OSError) as e:
            print(f"Error processing image {image_path}: {e}")
            processed_image_paths.append(None) # Important: Keep track of failed images

    df['processed_image_path'] = processed_image_paths
    return df

# Example Usage (called from train.py)
if __name__ == '__main__':
    updated_df = preprocess_images("data/images", "data/data.csv", "data/processed_images")
    updated_df.to_csv("data/data_with_processed_images.csv", index=False) # Update the CSV
    print(updated_df.head())

In [None]:
from PIL import Image
import os
import pandas as pd
import numpy as np

def preprocess_images(image_dir, csv_path, output_dir, target_size=(224, 224)):
    """Preprocesses images (resizing and normalizing)."""
    df = pd.read_csv(csv_path)
    os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist
    processed_image_paths = []

    for index, row in df.iterrows():
        image_path = os.path.join(image_dir, row['image_path'])
        try:
            img = Image.open(image_path).convert('RGB')  # Handle potential grayscale images
            img = img.resize(target_size)
            # Normalization (using ImageNet statistics - common practice)
            img_array = np.array(img) / 255.0
            mean = np.array([0.485, 0.456, 0.406])
            std = np.array([0.229, 0.224, 0.225])
            img_array = (img_array - mean) / std

            # Save the processed image
            processed_image_path = os.path.join(output_dir, f"processed_{index}.npy")
            np.save(processed_image_path, img_array) # Save as numpy array for efficiency
            processed_image_paths.append(processed_image_path)

        except (FileNotFoundError, OSError) as e:
            print(f"Error processing image {image_path}: {e}")
            processed_image_paths.append(None) # Important: Keep track of failed images

    df['processed_image_path'] = processed_image_paths
    return df

# Example Usage (called from train.py)
if __name__ == '__main__':
    updated_df = preprocess_images("data/images", "data/data.csv", "data/processed_images")
    updated_df.to_csv("data/data_with_processed_images.csv", index=False) # Update the CSV
    print(updated_df.head())