<a href="https://colab.research.google.com/github/SoheliPaul/LLM/blob/main/2348561_LLM_Lab8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pydub
!pip install librosa

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image, UnidentifiedImageError
from pydub import AudioSegment
import librosa
import numpy as np
import torch.nn.functional as F
import os

# Single source of truth for the checkpoint name, so the model weights and
# the processor are always loaded from the same pretrained release.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

# Load the CLIP model and its processor for handling text and image inputs.
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

def process_text(text):
    """Embed a single text string with the CLIP text encoder.

    Args:
        text: The string to embed. It is wrapped in a one-element batch
            before being handed to the processor.

    Returns:
        The tensor produced by ``model.get_text_features`` for that batch
        (``[1, 512]`` in the run recorded in this notebook). It is computed
        without gradient tracking.
    """
    # truncation=True keeps prompts longer than the tokenizer's maximum
    # sequence length from failing inside the text encoder.
    inputs = processor(text=[text], return_tensors="pt", padding=True, truncation=True)
    # Feature extraction only: no need to build an autograd graph.
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)
    return text_features

def process_image(image_path):
    """Embed an image file with the CLIP image encoder.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The tensor produced by ``model.get_image_features`` (``[1, 512]`` in
        the run recorded in this notebook), computed without gradient
        tracking, or ``None`` when the file does not exist or PIL cannot
        identify it as an image.
    """
    if not os.path.exists(image_path):
        print(f"Image file not found: {image_path}")
        return None

    try:
        # The context manager releases the file handle. convert() forces the
        # lazily-opened image to be decoded while the file is still open and
        # yields a 3-channel copy that outlives the `with` block.
        with Image.open(image_path) as opened:
            image = opened.convert("RGB")
    except UnidentifiedImageError:
        print(f"Cannot identify image file: {image_path}")
        return None

    # No padding argument: there is no text in this call to pad.
    inputs = processor(images=image, return_tensors="pt")
    # Feature extraction only: no need to build an autograd graph.
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
    return image_features

def process_audio(audio_path, n_mels=128):
    """Turn an audio file into a dB-scaled Mel-spectrogram tensor.

    The file is decoded with pydub, down-mixed to mono, scaled to the
    ``[-1, 1]`` range and passed to librosa's Mel-spectrogram.

    Args:
        audio_path: Path of the audio file to load.
        n_mels: Number of Mel bands to compute (defaults to 128).

    Returns:
        A float32 tensor of shape ``[1, n_mels, frames]``, or ``None`` when
        the file is missing, cannot be decoded, or contains no samples.
    """
    if not os.path.exists(audio_path):
        print(f"Audio file not found: {audio_path}")
        return None

    # Local import: only AudioSegment is imported at file level.
    from pydub.exceptions import CouldntDecodeError

    try:
        audio = AudioSegment.from_file(audio_path)
    except CouldntDecodeError:
        # Mirror process_image: report an unreadable file and return None.
        print(f"Cannot decode audio file: {audio_path}")
        return None

    audio = audio.set_channels(1)  # Convert to mono
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
    if samples.size == 0:
        print(f"Audio file contains no samples: {audio_path}")
        return None

    # Scale integer PCM by its full-scale value. Dividing by the L2 norm of
    # the whole clip (the previous approach) made every sample smaller the
    # longer the recording was, so the dB conversion below depended on clip
    # length rather than on the signal alone.
    samples /= audio.max_possible_amplitude

    # Extract features (Mel-spectrogram, then power -> dB relative to the peak)
    mel_spec = librosa.feature.melspectrogram(y=samples, sr=audio.frame_rate, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Convert to tensor and add the batch dimension
    audio_features = torch.tensor(mel_spec_db, dtype=torch.float32)
    return audio_features.unsqueeze(0)

def combine_features(features_list, method="concat", normalize=False, target_dim=512):
    """Fuse per-modality feature tensors into one representation.

    Every non-``None`` entry is first brought to shape ``[batch, target_dim]``:
    a 1-D tensor gains a leading batch axis, a 3-D tensor (such as the audio
    spectrogram) is averaged over its last axis, and a 2-D tensor whose width
    differs from ``target_dim`` is resized with adaptive average pooling.

    Args:
        features_list: Iterable of tensors; ``None`` entries are ignored.
        method: ``"concat"`` joins along the last axis, ``"add"`` sums
            element-wise, ``"mean"`` averages element-wise.
        normalize: When true, each aligned feature is L2-normalised along
            its last axis before fusion.
        target_dim: Common feature width every modality is resized to.

    Returns:
        The fused tensor, or ``None`` (after printing a message) when there
        is nothing to combine or ``method`` is not recognised.
    """
    features_list = [f for f in features_list if f is not None]

    if len(features_list) == 0:
        print("No valid features to combine.")
        return None

    # Reject an unknown method before doing any tensor work.
    if method not in ("concat", "add", "mean"):
        print(f"Unknown combination method: {method}")
        return None

    processed_features = []
    for f in features_list:
        if f.dim() == 1:
            # Treat a bare vector as a batch of one so it can be fused with
            # the [batch, dim] tensors of the other modalities.
            f = f.unsqueeze(0)
        elif f.dim() == 3:
            # e.g. audio: [1, 128, 762] -> [1, 128]
            f = torch.mean(f, dim=-1)

        if f.dim() == 2 and f.shape[-1] != target_dim:
            # [B, D] -> [B, 1, D] -> pool -> [B, 1, target_dim] -> [B, target_dim]
            f = F.adaptive_avg_pool1d(f.unsqueeze(1), target_dim).squeeze(1)

        if normalize:
            # Normalise the aligned vector that is actually fused, not the
            # raw input (for audio that used to mean the time axis).
            f = F.normalize(f, dim=-1)

        processed_features.append(f)

    if method == "concat":
        return torch.cat(processed_features, dim=-1)

    stacked = torch.stack(processed_features)
    if method == "add":
        return torch.sum(stacked, dim=0)
    return torch.mean(stacked, dim=0)

def main(text_input, image_path, audio_path, combine_method="concat", normalize=False):
    """Run the demo: extract features per modality, report shapes, fuse them.

    Args:
        text_input: Text passed to ``process_text``.
        image_path: Path passed to ``process_image``.
        audio_path: Path passed to ``process_audio``.
        combine_method: Forwarded to ``combine_features`` as ``method``.
        normalize: Forwarded to ``combine_features`` as ``normalize``.

    Returns:
        None. The shapes of the individual and combined features are printed.
    """
    # Extract all three modalities up front (text, then image, then audio).
    text_features, image_features, audio_features = (
        process_text(text_input),
        process_image(image_path),
        process_audio(audio_path),
    )

    # Report the shape of every modality that produced features.
    if text_features is not None:
        print(f"Text features: {text_features.shape}")
    if image_features is not None:
        print(f"Image features: {image_features.shape}")
    if audio_features is not None:
        print(f"Audio features: {audio_features.shape}")

    # Fuse whatever is available; nothing more to report if fusion failed.
    modalities = [text_features, image_features, audio_features]
    combined_features = combine_features(modalities, method=combine_method, normalize=normalize)
    if combined_features is None:
        return
    print(f"Combined features shape: {combined_features.shape}")

if __name__ == "__main__":
    # Demo inputs: one caption, one image and one audio clip.
    text_input, image_path, audio_path = (
        "A photo of a dog",
        "/content/happy_couple.jpg",
        "/content/CoupleAudio.wav",
    )
    # Fusion settings for the demo run.
    combine_method, normalize = "concat", False

    main(
        text_input,
        image_path,
        audio_path,
        combine_method=combine_method,
        normalize=normalize,
    )


Text features: torch.Size([1, 512])
Image features: torch.Size([1, 512])
Audio features: torch.Size([1, 128, 762])
Combined features shape: torch.Size([1, 1536])


In [None]:
# prompt: Develop a Python script for handling multimodal inputs.

def multimodal_processing(text=None, image_path=None, audio_path=None):
  """Processes multimodal inputs and returns a combined feature representation.

  Args:
    text: Optional text to embed; skipped when empty or None.
    image_path: Optional image path; skipped when empty or None.
    audio_path: Optional audio path; skipped when empty or None.

  Returns:
    The concatenated feature tensor produced by ``combine_features``, or
    None when no modality was supplied or none could be processed.
  """
  feature_list = []
  if text:
    feature_list.append(process_text(text))
  if image_path:
    feature_list.append(process_image(image_path))
  if audio_path:
    feature_list.append(process_audio(audio_path))

  # The image/audio loaders return None for unreadable files; drop those
  # entries so they never reach the tensor ops below.
  feature_list = [f for f in feature_list if f is not None]
  if not feature_list:
    return None

  # A plain torch.cat cannot join the 2-D CLIP embeddings with the 3-D audio
  # spectrogram; combine_features aligns the shapes before concatenating.
  return combine_features(feature_list, method="concat")


In [None]:
# Same demo inputs as the first run. The image file name uses an underscore
# ("happy_couple.jpg"), matching the path that loaded successfully there.
text_input = "A photo of a dog"
image_path = "/content/happy_couple.jpg"
audio_path = "/content/CoupleAudio.wav"
multimodal_processing(text_input, image_path, audio_path)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)