In [None]:
import pandas as pd

In [None]:
# Both TSVs live on Google Drive, so the Drive has to be mounted before the
# first read; mounting here keeps the notebook runnable from a fresh kernel.
from google.colab import drive
drive.mount('/content/drive')

# Load the training and public-test splits (tab-separated files).
df_train = pd.read_csv('/content/drive/MyDrive/multimodal_train.tsv', sep='\t')
df_test = pd.read_csv('/content/drive/MyDrive/multimodal_test_public.tsv', sep='\t')

In [None]:
# Show (rows, columns) of the training frame as loaded.
df_train.shape

(564000, 16)

In [None]:
# Show (rows, columns) of the public-test frame as loaded.
df_test.shape

(59319, 16)

In [None]:
# Keep only the first 205,000 rows (positions 0..204999); the rest of the
# training file is handled separately further down the notebook.
df_train = df_train.iloc[:205000]

In [None]:
# Preview the first rows of the (truncated) training frame.
df_train.head()

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,3_way_label,6_way_label
0,Alexithymia,my walgreens offbrand mucinex was engraved wit...,1551641000.0,i.imgur.com,True,awxhir,https://external-preview.redd.it/WylDbZrnbvZdB...,,2.0,12,mildlyinteresting,My Walgreens offbrand Mucinex was engraved wit...,0.84,1,0,0
1,VIDCAs17,this concerned sink with a tiny hat,1534727000.0,i.redd.it,True,98pbid,https://preview.redd.it/wsfx0gp0f5h11.jpg?widt...,,2.0,119,pareidolia,This concerned sink with a tiny hat,0.99,0,2,2
2,prometheus1123,hackers leak emails from uae ambassador to us,1496511000.0,aljazeera.com,True,6f2cy5,https://external-preview.redd.it/6fNhdbc6K1vFA...,,1.0,44,neutralnews,Hackers leak emails from UAE ambassador to US,0.92,1,0,0
3,,puppy taking in the view,1471341000.0,i.imgur.com,True,4xypkv,https://external-preview.redd.it/HLtVNhTR6wtYt...,,26.0,250,photoshopbattles,PsBattle: Puppy taking in the view,0.95,1,0,0
4,3rikR3ith,i found a face in my sheet music too,1525318000.0,i.redd.it,True,8gnet9,https://preview.redd.it/ri7ut2wn8kv01.jpg?widt...,,2.0,13,pareidolia,I found a face in my sheet music too!,0.84,0,2,2


In [None]:
# Destination on Google Drive for the training-image embeddings.
save_path = '/content/drive/MyDrive/image_embeddings_train_real.npy'

# Make Google Drive available under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import requests
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from torchvision import transforms
from google.colab import drive

# --- 1. Model and Preprocessing Setup ---
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# ResNet-18 with pretrained weights from the pinned torchvision hub release.
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)

# Feature extractor = every child module of the network except the last one,
# moved to the chosen device and switched to inference mode.
backbone_layers = list(model.children())[:-1]
feature_extractor = torch.nn.Sequential(*backbone_layers).to(device).eval()

# Image pipeline: resize, centre-crop to 224x224, convert to a tensor, then
# normalise each channel with the statistics below.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
])

# --- 2. Processing Function ---
def download_and_preprocess_image(data):
    """Download one image and convert it into a model-ready tensor.

    Args:
        data: ``(index, url)`` pair. ``index`` is passed through untouched so
            the caller can map the result back to its DataFrame row.

    Returns:
        tuple: ``(index, tensor)`` where ``tensor`` is the output of the
        module-level ``preprocess`` pipeline, or ``(index, None)`` when the
        URL is missing or the download / decode / preprocessing step fails.
    """
    index, url = data

    # Rows without a usable URL (None, NaN, empty string) can never succeed,
    # so skip the HTTP round-trip instead of letting requests raise.
    if not isinstance(url, (str, bytes)) or not url:
        return index, None

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        return index, preprocess(img)
    except Exception:
        # Deliberately best-effort: dead links, HTTP errors and undecodable
        # payloads are reported to the caller as "no tensor for this row".
        return index, None

# --- 3. Main Feature Extraction Loop ---
urls = df_train['image_url'].tolist()
url_data = list(zip(df_train.index, urls))
batch_size = 64
all_features = []        # one 2-D feature array per processed batch
successful_indices = []  # index labels of embedded rows, in feature order


def _embed_batch(tensors):
    """Run a list of preprocessed image tensors through the feature extractor.

    Returns a 2-D NumPy array with one row per input tensor.
    """
    input_batch = torch.stack(tensors).to(device)
    with torch.no_grad():
        features = feature_extractor(input_batch)
    # flatten(1) keeps the batch axis even for a single-image batch, whereas
    # squeeze() would collapse it and return a 1-D vector.
    return torch.flatten(features, 1).cpu().numpy()


with ThreadPoolExecutor(max_workers=16) as executor:
    # executor.map yields results in input order, so successful_indices and
    # the stacked features stay aligned row for row.
    results_gen = executor.map(download_and_preprocess_image, url_data)
    batch = []
    for index, tensor in tqdm(results_gen, total=len(urls), desc="Processing images"):
        if tensor is None:
            continue  # download or decode failed; the row is dropped

        successful_indices.append(index)
        batch.append(tensor)

        if len(batch) == batch_size:
            all_features.append(_embed_batch(batch))
            batch = []

    # Flush the final, possibly partial, batch.
    if batch:
        all_features.append(_embed_batch(batch))
# --- 4. Final Alignment, Verification, and Saving ---
X_img = np.vstack(all_features)
df_train_clean = df_train.loc[successful_indices]

# Row i of X_img must describe row i of df_train_clean; stop here rather than
# saving a feature matrix that no longer lines up with its metadata.
assert len(df_train_clean) == X_img.shape[0], (
    f"row/feature mismatch: {len(df_train_clean)} rows vs {X_img.shape[0]} feature vectors"
)

print("\n--- Verification ---")
print(f"Original DataFrame shape: {df_train.shape}")
print(f"Cleaned DataFrame shape:  {df_train_clean.shape}")
print(f"Image features shape:     {X_img.shape}")

# save_path is defined in the earlier cell that mounts Google Drive.
np.save(save_path, X_img)

print(f"\n✅ Embeddings saved successfully to {save_path}")
print(f"Shape of saved array: {X_img.shape}")

Using device: cuda


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0
Processing images: 100%|██████████| 205000/205000 [14:08<00:00, 241.67it/s]



--- Verification ---
Original DataFrame shape: (205000, 16)
Cleaned DataFrame shape:  (58487, 16)
Image features shape:     (58487, 512)

✅ Embeddings saved successfully to /content/drive/MyDrive/image_embeddings_train_real.npy
Shape of saved array: (58487, 512)


In [None]:
import torch
import requests
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from torchvision import transforms
from google.colab import drive

# --- 1. Model and Preprocessing Setup (Run this cell once) ---
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# ResNet-18 with pretrained weights from the pinned torchvision hub release.
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)

# Feature extractor = every child module of the network except the last one,
# moved to the chosen device and switched to inference mode.
backbone_layers = list(model.children())[:-1]
feature_extractor = torch.nn.Sequential(*backbone_layers).to(device).eval()

# Image pipeline: resize, centre-crop to 224x224, convert to a tensor, then
# normalise each channel with the statistics below.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
])

# --- 2. The Reusable Feature Extraction Function (Run this cell once) ---

# Helper function now with a timeout parameter
def _download_and_preprocess_image(data, timeout):
    index, url = data
    try:
        # The timeout is now passed into the request
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        tensor = preprocess(img)
        return index, tensor
    except Exception as e:
        return index, None

def extract_image_features(df, timeout_seconds=20):
    """
    Download every image referenced by ``df['image_url']`` and embed it.

    Images are fetched by a 16-thread pool, grouped into batches of 64 and
    passed through the module-level ``feature_extractor``. Rows whose image
    cannot be downloaded or decoded are dropped.

    Args:
        df (pd.DataFrame): The DataFrame to process (e.g., df_train or df_test).
                           It must have a column named 'image_url'.
        timeout_seconds (int): The number of seconds to wait for a server response.

    Returns:
        tuple: A tuple containing:
            - df_clean (pd.DataFrame): The rows of ``df`` whose image was
              embedded, in their original order.
            - X_img (np.ndarray): 2-D array of image features; row ``i``
              belongs to row ``i`` of ``df_clean``.
        When no image succeeds, an empty DataFrame and an empty array are
        returned instead.
    """
    print(f"\nProcessing {len(df)} URLs with a {timeout_seconds}-second timeout...")

    urls = df['image_url'].tolist()
    # Track row *positions* rather than index labels so the row/feature
    # alignment also holds when df.index contains duplicate labels.
    url_data = list(enumerate(urls))

    batch_size = 64
    all_features = []
    successful_positions = []

    def _fetch(item):
        # Bind the caller's timeout to the download helper.
        return _download_and_preprocess_image(item, timeout=timeout_seconds)

    def _embed(tensors):
        # Run one batch through the network and return a 2-D NumPy array.
        input_batch = torch.stack(tensors).to(device)
        with torch.no_grad():
            features = feature_extractor(input_batch)
        # flatten(1) keeps the batch axis even for a single-image batch,
        # whereas squeeze() would collapse it and return a 1-D vector.
        return torch.flatten(features, 1).cpu().numpy()

    with ThreadPoolExecutor(max_workers=16) as executor:
        # executor.map yields results in input order, which keeps
        # successful_positions aligned with the stacked features.
        results_gen = executor.map(_fetch, url_data)
        batch = []
        for position, tensor in tqdm(results_gen, total=len(urls), desc="Processing images"):
            if tensor is None:
                continue  # download or decode failed; the row is dropped

            successful_positions.append(position)
            batch.append(tensor)

            if len(batch) == batch_size:
                all_features.append(_embed(batch))
                batch = []

        # Flush the final, possibly partial, batch.
        if batch:
            all_features.append(_embed(batch))

    if not all_features:
        print("Warning: No images were processed successfully.")
        return pd.DataFrame(), np.array([])

    X_img = np.vstack(all_features)
    df_clean = df.iloc[successful_positions]

    print("\n--- Verification ---")
    print(f"Original DataFrame shape: {df.shape}")
    print(f"Cleaned DataFrame shape:  {df_clean.shape}")
    print(f"Image features shape:     {X_img.shape}")

    return df_clean, X_img


# --- 3. How to Use for Both df_train and df_test ---

# Embed the training rows, then the test rows, with the same 20-second timeout.
df_train_clean, X_img_train = extract_image_features(df_train, timeout_seconds=20)
df_test_clean, X_img_test = extract_image_features(df_test, timeout_seconds=20)


# --- 4. Save the Results ---
drive.mount('/content/drive')

# Training split: feature matrix plus the matching filtered rows.
train_features_path = '/content/drive/MyDrive/X_img_train.npy'
train_rows_path = '/content/drive/MyDrive/df_train_clean.csv'
np.save(train_features_path, X_img_train)
df_train_clean.to_csv(train_rows_path, index=False)
print("\n✅ Training data and features saved successfully.")

# Test split: feature matrix plus the matching filtered rows.
test_features_path = '/content/drive/MyDrive/X_img_embeddings_test.npy'
test_rows_path = '/content/drive/MyDrive/df_test_clean.csv'
np.save(test_features_path, X_img_test)
df_test_clean.to_csv(test_rows_path, index=False)
print("✅ Testing data and features saved successfully.")

Using device: cuda

Processing 59319 URLs with a 20-second timeout...


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0
Processing images: 100%|██████████| 59319/59319 [04:56<00:00, 199.84it/s]



--- Verification ---
Original DataFrame shape: (59319, 16)
Cleaned DataFrame shape:  (20199, 16)
Image features shape:     (20199, 512)
✅ Testing data and features saved successfully.


In [None]:
# Re-read the complete training TSV; the remaining rows are sliced out of it below.
df_train_half = pd.read_csv('/content/drive/MyDrive/multimodal_train.tsv', sep='\t')

In [None]:
# The first pass embedded positions 0..204999 (`iloc[:205000]`), so the
# remainder starts at position 205000. Starting at 205001 would silently
# drop one row.
df_train_half = df_train_half.iloc[205000:]

In [None]:
# Show (rows, columns) of the remaining training slice.
df_train_half.shape

(358999, 16)

In [None]:
import torch
import requests
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from torchvision import transforms
from google.colab import drive

# --- 1. Model and Preprocessing Setup (Run this cell once) ---
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# ResNet-18 with pretrained weights from the pinned torchvision hub release.
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)

# Feature extractor = every child module of the network except the last one,
# moved to the chosen device and switched to inference mode.
backbone_layers = list(model.children())[:-1]
feature_extractor = torch.nn.Sequential(*backbone_layers).to(device).eval()

# Image pipeline: resize, centre-crop to 224x224, convert to a tensor, then
# normalise each channel with the statistics below.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
])

# --- 2. The Reusable Feature Extraction Function (Run this cell once) ---

# Helper function now with a timeout parameter
def _download_and_preprocess_image(data, timeout):
    index, url = data
    try:
        # The timeout is now passed into the request
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        tensor = preprocess(img)
        return index, tensor
    except Exception as e:
        return index, None

def extract_image_features(df, timeout_seconds=20):
    """
    Download every image referenced by ``df['image_url']`` and embed it.

    Images are fetched by a 16-thread pool, grouped into batches of 64 and
    passed through the module-level ``feature_extractor``. Rows whose image
    cannot be downloaded or decoded are dropped.

    Args:
        df (pd.DataFrame): The DataFrame to process (e.g., df_train or df_test).
                           It must have a column named 'image_url'.
        timeout_seconds (int): The number of seconds to wait for a server response.

    Returns:
        tuple: A tuple containing:
            - df_clean (pd.DataFrame): The rows of ``df`` whose image was
              embedded, in their original order.
            - X_img (np.ndarray): 2-D array of image features; row ``i``
              belongs to row ``i`` of ``df_clean``.
        When no image succeeds, an empty DataFrame and an empty array are
        returned instead.
    """
    print(f"\nProcessing {len(df)} URLs with a {timeout_seconds}-second timeout...")

    urls = df['image_url'].tolist()
    # Track row *positions* rather than index labels so the row/feature
    # alignment also holds when df.index contains duplicate labels.
    url_data = list(enumerate(urls))

    batch_size = 64
    all_features = []
    successful_positions = []

    def _fetch(item):
        # Bind the caller's timeout to the download helper.
        return _download_and_preprocess_image(item, timeout=timeout_seconds)

    def _embed(tensors):
        # Run one batch through the network and return a 2-D NumPy array.
        input_batch = torch.stack(tensors).to(device)
        with torch.no_grad():
            features = feature_extractor(input_batch)
        # flatten(1) keeps the batch axis even for a single-image batch,
        # whereas squeeze() would collapse it and return a 1-D vector.
        return torch.flatten(features, 1).cpu().numpy()

    with ThreadPoolExecutor(max_workers=16) as executor:
        # executor.map yields results in input order, which keeps
        # successful_positions aligned with the stacked features.
        results_gen = executor.map(_fetch, url_data)
        batch = []
        for position, tensor in tqdm(results_gen, total=len(urls), desc="Processing images"):
            if tensor is None:
                continue  # download or decode failed; the row is dropped

            successful_positions.append(position)
            batch.append(tensor)

            if len(batch) == batch_size:
                all_features.append(_embed(batch))
                batch = []

        # Flush the final, possibly partial, batch.
        if batch:
            all_features.append(_embed(batch))

    if not all_features:
        print("Warning: No images were processed successfully.")
        return pd.DataFrame(), np.array([])

    X_img = np.vstack(all_features)
    df_clean = df.iloc[successful_positions]

    print("\n--- Verification ---")
    print(f"Original DataFrame shape: {df.shape}")
    print(f"Cleaned DataFrame shape:  {df_clean.shape}")
    print(f"Image features shape:     {X_img.shape}")

    return df_clean, X_img



# Embed the remaining training rows, using a 15-second timeout for this run.
df_train__half_clean, X_img_half_train = extract_image_features(
    df_train_half,
    timeout_seconds=15,
)


# Only the feature matrix of this slice is written to Drive in this cell.
half_features_path = '/content/drive/MyDrive/X_img_half_train.npy'
np.save(half_features_path, X_img_half_train)
print("\n✅ Training data and features saved successfully.")

Using device: cuda


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0



Processing 358999 URLs with a 15-second timeout...


Processing images: 100%|██████████| 358999/358999 [26:56<00:00, 222.11it/s]



--- Verification ---
Original DataFrame shape: (358999, 16)
Cleaned DataFrame shape:  (101342, 16)
Image features shape:     (101342, 512)

✅ Training data and features saved successfully.


In [None]:
# Merge the two DataFrames into one
df_train_full = pd.concat([df_train_clean, df_train__half_clean], ignore_index=True)

# Vertically stack the two NumPy arrays in the same order as the frames above.
# df_train_clean was returned together with X_img_train by
# extract_image_features, so that is the matrix whose rows line up with it.
# X_img comes from a separate download pass and may cover a different set of rows.
X_img_full = np.vstack((X_img_train, X_img_half_train))

# Every metadata row needs exactly one feature vector.
assert len(df_train_full) == X_img_full.shape[0], (
    f"row/feature mismatch: {len(df_train_full)} rows vs {X_img_full.shape[0]} feature vectors"
)

In [None]:
# Show (rows, columns) of the merged training frame.
df_train_full.shape

(159829, 16)

In [None]:
# Show the shape of the stacked feature matrix; its row count should equal
# the row count of df_train_full.
X_img_full.shape

(159829, 512)

In [None]:
import os
import pandas as pd
import numpy as np
from google.colab import drive

# Make sure Google Drive is mounted (asks for authorization when it is not).
drive.mount('/content/drive')

# Output folder on Drive and the two files written into it.
folder_path = '/content/drive/MyDrive/multimodel_dataset_extracted'
df_save_path = os.path.join(folder_path, 'df_train_full.csv')
x_img_save_path = os.path.join(folder_path, 'X_img_full.npy')

# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(folder_path, exist_ok=True)
print(f"Folder '{folder_path}' is ready.")

# Write the merged training rows and their feature matrix.
df_train_full.to_csv(df_save_path, index=False)
np.save(x_img_save_path, X_img_full)

print(f"\n✅ DataFrame saved successfully to: {df_save_path}")
print(f"✅ NumPy array saved successfully to: {x_img_save_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Folder '/content/drive/MyDrive/multimodel_dataset_extracted' is ready.

✅ DataFrame saved successfully to: /content/drive/MyDrive/multimodel_dataset_extracted/df_train_full.csv
✅ NumPy array saved successfully to: /content/drive/MyDrive/multimodel_dataset_extracted/X_img_full.npy


In [None]:
# Target files for the test split, inside the shared output folder.
x_img_test_save_path = os.path.join(folder_path, 'X_img_test.npy')
df_test_save_path = os.path.join(folder_path, 'df_test_clean.csv')

# Write the filtered test rows and their feature matrix.
df_test_clean.to_csv(df_test_save_path, index=False)
np.save(x_img_test_save_path, X_img_test)

print(f"✅ Test DataFrame saved successfully to: {df_test_save_path}")
print(f"✅ Test NumPy array saved successfully to: {x_img_test_save_path}")

✅ Test DataFrame saved successfully to: /content/drive/MyDrive/multimodel_dataset_extracted/df_test_clean.csv
✅ Test NumPy array saved successfully to: /content/drive/MyDrive/multimodel_dataset_extracted/X_img_test.npy


In [None]:
# Read back the CSV written for the merged training frame.
# NOTE(review): despite the name `mock_dftest`, this loads df_train_full.csv
# (training rows), presumably as a round-trip check of the export; confirm.
mock_dftest = pd.read_csv('/content/drive/MyDrive/multimodel_dataset_extracted/df_train_full.csv')

In [None]:
# Show (rows, columns) of the frame that was read back.
mock_dftest.shape

(159829, 16)

In [None]:
import numpy as np
from google.colab import drive

# 1. Google Drive is expected to be mounted already; the mount call is left disabled.
# drive.mount('/content/drive')

# 2. Path of the saved *test* embeddings (X_img_test.npy)
file_path = '/content/drive/MyDrive/multimodel_dataset_extracted/X_img_test.npy'

# 3. Load the NumPy array
# NOTE(review): this rebinds X_img_full, which earlier held the stacked
# *training* features, to the test embeddings. Confirm this is intended, or
# load into a separately named variable.
X_img_full = np.load(file_path)

# 4. Print the shape to verify it's loaded correctly
print("✅ Array loaded successfully!")
print(f"Shape of the array: {X_img_full.shape}")

✅ Array loaded successfully!
Shape of the array: (20199, 512)
