In [None]:
# Mount Google Drive at /content/drive; the later cells read and write files under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
 # Pinned to the version this notebook was originally run with (see the install log below)
 # so that a fresh runtime reproduces the same environment. %pip targets the kernel's environment.
 %pip install clip-anytorch==2.6.0

Collecting clip-anytorch
  Downloading clip_anytorch-2.6.0-py3-none-any.whl.metadata (8.4 kB)
Collecting ftfy (from clip-anytorch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading clip_anytorch-2.6.0-py3-none-any.whl (1.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.4/1.4 MB 35.8 MB/s eta 0:00:00
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.8/44.8 kB 4.5 MB/s eta 0:00:00
Installing collected packages: ftfy, clip-anytorch
Successfully installed clip-anytorch-2.6.0 ftfy-6.3.1


In [None]:
# List the top level of the mounted Drive (MyDrive).
!ls /content/drive/MyDrive/


 clip_embeddings_multimodal
 Code
'Colab Notebooks'
'Copy of bertmodelpreview.ipynb'
'Fake News Detection: LAIR Dataset Project (1).gdoc'
'Fake News Detection: LAIR Dataset Project.gdoc'
'Fake News Detection Project Guidance.gdoc'
 LibraryManagement
 lightgbm_model.pkl
 ML-Project
 results
 results_local


In [None]:
# List the contents of the clip_embeddings_multimodal folder on Drive (the folder the cells below write into).
!ls /content/drive/MyDrive/clip_embeddings_multimodal/


In [None]:
# --- Imports ---
import os
import torch
import requests
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import clip
from google.colab import drive

# --- 1. Mount Google Drive ---
# force_remount=True re-attaches the Drive even if an earlier cell already mounted it.
drive.mount('/content/drive', force_remount=True)

# --- 2. Paths ---
# Output folder for the embeddings and cleaned CSVs; created if it does not exist yet.
folder_path = '/content/drive/MyDrive/clip_embeddings_multimodal'
os.makedirs(folder_path, exist_ok=True)

# Tab-separated input files.
train_path, test_path = (
    '/content/drive/MyDrive/ML-Project/multimodal_train.tsv',
    '/content/drive/MyDrive/ML-Project/multimodal_test_public.tsv',
)

# --- 3. Load Datasets ---
df_train, df_test = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in (train_path, test_path)
)

print(f"✅ Train shape: {df_train.shape}")
print(f"✅ Test shape:  {df_test.shape}")

# Split train dataset into 2 halves; each half gets a fresh 0..n-1 index.
mid_idx = len(df_train) // 2
df_train_half_1, df_train_half_2 = (
    df_train.iloc[rows].reset_index(drop=True)
    for rows in (slice(None, mid_idx), slice(mid_idx, None))
)
print(f"✅ Split into halves: {len(df_train_half_1)} + {len(df_train_half_2)}")

# --- 4. Setup CLIP ---
# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# clip.load returns the model together with the image preprocessing callable used by the helper below.
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

# --- 5. Helper: download & preprocess ---
def _download_and_preprocess_image(data, timeout):
    index, url = data
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        tensor = preprocess(img)
        return index, tensor
    except Exception:
        return index, None

# --- 6. Main feature extraction function ---
def extract_image_features(df, timeout_seconds=20, batch_size=64, max_workers=16):
    """Download every image in ``df['image_url']`` and encode it with CLIP.

    Downloads run in a thread pool; successfully preprocessed images are
    grouped into batches and encoded with the module-level ``model`` on the
    module-level ``device``. Each embedding is divided by its L2 norm.

    Args:
        df: DataFrame with an ``image_url`` column.
        timeout_seconds: per-request timeout passed to the download helper.
        batch_size: number of images encoded per ``model.encode_image`` call.
        max_workers: number of download threads.

    Returns:
        ``(df_clean, X_img)`` where ``df_clean`` holds only the rows whose
        image was fetched and preprocessed successfully (index reset) and
        ``X_img`` has one embedding row per row of ``df_clean``, in the same
        order. If no image succeeds, returns ``(pd.DataFrame(), np.array([]))``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print(f"\nProcessing {len(df)} URLs with a {timeout_seconds}-second timeout...")

    urls = df['image_url'].tolist()
    # Key each URL by its row POSITION rather than its index label. With a
    # non-unique index, label-based selection (df.loc) would return extra rows
    # and silently break the row <-> embedding alignment.
    url_data = list(enumerate(urls))

    all_features = []
    successful_positions = []

    def _fetch(item):
        # Bind the timeout so the helper can be used with executor.map.
        return _download_and_preprocess_image(item, timeout=timeout_seconds)

    def _encode(tensors):
        # Encode one batch of preprocessed images and return unit-norm features as a NumPy array.
        input_batch = torch.stack(tensors).to(device)
        with torch.no_grad():
            features = model.encode_image(input_batch)
            features = features / features.norm(dim=-1, keepdim=True)
        return features.cpu().numpy()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order, so the order of
        # successful_positions matches the order of the encoded rows.
        results_gen = executor.map(_fetch, url_data)
        batch = []
        for position, tensor in tqdm(results_gen, total=len(urls), desc="Processing images"):
            if tensor is None:
                continue  # download or decode failed; row is dropped
            successful_positions.append(position)
            batch.append(tensor)

            if len(batch) >= batch_size:
                all_features.append(_encode(batch))
                batch = []

        # Process remainder
        if batch:
            all_features.append(_encode(batch))

    if not all_features:
        print("⚠️ No images processed successfully.")
        return pd.DataFrame(), np.array([])

    X_img = np.vstack(all_features)
    df_clean = df.iloc[successful_positions].reset_index(drop=True)

    print("\n--- Verification ---")
    print(f"Original DataFrame shape: {df.shape}")
    print(f"Cleaned DataFrame shape:  {df_clean.shape}")
    print(f"Image features shape:     {X_img.shape}")

    return df_clean, X_img

# --- 7. Process & Save ---

def _run_split(frame, npy_name, csv_name, done_message):
    """Extract image features for one split and write both artefacts into folder_path.

    Returns the (cleaned DataFrame, embeddings) pair produced by extract_image_features.
    """
    kept_rows, embeddings = extract_image_features(frame, timeout_seconds=15)
    np.save(os.path.join(folder_path, npy_name), embeddings)
    kept_rows.to_csv(os.path.join(folder_path, csv_name), index=False)
    print(done_message)
    return kept_rows, embeddings

# 7.1 Train Half 1
df_train_half1_clean, X_img_half1 = _run_split(
    df_train_half_1,
    "X_img_train_half1.npy",
    "df_train_half1_clean.csv",
    "\n✅ Saved Train Half 1 embeddings and cleaned CSV.",
)

# 7.2 Train Half 2
df_train_half2_clean, X_img_half2 = _run_split(
    df_train_half_2,
    "X_img_train_half2.npy",
    "df_train_half2_clean.csv",
    "\n✅ Saved Train Half 2 embeddings and cleaned CSV.",
)

# 7.3 Test Set
df_test_clean, X_img_test = _run_split(
    df_test,
    "X_img_test.npy",
    "df_test_clean.csv",
    "\n✅ Saved Test embeddings and cleaned CSV.",
)

print("\n🎉 All CLIP image embeddings processed and saved successfully!")


Mounted at /content/drive
✅ Train shape: (564000, 16)
✅ Test shape:  (59319, 16)
✅ Split into halves: 282000 + 282000
Using device: cuda


100%|███████████████████████████████████████| 354M/354M [00:04<00:00, 72.2MiB/s]



Processing 282000 URLs with a 15-second timeout...


Processing images: 100%|██████████| 282000/282000 [23:08<00:00, 203.15it/s] 



--- Verification ---
Original DataFrame shape: (282000, 16)
Cleaned DataFrame shape:  (81705, 16)
Image features shape:     (81705, 512)

✅ Saved Train Half 1 embeddings and cleaned CSV.

Processing 282000 URLs with a 15-second timeout...


Processing images: 100%|██████████| 282000/282000 [21:58<00:00, 213.82it/s]



--- Verification ---
Original DataFrame shape: (282000, 16)
Cleaned DataFrame shape:  (78481, 16)
Image features shape:     (78481, 512)

✅ Saved Train Half 2 embeddings and cleaned CSV.

Processing 59319 URLs with a 15-second timeout...


Processing images: 100%|██████████| 59319/59319 [04:58<00:00, 198.84it/s]



--- Verification ---
Original DataFrame shape: (59319, 16)
Cleaned DataFrame shape:  (16696, 16)
Image features shape:     (16696, 512)

✅ Saved Test embeddings and cleaned CSV.

🎉 All CLIP image embeddings processed and saved successfully!


In [None]:
# --- Imports ---
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import clip
from google.colab import drive

# --- 1. Mount Google Drive ---
drive.mount('/content/drive', force_remount=True)

# --- 2. Paths ---
folder_path = '/content/drive/MyDrive/clip_embeddings_multimodal'
os.makedirs(folder_path, exist_ok=True)

# --- 3. Load cleaned DataFrames (from image embedding extraction) ---
# Join bare file names onto folder_path. An absolute path as the second
# argument makes os.path.join discard folder_path entirely, so the folder
# variable would have no effect on where the files are read from.
df_train_half1_clean = pd.read_csv(os.path.join(folder_path, "df_train_half1_clean.csv"))
df_train_half2_clean = pd.read_csv(os.path.join(folder_path, "df_train_half2_clean.csv"))
df_test_clean = pd.read_csv(os.path.join(folder_path, "df_test_clean.csv"))

# Combine both train halves (half 1 rows first, then half 2) under a fresh index.
df_train_clean = pd.concat([df_train_half1_clean, df_train_half2_clean], ignore_index=True)
print("✅ Loaded and combined cleaned DataFrames corresponding to image embeddings:")
print(f"Train combined shape: {df_train_clean.shape}")
print(f"Test shape:           {df_test_clean.shape}")

# --- 4. Setup CLIP ---
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

# --- 5. Text Embedding Function ---
def extract_text_features(df, text_column, batch_size=64):
    """
    Extracts CLIP text embeddings for a given column in a DataFrame.

    Uses the module-level ``model`` and ``device``. Texts are tokenized with
    ``truncate=True`` and each embedding is divided by its L2 norm.

    Args:
        df: DataFrame containing ``text_column``.
        text_column: name of the column to encode.
        batch_size: number of texts encoded per ``model.encode_text`` call.

    Returns:
        ``(df, X_txt)``: the input DataFrame unchanged and an array with one
        embedding row per row of ``df``, in the same order. For an empty
        DataFrame, ``X_txt`` is ``np.array([])``.
    """
    print(f"\nExtracting text embeddings from column: '{text_column}' ({len(df)} rows)")

    column = df[text_column]
    # Missing values become "" instead of the literal string "nan" that a
    # plain astype(str) would produce (and which would then be embedded as a word).
    texts = column.astype(object).where(column.notna(), "").astype(str).tolist()

    if not texts:
        # np.vstack([]) raises on an empty list; mirror the empty result used by the image extractor.
        print("⚠️ No text rows to encode.")
        return df, np.array([])

    all_features = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding text"):
        batch_texts = texts[i:i+batch_size]
        tokens = clip.tokenize(batch_texts, truncate=True).to(device)
        with torch.no_grad():
            text_features = model.encode_text(tokens)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        all_features.append(text_features.cpu().numpy())

    X_txt = np.vstack(all_features)
    print(f"✅ Extracted text feature shape: {X_txt.shape}")
    return df, X_txt

# --- 6. Process & Save ---

# 🟦 Training (combined halves)
train_txt_npy_path = os.path.join(folder_path, "X_txt_train.npy")
train_txt_csv_path = os.path.join(folder_path, "df_txt_train_clean.csv")
df_train_clean, X_txt_train = extract_text_features(
    df_train_clean,
    text_column='clean_title',
)
np.save(train_txt_npy_path, X_txt_train)
df_train_clean.to_csv(train_txt_csv_path, index=False)
print("\n✅ Saved combined training text embeddings and cleaned CSV.")

# 🟨 Test set
test_txt_npy_path = os.path.join(folder_path, "X_txt_test.npy")
test_txt_csv_path = os.path.join(folder_path, "df_txt_test_clean.csv")
df_test_clean, X_txt_test = extract_text_features(
    df_test_clean,
    text_column='clean_title',
)
np.save(test_txt_npy_path, X_txt_test)
df_test_clean.to_csv(test_txt_csv_path, index=False)
print("\n✅ Saved test text embeddings and cleaned CSV.")

print("\n🎉 All CLIP text embeddings extracted successfully for image-matched rows!")


Mounted at /content/drive
✅ Loaded and combined cleaned DataFrames corresponding to image embeddings:
Train combined shape: (160186, 16)
Test shape:           (16696, 16)
Using device: cuda

Extracting text embeddings from column: 'clean_title' (160186 rows)


Encoding text: 100%|██████████| 2503/2503 [01:51<00:00, 22.53it/s]


✅ Extracted text feature shape: (160186, 512)

✅ Saved combined training text embeddings and cleaned CSV.

Extracting text embeddings from column: 'clean_title' (16696 rows)


Encoding text: 100%|██████████| 261/261 [00:11<00:00, 22.03it/s]


✅ Extracted text feature shape: (16696, 512)

✅ Saved test text embeddings and cleaned CSV.

🎉 All CLIP text embeddings extracted successfully for image-matched rows!


In [None]:
import os
import numpy as np
import pandas as pd
from google.colab import drive

# --- 1. Mount Google Drive ---
drive.mount('/content/drive', force_remount=True)

# --- 2. Paths ---
folder_path = '/content/drive/MyDrive/clip_embeddings_multimodal'

# --- 3. Load image halves ---
img_half1_path = os.path.join(folder_path, "X_img_train_half1.npy")
img_half2_path = os.path.join(folder_path, "X_img_train_half2.npy")
df_half1_path = os.path.join(folder_path, "df_train_half1_clean.csv")
df_half2_path = os.path.join(folder_path, "df_train_half2_clean.csv")

print("Loading halves...")
X_img_half1 = np.load(img_half1_path)
X_img_half2 = np.load(img_half2_path)
df_train_half1 = pd.read_csv(df_half1_path)
df_train_half2 = pd.read_csv(df_half2_path)

# Check each half on its own first: two mismatches that cancel out would
# pass a total-length check while leaving the rows shifted.
assert len(df_train_half1) == len(X_img_half1), "⚠️ Length mismatch in train half 1!"
assert len(df_train_half2) == len(X_img_half2), "⚠️ Length mismatch in train half 2!"

# --- 4. Combine halves ---
# Both are stacked half 1 first, then half 2, so row i of the DataFrame
# still corresponds to row i of the embedding matrix.
print("\nCombining...")
X_img_train = np.vstack([X_img_half1, X_img_half2])
df_train_clean = pd.concat([df_train_half1, df_train_half2], ignore_index=True)

# --- 5. Verify alignment BEFORE writing, so misaligned data is never persisted ---
assert len(df_train_clean) == len(X_img_train), "⚠️ Length mismatch between embeddings and DataFrame!"

# --- 6. Save combined files ---
np.save(os.path.join(folder_path, "X_img_train.npy"), X_img_train)
df_train_clean.to_csv(os.path.join(folder_path, "df_train_clean.csv"), index=False)

print("\n✅ Combined image embeddings and cleaned DataFrames saved successfully!")
print(f"Final image embedding shape: {X_img_train.shape}")
print(f"Final DataFrame shape:       {df_train_clean.shape}")

print("\n✅ Alignment verified: same number of rows in DataFrame and embeddings.")


Mounted at /content/drive
Loading halves...

Combining...

✅ Combined image embeddings and cleaned DataFrames saved successfully!
Final image embedding shape: (160186, 512)
Final DataFrame shape:       (160186, 16)

✅ Alignment verified: same number of rows in DataFrame and embeddings.


In [None]:
import os
import numpy as np
import pandas as pd
from google.colab import drive

# --- 1. Mount Drive ---
drive.mount('/content/drive', force_remount=True)

# --- 2. Paths ---
# Inputs are read from old_folder; the consolidated copies are written to new_folder.
old_folder = '/content/drive/MyDrive/clip_embeddings_multimodal'
new_folder = '/content/drive/MyDrive/ML-Project/clip_embeddings_multimodal'
os.makedirs(new_folder, exist_ok=True)

# --- 3. Load and combine TRAIN image halves ---
print("📦 Loading and combining TRAIN image halves...")
X_img_half1 = np.load(os.path.join(old_folder, "X_img_train_half1.npy"))
X_img_half2 = np.load(os.path.join(old_folder, "X_img_train_half2.npy"))
X_img_train = np.vstack([X_img_half1, X_img_half2])

df_train_half1 = pd.read_csv(os.path.join(old_folder, "df_train_half1_clean.csv"))
df_train_half2 = pd.read_csv(os.path.join(old_folder, "df_train_half2_clean.csv"))
df_train_clean = pd.concat([df_train_half1, df_train_half2], ignore_index=True)

print(f"✅ Combined train image embeddings: {X_img_train.shape}")
print(f"✅ Combined train DataFrame: {df_train_clean.shape}")

# --- 4. Load TRAIN text embeddings ---
print("\n📦 Loading TRAIN text embeddings...")
txt_half1_path = os.path.join(old_folder, "X_txt_train_half1.npy")
txt_half2_path = os.path.join(old_folder, "X_txt_train_half2.npy")
txt_full_path = os.path.join(old_folder, "X_txt_train.npy")

# Prefer the single combined file; fall back to the two halves only if both
# actually exist, and otherwise fail with a message naming what is missing.
if os.path.exists(txt_full_path):
    print("✅ Found single combined text embedding file.")
    X_txt_train = np.load(txt_full_path)
elif os.path.exists(txt_half1_path) and os.path.exists(txt_half2_path):
    print("ℹ️ Half text embeddings found — combining...")
    X_txt_half1 = np.load(txt_half1_path)
    X_txt_half2 = np.load(txt_half2_path)
    X_txt_train = np.vstack([X_txt_half1, X_txt_half2])
else:
    raise FileNotFoundError(
        f"No train text embeddings in {old_folder}: expected X_txt_train.npy "
        "or both X_txt_train_half1.npy and X_txt_train_half2.npy"
    )

# --- Sanity check ---
assert len(df_train_clean) == len(X_img_train) == len(X_txt_train), \
    "❌ Mismatch between training data lengths!"

# --- 5. Load TEST data ---
print("\n📦 Loading TEST data...")
X_img_test = np.load(os.path.join(old_folder, "X_img_test.npy"))
df_test_clean = pd.read_csv(os.path.join(old_folder, "df_test_clean.csv"))

# The test text embeddings are required by everything below (length check,
# save, shape report), so a missing file is reported here instead of
# surfacing later as a TypeError on None.
txt_test_path = os.path.join(old_folder, "X_txt_test.npy")
if not os.path.exists(txt_test_path):
    raise FileNotFoundError(f"Missing test text embeddings: {txt_test_path}")
X_txt_test = np.load(txt_test_path)

assert len(df_test_clean) == len(X_img_test) == len(X_txt_test), \
    "❌ Mismatch between test data lengths!"

# --- 6. Save combined files to new directory ---
print("\n💾 Saving final files...")

np.save(os.path.join(new_folder, "X_img_train.npy"), X_img_train)
np.save(os.path.join(new_folder, "X_txt_train.npy"), X_txt_train)
df_train_clean.to_csv(os.path.join(new_folder, "df_train_clean.csv"), index=False)

np.save(os.path.join(new_folder, "X_img_test.npy"), X_img_test)
np.save(os.path.join(new_folder, "X_txt_test.npy"), X_txt_test)
df_test_clean.to_csv(os.path.join(new_folder, "df_test_clean.csv"), index=False)

print("\n✅ Final combined files saved successfully!")

print("\n--- Verification ---")
print(f"Train embeddings: image {X_img_train.shape}, text {X_txt_train.shape}")
print(f"Test embeddings:  image {X_img_test.shape}, text {X_txt_test.shape}")
print(f"Train DF: {df_train_clean.shape}")
print(f"Test DF:  {df_test_clean.shape}")

print("\n🎉 All consolidated embeddings and clean files saved in:")
print(new_folder)


Mounted at /content/drive
📦 Loading and combining TRAIN image halves...
✅ Combined train image embeddings: (160186, 512)
✅ Combined train DataFrame: (160186, 16)

📦 Loading TRAIN text embeddings...
✅ Found single combined text embedding file.

📦 Loading TEST data...

💾 Saving final files...

✅ Final combined files saved successfully!

--- Verification ---
Train embeddings: image (160186, 512), text (160186, 512)
Test embeddings:  image (16696, 512), text (16696, 512)
Train DF: (160186, 16)
Test DF:  (16696, 16)

🎉 All consolidated embeddings and clean files saved in:
/content/drive/MyDrive/ML-Project/clip_embeddings_multimodal
