In [1]:
import torch
import numpy as np
import pandas as pd
import easyocr
from PIL import Image 
import transformers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from transformers import CLIPProcessor, CLIPModel, AutoModelForSeq2SeqLM, AutoTokenizer
from multilingual_clip import pt_multilingual_clip
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

from langdetect import detect, DetectorFactory

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"  # Use GPU if available, otherwise fallback to CPU
print(device)

cuda


In [3]:
# Load pre-trained CLIP model and processor
model_name = "M-CLIP/XLM-Roberta-Large-Vit-B-32"
clip_text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)
clip_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

clip_model_name = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(clip_model_name).eval()
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)



# Function definitions

In [4]:
def get_text_features(text):
    """Extracts text features using M-CLIP/XLM-Roberta-Large-Vit-B-32 with robust tokenization and truncation."""
    try:
        # Tokenize the text to check the number of tokens
        tokenized_text = clip_tokenizer(text, return_tensors="pt")
        
        # Check if the number of tokens exceeds the model's limit of 514
        if len(tokenized_text["input_ids"][0]) > 514:
            print(f"⚠️ Text exceeds 514 tokens. Truncating: {text[:100]}...")  # Log original text (first 100 chars)
            
            # Truncate the raw text to the max length by decoding the first 514 tokens and re-encoding
            truncated_tokens = tokenized_text["input_ids"][0][:513]
            truncated_text = clip_tokenizer.decode(truncated_tokens, skip_special_tokens=True)

            # Double check that the truncated text isn't empty
            if not truncated_text:
                truncated_text = text[:100]  # Just keep the first 100 characters if truncation fails
            print(f"⚠️ Text truncated to: {truncated_text[:100]}...")  # Log truncated text (first 100 chars)
        else:
            truncated_text = text  # No truncation needed, keep the original text
        
        # Pass the (possibly truncated) raw text to the model for feature extraction
        text_features = clip_text_model.forward([truncated_text], clip_tokenizer)
        
        return text_features.squeeze().detach().cpu().numpy()

    except Exception as e:
        print(f"⚠️ Error extracting text features: {e}")
        return None  # Return None for failed cases


In [5]:
# def get_text_features(text):
#     """Extracts text features using M-CLIP/XLM-Roberta-Large-Vit-B-32."""
#     try:
#         text_features = clip_text_model.forward([text], clip_tokenizer)
#         return text_features.squeeze().detach().cpu().numpy()  # Convert tensor to numpy array
#     except Exception as e:
#         print(f"⚠️ Error extracting text features: {e}")
#         return None

In [6]:
# EasyOCR for OCR
reader = easyocr.Reader(["no", "en"])  # Norwegian and English, because there's some mixing of the languages

def extract_text_easyocr(image_path):
    """Extracts text from an image using EasyOCR."""
    try:
        results = reader.readtext(image_path, detail=0)  # Extract text without coordinates
        return " ".join(results)  # Join extracted words into a single string
    except Exception as e:
        print(f"⚠️ OCR failed for {image_path}: {e}")
        return ""

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


In [7]:
# Function to extract image features
def get_image_features(image_path):
    """Extracts image embeddings from CLIP."""
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        image_features = clip_model.get_image_features(**inputs)
    return image_features.cpu().numpy()

# Feature extraction

In [8]:
# loading the data
df = pd.read_excel("E:/sample_images_merged.xlsx")
# Fix the labels, 0=non humor, 1=humor
df["label"] = df["label"].apply(lambda x: 1 if x > 0 else 0)
df["image_path_image_filename_image_1"] = df["image_path_image_filename_image_1"].str.replace(r"^D:/", "E:/", regex=True)

In [9]:
# Extract text from images
import PIL
PIL.Image.ANTIALIAS = PIL.Image.LANCZOS
df["image_text"] = df["image_path_image_filename_image_1"].apply(extract_text_easyocr)



In [10]:
print((df["image_text"] == "").sum())

768


In [11]:
df_no_text = df[df["image_text"] == ""]

# Select a few random examples (change `n` as needed)
print(df_no_text["image_path_image_filename_image_1"].sample(n=5, random_state=42))

3251    E:/Images/Covid/images_2/ElaqQpdXIAU4K2h.png
1401    E:/Images/Covid/images_2/EiWo7-GWkAAVzPE.jpg
3066    E:/Images/Covid/images_1/EYSssVQWsAUBPHl.jpg
3355    E:/Images/Covid/images_2/EsKqTHrXYAEH64r.jpg
2131    E:/Images/Covid/images_1/E6r5ml_XoAYkwZp.jpg
Name: image_path_image_filename_image_1, dtype: object


In [12]:
DetectorFactory.seed = 0

def filter_image_text_by_language(text):
    """Filter out texts that are neither Norwegian nor English."""
    try:
        # Detect the language of the text
        lang = detect(text)
        # Only keep Norwegian (no) or English (en) texts
        if lang not in ['no', 'en']:
            return ""  # Return empty string if text is not in Norwegian or English
        else:
            return text  # Return original text if it's Norwegian or English
    except Exception as e:
        return ""  # Return empty string for errors

df["filtered_image_text"] = df["image_text"].apply(filter_image_text_by_language)

In [13]:
# Check how many texts were filtered out (replaced with empty string)
print(df["filtered_image_text"].apply(lambda x: x == "").sum())

1708


In [16]:
print(df[df["filtered_image_text"] == ""]["image_text"].head(20))

6                                              1o 12 20
7            S MI TTETILFELLER KLAS S ETRINN KARA NTENE
8                                                   hnn
10    ioooe -charts com Greece Antal døde opgjort pr...
13                                                     
16                                                     
17                                                (eo7n
19                                                  7al
22                 ZIONS ANI JaZZ CORONAVIRUS CHALLENGE
23                                                     
25                                                IIF9K
29    Kan BIBELFORSKNINGEN OG Kir k e n s f 0 r ky n...
32                                                     
34                                Gymleco 20 02 69 0316
36                                                IIF9K
37                                                     
38                                                IIF9K
40                                              

In [17]:
df["image_text_features"] = df["filtered_image_text"].apply(get_text_features)

Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors


⚠️ Text exceeds 514 tokens. Truncating: ENTSO-E TYNDP 2022 System Needs Study July 2022 version for public consultation M5 nnl Additional ca...
⚠️ Text truncated to: ENTSO-E TYNDP 2022 System Needs Study July 2022 version for public consultation M5 nnl Additional ca...
⚠️ Error extracting text features: index out of range in self
⚠️ Text exceeds 514 tokens. Truncating: Jonas Stein Publisert av Jonas Stein 31 min POLITISK DRAMA ORDENTLIG TROMSØ-STIL Mens resten av det ...
⚠️ Text truncated to: Jonas Stein Publisert av Jonas Stein 31 min POLITISK DRAMA ORDENTLIG TROMSØ-STIL Mens resten av det ...
⚠️ Text exceeds 514 tokens. Truncating: Telia N 11:41 4 38 % Lukk NYTTLUKSUSMERKE TIL NORGE Skal åpne Chanel- butikk i Oslo 5 GUCo Slår opp ...
⚠️ Text truncated to: Telia N 11:41 4 38 % Lukk NYTTLUKSUSMERKE TIL NORGE Skal åpne Chanel- butikk i Oslo 5 GUCo Slår opp ...
⚠️ Error extracting text features: index out of range in self
⚠️ Text exceeds 514 tokens. Truncating: thebmj   Visual summary Co

In [21]:
# Find rows where image_text_features is NA
failed_rows = df[df["image_text_features"].isna()]

# Inspect the filtered_image_text for these rows
for index, row in failed_rows.iterrows():
    print(f"Observation {index}:")
    print(f"Original Text (filtered_image_text): {row['filtered_image_text']}")
    print({row['label']})
    print("=" * 50)  # Separator for readability

Observation 153:
Original Text (filtered_image_text): ENTSO-E TYNDP 2022 System Needs Study July 2022 version for public consultation M5 nnl Additional capacity increases and flexibility assets by 2040 bring economic benefits and support security of supply By 2040,88 GW of additional cross-border capacity 20 Bn € /year for storage and 0.1 Bn €/year for peaking increases with respect to 2025 grid, 41 GW of units) The increase socio-economic welfare storage and GW of CO,-free peaking units would amounts to billion euro/year until 2040. Addressing be needed to support Europes move towards needs would also improve security of electricity supply carbon-free power system and ensure continuous as energy-not-served would be cut by 1.72 TWh/year. and cost-effective access to ectricity. With regards This in turn would increase socio-economic welfare cross-border capacity this represents 24 GW even further of additional cross-border capacity increases on In 2040 the highest needs for capacity inc

In [22]:
# Create the different feature sets
df["text_features"] = df["tweet"].apply(get_text_features)
# df["image_text_features"] = df["filtered_image_text"].apply(get_text_features)
df["image_features"] = df["image_path_image_filename_image_1"].apply(get_image_features)



In [23]:
print(df["text_features"].shape)
print(df["image_text_features"].shape)
print(df["image_features"].shape)

(3937,)
(3937,)
(3937,)


In [24]:
print(df.isnull().sum())

tweet                                   0
label                                   0
id                                      0
date                                    1
image_path_image_filename_image_1       0
image_path_image_filename_image_2    3640
image_path_image_filename_image_3    3815
image_path_image_filename_image_4    3859
image_text                              0
filtered_image_text                     0
image_text_features                    32
text_features                           0
image_features                          0
dtype: int64


In [84]:
# Find a representative feature vector from rows where "image_text" is an empty string
empty_vector_example = df.loc[df["image_text"] == "", "image_text_features"].dropna().values

if len(empty_vector_example) > 0:
    empty_vector_example = empty_vector_example[0]  # Take the first available vector
    # Assign the vector to all NAs in "image_text_features"
    df.loc[df["image_text_features"].isna(), "image_text_features"] = df.loc[df["image_text_features"].isna(), "image_text_features"].apply(lambda _: empty_vector_example)

In [25]:
print(df["image_text_features"].head(10))

0    [0.0070521524, 0.09996615, 0.06716276, -0.1244...
1    [-0.0023984693, 0.08590933, -0.054364007, 0.12...
2    [0.26396376, -0.04853261, -0.06528268, -0.1353...
3    [-0.01885866, -0.05700396, -0.13053499, -0.168...
4    [-0.0109979035, 0.032760814, -0.24700762, -0.0...
5    [-0.017980304, 0.034670442, -0.059337832, 0.00...
6    [0.05160245, 0.02143021, -0.057980604, 0.03006...
7    [0.05160245, 0.02143021, -0.057980604, 0.03006...
8    [0.05160245, 0.02143021, -0.057980604, 0.03006...
9    [0.21786541, -0.005461399, -0.16227847, -0.005...
Name: image_text_features, dtype: object


In [86]:
print(df.isnull().sum())

tweet                                   0
label                                   0
id                                      0
date                                    0
image_path_image_filename_image_1       0
image_path_image_filename_image_2    2018
image_path_image_filename_image_3    2108
image_path_image_filename_image_4    2132
image_text                              0
text_features                           0
image_text_features                     0
image_features                          0
intra_text_euclidean                    0
filtered_image_text                     0
dtype: int64


In [26]:
df.to_pickle("E:/covid_features_v2.pkl")
# df = pd.read_pickle("E:/covid_features.pkl")

## Intramodal as Euclidean distance

In [18]:
# Euclidean distance for intramodal incongruity:
from scipy.spatial.distance import pdist, squareform

def mean_pairwise_distance(vector):
    distances = pdist(vector.reshape(-1, 1), metric='euclidean')  # Compute all pairwise distances
    return np.mean(distances)

df["intra_text_euclidean"] = df["text_features"].apply(mean_pairwise_distance)
df["intra_image_text_euclidean"] = df["image_text_features"].apply(mean_pairwise_distance)
df["intra_image_euclidean"] = df["image_features"].apply(mean_pairwise_distance)

AttributeError: 'NoneType' object has no attribute 'reshape'

## Intermodal as cosine dissimilarity

In [None]:
from scipy.spatial.distance import cosine

def inter_modal_cosine_dissimilarity(first_modality, second_modality):
    return 1 - cosine(first_modality, second_modality)

df["inter_t_i_cosine"] = df.apply(lambda row: inter_modal_cosine_dissimilarity(row["text_features"], row["image_features"]), axis=1)
df["inter_t_ti_cosine"] = df.apply(lambda row: inter_modal_cosine_dissimilarity(row["text_features"], row["image_text_features"]), axis=1)
df["inter_i_it_cosine"] = df.apply(lambda row: inter_modal_cosine_dissimilarity(row["image_features"], row["image_text_features"]), axis=1)