In [94]:
import torch
import numpy as np
import pandas as pd
import easyocr
from PIL import Image 
import transformers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from transformers import CLIPProcessor, CLIPModel, AutoModelForSeq2SeqLM, AutoTokenizer
from multilingual_clip import pt_multilingual_clip
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

from langdetect import detect, DetectorFactory

In [95]:
device = "cuda" if torch.cuda.is_available() else "cpu"  # Use GPU if available, otherwise fallback to CPU
print(device)

cuda


In [96]:
# Load pre-trained CLIP model and processor
model_name = "M-CLIP/XLM-Roberta-Large-Vit-B-32"
clip_text_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)
clip_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

clip_model_name = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(clip_model_name).eval()
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)



# Function definitions

In [97]:
def get_text_features(text):
    """Extracts text features using M-CLIP/XLM-Roberta-Large-Vit-B-32 with robust tokenization and truncation."""
    try:
        # Tokenize the text to check the number of tokens
        tokenized_text = clip_tokenizer(text, return_tensors="pt")
        
        # Check if the number of tokens exceeds the model's limit of 514
        if len(tokenized_text["input_ids"][0]) > 514:
            print(f"⚠️ Text exceeds 514 tokens. Truncating: {text[:100]}...")  # Log original text (first 100 chars)
            
            # Truncate the raw text to the max length by decoding the first 514 tokens and re-encoding
            truncated_tokens = tokenized_text["input_ids"][0][:513]
            truncated_text = clip_tokenizer.decode(truncated_tokens, skip_special_tokens=True)

            # Double check that the truncated text isn't empty
            if not truncated_text:
                truncated_text = text[:100]  # Just keep the first 100 characters if truncation fails
            print(f"⚠️ Text truncated to: {truncated_text[:100]}...")  # Log truncated text (first 100 chars)
        else:
            truncated_text = text  # No truncation needed, keep the original text
        
        # Pass the (possibly truncated) raw text to the model for feature extraction
        text_features = clip_text_model.forward([truncated_text], clip_tokenizer)
        
        return text_features.squeeze().detach().cpu().numpy()

    except Exception as e:
        print(f"⚠️ Error extracting text features: {e}")
        return None  # Return None for failed cases


In [5]:
# def get_text_features(text):
#     """Extracts text features using M-CLIP/XLM-Roberta-Large-Vit-B-32."""
#     try:
#         text_features = clip_text_model.forward([text], clip_tokenizer)
#         return text_features.squeeze().detach().cpu().numpy()  # Convert tensor to numpy array
#     except Exception as e:
#         print(f"⚠️ Error extracting text features: {e}")
#         return None

In [98]:
# EasyOCR for OCR
reader = easyocr.Reader(["no", "en"])  # Norwegian and English, because there's some mixing of the languages

def extract_text_easyocr(image_path):
    """Extracts text from an image using EasyOCR."""
    try:
        results = reader.readtext(image_path, detail=0)  # Extract text without coordinates
        return " ".join(results)  # Join extracted words into a single string
    except Exception as e:
        print(f"⚠️ OCR failed for {image_path}: {e}")
        return ""

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


In [99]:
# Function to extract image features
def get_image_features(image_path):
    """Extracts image embeddings from CLIP."""
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        image_features = clip_model.get_image_features(**inputs)
    return image_features.cpu().numpy()

# Feature extraction

In [134]:
# loading the data
df = pd.read_excel("D:/Data/Datasets/Poli_reco_datasets/sample_images_merged.xlsx")
# Fix the labels, 0=non humor, 1=humor
df["label"] = df["label"].apply(lambda x: 1 if x > 0 else 0)
df["image_path_image_filename_image_1"] = df["image_path_image_filename_image_1"].str.replace(r"^D:/", "E:/", regex=True)

In [136]:
# Extract text from images
import PIL
PIL.Image.ANTIALIAS = PIL.Image.LANCZOS
df["image_text"] = df["image_path_image_filename_image_1"].apply(extract_text_easyocr)



In [137]:
print((df["image_text"] == "").sum())

768


In [None]:
df_no_text = df[df["image_text"] == ""]
print(df_no_text["image_path_image_filename_image_1"].sample(n=5, random_state=42))

3251    E:/Images/Covid/images_2/ElaqQpdXIAU4K2h.png
1401    E:/Images/Covid/images_2/EiWo7-GWkAAVzPE.jpg
3066    E:/Images/Covid/images_1/EYSssVQWsAUBPHl.jpg
3355    E:/Images/Covid/images_2/EsKqTHrXYAEH64r.jpg
2131    E:/Images/Covid/images_1/E6r5ml_XoAYkwZp.jpg
Name: image_path_image_filename_image_1, dtype: object


In [138]:
DetectorFactory.seed = 0

def filter_image_text_by_language(text):
    """Filter out texts that are neither Norwegian nor English."""
    try:
        lang = detect(text)
        if lang not in ['no', 'en']:
            return "" 
        else:
            return text 
    except Exception as e:
        return ""  # Return empty string for errors

df["filtered_image_text"] = df["image_text"].apply(filter_image_text_by_language)

In [139]:
# Check how many texts were filtered out (replaced with empty string)
print(df["filtered_image_text"].apply(lambda x: x == "").sum())

1708


In [140]:
print(df[df["filtered_image_text"] == ""]["image_text"].head(20))

6                                              1o 12 20
7            S MI TTETILFELLER KLAS S ETRINN KARA NTENE
8                                                   hnn
10    ioooe -charts com Greece Antal døde opgjort pr...
13                                                     
16                                                     
17                                                (eo7n
19                                                  7al
22                 ZIONS ANI JaZZ CORONAVIRUS CHALLENGE
23                                                     
25                                                IIF9K
29    Kan BIBELFORSKNINGEN OG Kir k e n s f 0 r ky n...
32                                                     
34                                Gymleco 20 02 69 0316
36                                                IIF9K
37                                                     
38                                                IIF9K
40                                              

The superimposed text has been preprocessed some, and is now stored in the column "processed_image_text"

In [None]:
# df = df.fillna("")
# print(df["processed_image_text"].head(20))

0     Diagramtittel 300 250 200 150 100 50 2005 2006...
1                                                      
2     Resett resettno 4 m 0 Nyhetsreporter møtte abs...
3                                                      
4     1432 4G AA stiklestadno stiklestad Spelet om H...
5     janthomasofficial Puerto Vallarta Jalisco Likt...
6                                                      
7                                                      
8                                                      
9                                                      
10                                                     
11    2019 2018 2017 Sum eiendeler Sum driftsinntekt...
12    Ikke særegne regler for Oslo Astrup mener regj...
13                                                     
14    å gjøre mot grusomheten 0g dens utrettelige vå...
15    4 1128 Av Einar Tho mars 2020 kl 0927 Mannen s...
16                                                     
17                                              

In [141]:
df["image_text_features"] = df["filtered_image_text"].apply(get_text_features)

⚠️ Text exceeds 514 tokens. Truncating: ENTSO-E TYNDP 2022 System Needs Study July 2022 version for public consultation M5 nnl Additional ca...
⚠️ Text truncated to: ENTSO-E TYNDP 2022 System Needs Study July 2022 version for public consultation M5 nnl Additional ca...
⚠️ Error extracting text features: index out of range in self
⚠️ Text exceeds 514 tokens. Truncating: Jonas Stein Publisert av Jonas Stein 31 min POLITISK DRAMA ORDENTLIG TROMSØ-STIL Mens resten av det ...
⚠️ Text truncated to: Jonas Stein Publisert av Jonas Stein 31 min POLITISK DRAMA ORDENTLIG TROMSØ-STIL Mens resten av det ...
⚠️ Text exceeds 514 tokens. Truncating: Telia N 11:41 4 38 % Lukk NYTTLUKSUSMERKE TIL NORGE Skal åpne Chanel- butikk i Oslo 5 GUCo Slår opp ...
⚠️ Text truncated to: Telia N 11:41 4 38 % Lukk NYTTLUKSUSMERKE TIL NORGE Skal åpne Chanel- butikk i Oslo 5 GUCo Slår opp ...
⚠️ Error extracting text features: index out of range in self
⚠️ Text exceeds 514 tokens. Truncating: thebmj   Visual summary Co

In [142]:
# Find rows where image_text_features is NA
failed_rows = df[df["image_text_features"].isna()]

# Inspect the processed_image_text for these rows
for index, row in failed_rows.iterrows():
    print(f"Observation {index}:")
    print(f"Original Text (filtered_image_text): {row['filtered_image_text']}")
    print({row['label']})
    print("=" * 50)  # Separator for readability

Observation 153:
Original Text (filtered_image_text): ENTSO-E TYNDP 2022 System Needs Study July 2022 version for public consultation M5 nnl Additional capacity increases and flexibility assets by 2040 bring economic benefits and support security of supply By 2040,88 GW of additional cross-border capacity 20 Bn € /year for storage and 0.1 Bn €/year for peaking increases with respect to 2025 grid, 41 GW of units) The increase socio-economic welfare storage and GW of CO,-free peaking units would amounts to billion euro/year until 2040. Addressing be needed to support Europes move towards needs would also improve security of electricity supply carbon-free power system and ensure continuous as energy-not-served would be cut by 1.72 TWh/year. and cost-effective access to ectricity. With regards This in turn would increase socio-economic welfare cross-border capacity this represents 24 GW even further of additional cross-border capacity increases on In 2040 the highest needs for capacity inc

In [143]:
# Create the different feature sets
df["text_features"] = df["tweet"].apply(get_text_features)
# df["image_text_features"] = df["filtered_image_text"].apply(get_text_features)
df["image_features"] = df["image_path_image_filename_image_1"].apply(get_image_features)



In [163]:
print(df["text_features"].shape)
print(df["image_text_features"].shape)
print(df["image_features"].shape)

(3937,)
(3937,)
(3937,)


In [145]:
print(df.isnull().sum())

tweet                                   0
id                                      0
label                                   0
date                                    1
image_path_image_filename_image_1       0
image_path_image_filename_image_2    3640
image_path_image_filename_image_3    3815
image_path_image_filename_image_4    3859
processed_text                          0
processed_image_text                 2586
image_text                              0
filtered_image_text                     0
image_text_features                    32
text_features                           0
image_features                          0
dtype: int64


In [146]:
# replace the NAs in image_text_features with vectors from the rows where image_text is an empty string. 
empty_vector_example = df.loc[df["filtered_image_text"] == "", "image_text_features"].dropna().values

if len(empty_vector_example) > 0:
    empty_vector_example = empty_vector_example[0] 
    df.loc[df["image_text_features"].isna(), "image_text_features"] = df.loc[df["image_text_features"].isna(), "image_text_features"].apply(lambda _: empty_vector_example)

In [147]:
print(df["image_text_features"].head(20))

0     [0.0070521524, 0.09996615, 0.06716276, -0.1244...
1     [-0.0023984693, 0.08590933, -0.054364007, 0.12...
2     [0.26396376, -0.04853261, -0.06528268, -0.1353...
3     [-0.01885866, -0.05700396, -0.13053499, -0.168...
4     [-0.0109979035, 0.032760814, -0.24700762, -0.0...
5     [-0.017980304, 0.034670442, -0.059337832, 0.00...
6     [0.05160245, 0.02143021, -0.057980604, 0.03006...
7     [0.05160245, 0.02143021, -0.057980604, 0.03006...
8     [0.05160245, 0.02143021, -0.057980604, 0.03006...
9     [0.21786541, -0.005461399, -0.16227847, -0.005...
10    [0.05160245, 0.02143021, -0.057980604, 0.03006...
11    [0.09236577, -0.069525756, -0.15130168, 0.0079...
12    [0.022085156, -0.09810396, -0.029885922, 0.093...
13    [0.05160245, 0.02143021, -0.057980604, 0.03006...
14    [0.0006160557, 0.085205555, -0.022259243, 0.02...
15    [-0.032672875, 0.009454998, -0.06075813, 0.051...
16    [0.05160245, 0.02143021, -0.057980604, 0.03006...
17    [0.05160245, 0.02143021, -0.057980604, 0.0

In [162]:
print(df.isnull().sum())

tweet                                   0
id                                      0
label                                   0
date                                    1
image_path_image_filename_image_1       0
image_path_image_filename_image_2    3640
image_path_image_filename_image_3    3815
image_path_image_filename_image_4    3859
processed_text                          0
processed_image_text                 2586
image_text                              0
filtered_image_text                     0
image_text_features                     0
text_features                           0
image_features                          0
intra_text_euclidean                    0
intra_image_text_euclidean              0
intra_image_euclidean                   0
inter_t_i_cosine                        0
inter_t_ti_cosine                       0
inter_i_ti_cosine                       0
dtype: int64


## Intramodal as Euclidean distance

In [149]:
# Euclidean distance for intramodal incongruity:
from scipy.spatial.distance import pdist, squareform

def mean_pairwise_distance(vector):
    distances = pdist(vector.reshape(-1, 1), metric='euclidean')  # Compute all pairwise distances
    return np.mean(distances)

df["intra_text_euclidean"] = df["text_features"].apply(mean_pairwise_distance)
df["intra_image_text_euclidean"] = df["image_text_features"].apply(mean_pairwise_distance)
df["intra_image_euclidean"] = df["image_features"].apply(mean_pairwise_distance)

## Intermodal as cosine dissimilarity

In [150]:
from scipy.spatial.distance import cosine

df["inter_t_i_cosine"] = df.apply(lambda row: cosine(np.ravel(row["text_features"]), np.ravel(row["image_features"])), axis=1)
df["inter_t_ti_cosine"] = df.apply(lambda row: cosine(np.ravel(row["text_features"]), np.ravel(row["image_text_features"])), axis=1)
df["inter_i_ti_cosine"] = df.apply(lambda row: cosine(np.ravel(row["image_features"]), np.ravel(row["image_text_features"])), axis=1)

In [151]:
df.to_pickle("E:/covid_features_sample_unclean.pkl")
df.to_excel("E:/covid_features_sample_unclean.xlsx")

# Model training

In [None]:
from sklearn.preprocessing import StandardScaler

X = df[["intra_text_euclidean", "intra_image_text_euclidean", "intra_image_euclidean",
        "inter_t_i_cosine", "inter_t_ti_cosine", "inter_i_ti_cosine"]]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

## Logistic regression

In [166]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression() 
lr.fit(X_train_res, y_train_res)

y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.67      0.78       690
           1       0.20      0.56      0.29        98

    accuracy                           0.66       788
   macro avg       0.56      0.62      0.53       788
weighted avg       0.83      0.66      0.71       788



## MLP

In [155]:
model = Sequential()

model.add(Dense(128, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=l2(0.01)))  # input 
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))  # hidden 
model.add(Dropout(0.5))

model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))  # hidden 
model.add(Dropout(0.5))

# Output
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [156]:
history = model.fit(X_train_res, y_train_res, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10


[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5792 - loss: 1.6511 - val_accuracy: 0.6548 - val_loss: 0.8976
Epoch 2/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6820 - loss: 0.8240 - val_accuracy: 0.6599 - val_loss: 0.6974
Epoch 3/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6814 - loss: 0.6873 - val_accuracy: 0.6091 - val_loss: 0.6942
Epoch 4/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6737 - loss: 0.6636 - val_accuracy: 0.7310 - val_loss: 0.5890
Epoch 5/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6850 - loss: 0.6493 - val_accuracy: 0.7069 - val_loss: 0.6091
Epoch 6/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6850 - loss: 0.6465 - val_accuracy: 0.6726 - val_loss: 0.6244
Epoch 7/10
[1m173/173[0m [32m━━━━━━━

In [157]:
y_pred_mlp = (model.predict(X_test) > 0.5).astype("int32")

print(classification_report(y_test, y_pred_mlp, target_names=["Not Humor", "Humor"]))

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
              precision    recall  f1-score   support

   Not Humor       0.94      0.61      0.74       690
       Humor       0.21      0.72      0.32        98

    accuracy                           0.63       788
   macro avg       0.57      0.67      0.53       788
weighted avg       0.85      0.63      0.69       788



## Random Forest

In [167]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_res, y_train_res)

y_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88       690
           1       0.27      0.36      0.30        98

    accuracy                           0.80       788
   macro avg       0.58      0.61      0.59       788
weighted avg       0.82      0.80      0.81       788



In [131]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
}

rf_model_grid = RandomForestClassifier(random_state=42)

randomized_search = RandomizedSearchCV(rf_model_grid, param_grid, n_iter=10, cv=5, n_jobs=-1, random_state=42)
randomized_search.fit(X_train_res, y_train_res)
print("Best Parameters:", randomized_search.best_params_)

5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sirifris\.conda\envs\poli_reco\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sirifris\.conda\envs\poli_reco\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\sirifris\.conda\envs\poli_reco\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\sirifris\.conda\envs\poli_reco\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in

Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 20}


In [132]:
best_rf_model = randomized_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)

print(classification_report(y_test, y_pred_best_rf))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87       690
           1       0.21      0.29      0.24        98

    accuracy                           0.78       788
   macro avg       0.55      0.57      0.56       788
weighted avg       0.81      0.78      0.79       788



## SVM

In [168]:
from sklearn.svm import SVC
svm = SVC(kernel='rbf')
svm.fit(X_train_res, y_train_res)

y_pred_svm = svm.predict(X_test)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.92      0.73      0.82       690
           1       0.23      0.55      0.32        98

    accuracy                           0.71       788
   macro avg       0.57      0.64      0.57       788
weighted avg       0.83      0.71      0.76       788



# Polarity contrasts

In [172]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ltg/norbert3-base_sentence-sentiment", trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained("ltg/norbert3-base_sentence-sentiment", trust_remote_code=True)

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

text = "Dette var en vakker film"

output = classifier(text)

print(output)

A new version of the following files was downloaded from https://huggingface.co/ltg/norbert3-base_sentence-sentiment:
- configuration_norbert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/ltg/norbert3-base_sentence-sentiment:
- modeling_norbert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'Positive', 'score': 0.97533118724823}]


In [206]:
sentiment_tokenizer = AutoTokenizer.from_pretrained("ltg/norbert3-base_sentence-sentiment", trust_remote_code=True)
sentiment_model = AutoModelForSequenceClassification.from_pretrained("ltg/norbert3-base_sentence-sentiment", trust_remote_code=True)
classifier = pipeline("text-classification", model=sentiment_model, tokenizer=sentiment_tokenizer, device=0 if device == "cuda" else -1)


In [221]:
from scipy.stats import entropy

SENTIMENT_LABELS = ['Negative', 'Positive', 'Neutral', 'Mixed']

def get_sentiment_distribution(text):
    """Returns a dict of sentiment probabilities for all labels"""
    if not text or not text.strip():
        return None  # no text present

    raw_output = classifier(text, truncation=True, top_k=None)

    # Ensure we have scores for all labels
    full_scores = {label: 0.0 for label in SENTIMENT_LABELS}
    if isinstance(raw_output, list):
        for entry in raw_output:
            # Ensure that entry is a dictionary with the 'label' and 'score'
            if isinstance(entry, dict):
                full_scores[entry["label"]] = entry["score"]

    return full_scores

def get_sentiment_distribution_safe(text):
    """Handle the case where text might be empty or missing."""
    if not text:  # Check if the text is empty or None
        return {
            'sentiment_tweet': {'Negative': 0.0, 'Positive': 0.0, 'Neutral': 0.0, 'Mixed': 0.0},
            'entropy_tweet': 0.0,
            'intermodal_contrast': 0.0
        }
    else:
        return get_sentiment_distribution(text)

def compute_entropy(score_dict):
    """Returns entropy (sentiment variety) in bits."""
    if score_dict is None:
        return None
    scores = np.array(list(score_dict.values()))
    return entropy(scores, base=2)

def compute_sentiment_contrast(tweet_text, superimposed_text=None):
    """Returns:
    - entropy_tweet: entropy of tweet text
    - entropy_imgtxt: entropy of superimposed image text
    - intermodal_contrast: absolute difference in entropy
    """
    sent_tweet = get_sentiment_distribution(tweet_text)
    entropy_tweet = compute_entropy(sent_tweet)

    if superimposed_text:
        sent_imgtxt = get_sentiment_distribution(superimposed_text)
        entropy_imgtxt = compute_entropy(sent_imgtxt)
        intermodal_contrast = abs(entropy_tweet - entropy_imgtxt) if entropy_tweet is not None and entropy_imgtxt is not None else None
    else:
        sent_imgtxt = None
        entropy_imgtxt = None
        intermodal_contrast = None

    return {
        "sentiment_tweet": sent_tweet,
        "sentiment_imgtxt": sent_imgtxt,
        "entropy_tweet": entropy_tweet,
        "entropy_imgtxt": entropy_imgtxt,
        "intermodal_contrast": intermodal_contrast
    }

In [222]:
example = compute_sentiment_contrast(
    tweet_text="Dette var en vakker film",
    superimposed_text="Men regjeringen suger balle"
)

print(example)

{'sentiment_tweet': {'Negative': 0.0015079244039952755, 'Positive': 0.97533118724823, 'Neutral': 0.01738564670085907, 'Mixed': 0.005775235127657652}, 'sentiment_imgtxt': {'Negative': 0.8463519215583801, 'Positive': 0.04564889892935753, 'Neutral': 0.09598658978939056, 'Mixed': 0.012012475170195103}, 'entropy_tweet': 0.19386092520433587, 'entropy_imgtxt': 0.8081433066857755, 'intermodal_contrast': 0.6142823814814397}


In [223]:
example = compute_sentiment_contrast(
    tweet_text="Dette var en vakker film",
    superimposed_text=""
)

print(example)

{'sentiment_tweet': {'Negative': 0.0015079244039952755, 'Positive': 0.97533118724823, 'Neutral': 0.01738564670085907, 'Mixed': 0.005775235127657652}, 'sentiment_imgtxt': None, 'entropy_tweet': 0.19386092520433587, 'entropy_imgtxt': None, 'intermodal_contrast': None}


In [None]:
def process_df(df):
    results = []
    
    for _, row in df.iterrows():
        tweet = row['tweet']
        image_text = row['filtered_image_text']
        
        # Get sentiment and contrast values for tweet and image text
 
        intermodal_cont = compute_sentiment_contrast(tweet, image_text)
        
        # Collect the results as a dictionary
        result = {
            'tweet': tweet,
            'filtered_image_text': image_text,
            'sentiment_tweet': intermodal_cont['sentiment_tweet'],
            'sentiment_imgtxt': intermodal_cont['sentiment_imgtxt'],
            'entropy_tweet': intermodal_cont['entropy_tweet'],
            'entropy_imgtxt': intermodal_cont['entropy_imgtxt'],
            'intermodal_contrast': intermodal_cont['intermodal_contrast']
        }
        
        results.append(result)
    
    # Convert the results list into a DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

# Now apply this function on your DataFrame
df_with_sentiment = process_df(df)

# Optionally, inspect the first few rows
print(df_with_sentiment.head())

                                               tweet  \
0  USERNAME Hva vil normalen være? Ser Rt fra Hon...   
1  Nå er det påvist #Covid_19 i verdens største f...   
2  Skandale. Tenk at noen faktisk følger helsemyn...   
3  Dette er New York på den verste dagen (13.apri...   
4  Selv ikke Heilag Olav blir skånet av korona-vi...   

                                 filtered_image_text  \
0  Diagramtittel 300 250 200 150 100 50 2005 2006...   
1  íjH: +bb 09 we we 45ft 86 08"9u} 957 4 GPF FRI...   
2  Resett @resettno 4 m 0 Nyhetsreporter møtte ab...   
3  "ll N Telenor 06:56 75 % AA vg.no VG VG LIVE T...   
4  14:32 4G AA stiklestad.no stiklestad Spelet om...   

                                     sentiment_tweet  \
0  {'Negative': 0.034650158137083054, 'Positive':...   
1  {'Negative': 0.036310065537691116, 'Positive':...   
2  {'Negative': 0.5118622183799744, 'Positive': 0...   
3  {'Negative': 0.03331822529435158, 'Positive': ...   
4  {'Negative': 0.08304567635059357, 'Positive

In [None]:
df = pd.concat([df, df_with_sentiment.drop(["tweet", "filtered_image_text"], axis=1)], axis=1)

In [225]:
df.to_pickle("E:/covid_features_sample_unclean_polarity.pkl")
df.to_excel("E:/covid_features_sample_unclean_polarity.xlsx")

# Modeling

In [238]:
from sklearn.preprocessing import StandardScaler

X = df[["intra_text_euclidean", "intra_image_text_euclidean", "intra_image_euclidean",
        "inter_t_i_cosine", "inter_t_ti_cosine", "inter_i_ti_cosine",
        "entropy_tweet", "entropy_imgtxt", "intermodal_contrast"]]
y = df["label"]

In [239]:
X = X.fillna(X.median())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [240]:
print(X.head(20))

    intra_text_euclidean  intra_image_text_euclidean  intra_image_euclidean  \
0               0.230646                    0.215453               0.400644   
1               0.243858                    0.176869               0.419029   
2               0.218537                    0.232959               0.434566   
3               0.229966                    0.194388               0.401643   
4               0.260089                    0.193764               0.404472   
5               0.206413                    0.212481               0.434198   
6               0.231506                    0.136660               0.404372   
7               0.259622                    0.136660               0.398886   
8               0.236215                    0.136660               0.413554   
9               0.222984                    0.235562               0.393000   
10              0.251056                    0.136660               0.405647   
11              0.222581                    0.228194

## Logistic regression

In [241]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression() 
lr.fit(X_train_res, y_train_res)

y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.68      0.79       690
           1       0.21      0.60      0.31        98

    accuracy                           0.67       788
   macro avg       0.57      0.64      0.55       788
weighted avg       0.84      0.67      0.73       788



## MLP

In [242]:
model = Sequential()

model.add(Dense(128, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=l2(0.01)))  # input 
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))  # hidden 
model.add(Dropout(0.5))

model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))  # hidden 
model.add(Dropout(0.5))

# Output
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [243]:
history = model.fit(X_train_res, y_train_res, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5303 - loss: 1.7526 - val_accuracy: 0.6980 - val_loss: 0.9142
Epoch 2/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6726 - loss: 0.8609 - val_accuracy: 0.7030 - val_loss: 0.6414
Epoch 3/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6977 - loss: 0.6885 - val_accuracy: 0.6764 - val_loss: 0.6294
Epoch 4/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7096 - loss: 0.6465 - val_accuracy: 0.6739 - val_loss: 0.6341
Epoch 5/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7140 - loss: 0.6336 - val_accuracy: 0.6992 - val_loss: 0.5969
Epoch 6/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7161 - loss: 0.6225 - val_accuracy: 0.6904 - val_loss: 0.5998
Epoch 7/10
[1m173/173[0m 

In [244]:
y_pred_mlp = (model.predict(X_test) > 0.5).astype("int32")

print(classification_report(y_test, y_pred_mlp, target_names=["Not Humor", "Humor"]))

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
              precision    recall  f1-score   support

   Not Humor       0.95      0.63      0.76       690
       Humor       0.23      0.78      0.35        98

    accuracy                           0.65       788
   macro avg       0.59      0.70      0.56       788
weighted avg       0.86      0.65      0.71       788



## Random Forest

In [245]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_res, y_train_res)

y_pred_rf = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87       690
           1       0.20      0.26      0.22        98

    accuracy                           0.78       788
   macro avg       0.54      0.56      0.55       788
weighted avg       0.80      0.78      0.79       788



## SVM

In [246]:
from sklearn.svm import SVC
svm = SVC(kernel='rbf')
svm.fit(X_train_res, y_train_res)

y_pred_svm = svm.predict(X_test)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.92      0.72      0.81       690
           1       0.22      0.54      0.31        98

    accuracy                           0.70       788
   macro avg       0.57      0.63      0.56       788
weighted avg       0.83      0.70      0.75       788



# Labeling 

Extracting the features for the entire data, to be used for the labeling

In [57]:
# loading the data
df = pd.read_excel("D:/Data/Datasets/Poli_reco_datasets/image_text_ano_nort.xlsx")

df["image_path_image_filename_image_1"] = df["image_path_image_filename_image_1"].str.replace(r"^D:/", "E:/", regex=True)

## superimposed text extraction

In [None]:
# Extract text from images
import PIL
PIL.Image.ANTIALIAS = PIL.Image.LANCZOS
df["image_text"] = df["image_path_image_filename_image_1"].apply(extract_text_easyocr)



In [70]:
DetectorFactory.seed = 0

def filter_image_text_by_language(text):
    """Filter out texts that are neither Norwegian nor English."""
    try:
        lang = detect(text)
        if lang not in ['no', 'en']:
            return "" 
        else:
            return text 
    except Exception as e:
        return ""  # Return empty string for errors

df["filtered_image_text"] = df["image_text"].apply(filter_image_text_by_language)

In [71]:
# Check how many texts contain an empty string
print(df["filtered_image_text"].apply(lambda x: x == "").sum())

11639


In [72]:
print(df[df["filtered_image_text"] == ""]["image_text"].head(50))

0     MATT LEACOCK PANDEMIC Ericy KAN DU REDDE VERDE...
1                                                 IIF9K
4               'Extra Corona Corona Corona Exura Exlra
7                                                 IIF9K
8                                        LERIEXSKESKIIS
10                                                IIF9K
11                                                     
12                                                IIF9K
13                                                IIF9K
14                        se för Coronaviruset! upp 1 !
15    Conseil des ministres Fans Iendiredı Janvicr 2...
16                                                IIF9K
17                                                IIF9K
18                                              ALVIV F
22                                                IIF9K
24                                                     
25                                               SvS |d
28                                              

## Features

In [None]:
df = df.fillna("")

In [None]:
# Create the different feature sets
df["text_features"] = df["processed_tweet"].apply(get_text_features)
df["image_text_features"] = df["processed_image_text"].apply(get_text_features)
df["image_features"] = df["image_path_image_filename_image_1"].apply(get_image_features)

Token indices sequence length is longer than the specified maximum sequence length for this model (713 > 512). Running this sequence through the model will result in indexing errors


⚠️ Text exceeds 514 tokens. Truncating: Telia N 11:41 4 38 % Lukk NYTTLUKSUSMERKE TIL NORGE Skal åpne Chanel- butikk i Oslo 5 GUCo Slår opp ...
⚠️ Text truncated to: Telia N 11:41 4 38 % Lukk NYTTLUKSUSMERKE TIL NORGE Skal åpne Chanel- butikk i Oslo 5 GUCo Slår opp ...
⚠️ Error extracting text features: index out of range in self
⚠️ Text exceeds 514 tokens. Truncating: an lorrioc uko u roll chclsc quuai Sllie donnc lorklarende rPortasj 7 Tallcncna Karvlscr qnrao folge...
⚠️ Text truncated to: an lorrioc uko u roll chclsc quuai Sllie donnc lorklarende rPortasj 7 Tallcncna Karvlscr qnrao folge...
⚠️ Text exceeds 514 tokens. Truncating: Coronavirus Disease 2019 and Influenza Worldwide there is great concern about the emerging epidemic ...
⚠️ Text truncated to: Coronavirus Disease 2019 and Influenza Worldwide there is great concern about the emerging epidemic ...
⚠️ Error extracting text features: index out of range in self
⚠️ Text exceeds 514 tokens. Truncating: Datteren min kommer gråten



In [77]:
print(df["text_features"].shape)
print(df["image_text_features"].shape)
print(df["image_features"].shape)

(27210,)
(27210,)
(27210,)


In [78]:
print(df.isnull().sum())

tweet                                    0
id                                       0
author_hash                              0
in_reply_to_user_hash                    0
conversation_id                          0
created_at                               0
retweet_count                            0
reply_count                              0
like_count                               0
quote_count                              0
impression_count                         0
url_1                                    0
url_2                                13383
url_3                                22738
url_4                                26507
url_5                                27020
url_6                                27195
url_7                                27209
url_8                                27210
date                                     0
unnest_referenced_tweets_id          27210
label                                    0
date_num                                 0
image_exist

In [79]:
# replace the NAs in image_text_features with vectors from the rows where image_text is an empty string. 
empty_vector_example = df.loc[df["image_text"] == "", "image_text_features"].dropna().values

if len(empty_vector_example) > 0:
    empty_vector_example = empty_vector_example[0] 
    df.loc[df["image_text_features"].isna(), "image_text_features"] = df.loc[df["image_text_features"].isna(), "image_text_features"].apply(lambda _: empty_vector_example)

In [80]:
print(df.isnull().sum())

tweet                                    0
id                                       0
author_hash                              0
in_reply_to_user_hash                    0
conversation_id                          0
created_at                               0
retweet_count                            0
reply_count                              0
like_count                               0
quote_count                              0
impression_count                         0
url_1                                    0
url_2                                13383
url_3                                22738
url_4                                26507
url_5                                27020
url_6                                27195
url_7                                27209
url_8                                27210
date                                     0
unnest_referenced_tweets_id          27210
label                                    0
date_num                                 0
image_exist

In [81]:
# Euclidean distance for intramodal incongruity:
from scipy.spatial.distance import pdist, squareform

def mean_pairwise_distance(vector):
    distances = pdist(vector.reshape(-1, 1), metric='euclidean')  # Compute all pairwise distances
    return np.mean(distances)

df["intra_text_euclidean"] = df["text_features"].apply(mean_pairwise_distance)
df["intra_image_text_euclidean"] = df["image_text_features"].apply(mean_pairwise_distance)
df["intra_image_euclidean"] = df["image_features"].apply(mean_pairwise_distance)

In [82]:
# cosine dissimilarity for intermodal incongruity
from scipy.spatial.distance import cosine

df["inter_t_i_cosine"] = df.apply(lambda row: cosine(np.ravel(row["text_features"]), np.ravel(row["image_features"])), axis=1)
df["inter_t_ti_cosine"] = df.apply(lambda row: cosine(np.ravel(row["text_features"]), np.ravel(row["image_text_features"])), axis=1)
df["inter_i_ti_cosine"] = df.apply(lambda row: cosine(np.ravel(row["image_features"]), np.ravel(row["image_text_features"])), axis=1)

In [None]:
df.to_excel("E:/covid_features_all.xlsx")
df.to_pickle("E:/covid_features_all.pkl")

## polarity

In [251]:
df_test = pd.read_excel("E:/covid_features_all.xlsx")
df_test = df_test.fillna("")

In [252]:
print(df_test["filtered_image_text"].head(10))


0                                                     
1                                                     
2    Dagbladet LOGG INN = Kan bli epidemi: Høy døde...
3    Telia N 20:12 4 20 % dagbladet.no Dagbladet LO...
4                                                     
5    AKKURAT NÅ AKKURAT NÅ AKKURAT NÅ AKKURAT NÅ AK...
6    SSPX S8P 500 Large Cap Index IHDX @Stocl Chant...
7                                                     
8                                                     
9    LowestPricIver Offered  NOW LovestIerms Ever Q...
Name: filtered_image_text, dtype: object


In [253]:
def process_df(df_test):
    results = []
    
    for _, row in df_test.iterrows():
        tweet = row['tweet']
        image_text = row['filtered_image_text']
        
        # Get sentiment and contrast values for tweet and image text
 
        intermodal_cont = compute_sentiment_contrast(tweet, image_text)
        
        # Collect the results as a dictionary
        result = {
            'tweet': tweet,
            'filtered_image_text': image_text,
            'sentiment_tweet': intermodal_cont['sentiment_tweet'],
            'sentiment_imgtxt': intermodal_cont['sentiment_imgtxt'],
            'entropy_tweet': intermodal_cont['entropy_tweet'],
            'entropy_imgtxt': intermodal_cont['entropy_imgtxt'],
            'intermodal_contrast': intermodal_cont['intermodal_contrast']
        }
        
        results.append(result)
    
    # Convert the results list into a DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

# Now apply this function on your DataFrame
df_with_sentiment = process_df(df_test)

# Optionally, inspect the first few rows
print(df_with_sentiment.head())

                                               tweet  \
0  Artig brettspill, men vi klarte ikke oppdraget...   
1  Seks døde av nytt corona-virus https://t.co/Y8...   
2  Hvorfor ikke kalle en plastikk-skje for veihøv...   
3  #dagbladet 1-2000 mennesker dør hvert år av in...   
4  USERNAME Godt jeg har tatt influensavaksine og...   

                                 filtered_image_text  \
0                                                      
1                                                      
2  Dagbladet LOGG INN = Kan bli epidemi: Høy døde...   
3  Telia N 20:12 4 20 % dagbladet.no Dagbladet LO...   
4                                                      

                                     sentiment_tweet  \
0  {'Negative': 0.07125683128833771, 'Positive': ...   
1  {'Negative': 0.03703628107905388, 'Positive': ...   
2  {'Negative': 0.15115077793598175, 'Positive': ...   
3  {'Negative': 0.08712852746248245, 'Positive': ...   
4  {'Negative': 0.03913278505206108, 'Positive

In [254]:
df_test = pd.concat([df_test, df_with_sentiment.drop(["tweet", "filtered_image_text"], axis=1)], axis=1)

In [259]:
print(df_test["entropy_imgtxt"].head(10))

0         NaN
1         NaN
2    0.264055
3    0.241080
4         NaN
5    0.294156
6    0.420579
7         NaN
8         NaN
9    0.377545
Name: entropy_imgtxt, dtype: float64


In [267]:
df_test.to_pickle(r"E:/covid_features_all_unclean_polarity.pkl")

In [262]:
df_test.to_excel("E:/covid_features_all_unclean_polarity.xlsx")

Exception ignored in: <function ZipFile.__del__ at 0x00000223B8D29B40>
Traceback (most recent call last):
  File "c:\Users\sirifris\.conda\envs\poli_reco\lib\zipfile.py", line 1834, in __del__
    self.close()
  File "c:\Users\sirifris\.conda\envs\poli_reco\lib\zipfile.py", line 1851, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file


In [268]:
df_check = pd.read_pickle("E:/covid_features_all_unclean_polarity.pkl")

In [269]:
print(df_check.head())

   Unnamed: 0                                              tweet  \
0           0  Artig brettspill, men vi klarte ikke oppdraget...   
1           1  Seks døde av nytt corona-virus https://t.co/Y8...   
2           2  Hvorfor ikke kalle en plastikk-skje for veihøv...   
3           3  #dagbladet 1-2000 mennesker dør hvert år av in...   
4           4  USERNAME Godt jeg har tatt influensavaksine og...   

                    id                       author_hash  \
0  1216127779047165952  f6db774f61dcecce8585975b4454d23e   
1  1219560793491919104  fd43a7278609c825548b319b3339a0de   
2  1220032938936499968  3538b77d93871bfe5fc8d213e40d6ed1   
3  1220063326954234112  091d9c6c3525813589c293f470be6ffd   
4  1220204296387644928  ce6744a309216d3a464fbee031d22057   

              in_reply_to_user_hash      conversation_id  \
0  55611e71b358a30158c61810ad802435  1216127779047165952   
1  55611e71b358a30158c61810ad802435  1219560793491919104   
2  55611e71b358a30158c61810ad802435  1220032938936

In [265]:
print(df_test.head())

   Unnamed: 0                                              tweet  \
0           0  Artig brettspill, men vi klarte ikke oppdraget...   
1           1  Seks døde av nytt corona-virus https://t.co/Y8...   
2           2  Hvorfor ikke kalle en plastikk-skje for veihøv...   
3           3  #dagbladet 1-2000 mennesker dør hvert år av in...   
4           4  USERNAME Godt jeg har tatt influensavaksine og...   

                    id                       author_hash  \
0  1216127779047165952  f6db774f61dcecce8585975b4454d23e   
1  1219560793491919104  fd43a7278609c825548b319b3339a0de   
2  1220032938936499968  3538b77d93871bfe5fc8d213e40d6ed1   
3  1220063326954234112  091d9c6c3525813589c293f470be6ffd   
4  1220204296387644928  ce6744a309216d3a464fbee031d22057   

              in_reply_to_user_hash      conversation_id  \
0  55611e71b358a30158c61810ad802435  1216127779047165952   
1  55611e71b358a30158c61810ad802435  1219560793491919104   
2  55611e71b358a30158c61810ad802435  1220032938936