# Topic Detection

`image_folder` is the path to the folder that contains the images.

In [1]:
# Path to the folder that holds the GossipCop images.
# NOTE(review): image_folder is reassigned to 'gossipcop_images' in the
# data-loading cells further down; confirm which path is the intended one.
image_folder = 'untitled folder/gossipcop_images'

In [None]:
!pip install Pillow
!pip install torch
!pip install torchvision

from PIL import Image
import os
import json
from torch.utils.data import Dataset, DataLoader  # Import DataLoader
import torchvision.transforms as transforms

The `CustomDataset` class provides several methods, including `load_data`, which reads the `tweets.json` file found in each article folder of the GossipCop dataset.

In [None]:
class CustomDataset(Dataset):
    """Dataset pairing the tweet JSON of each article folder with its image.

    Every sub-folder of ``root_dir`` is expected to hold a ``tweets.json``
    file. The image belonging to a folder is looked up in ``image_folder``
    as ``<suffix>.jpg``, where ``<suffix>`` is the part of the folder name
    after its last ``-``. Only folders whose JSON can be parsed and whose
    image exists and passes PIL verification are kept.

    Args:
        root_dir: Directory containing one sub-folder per article.
        image_folder: Directory containing the ``.jpg`` images.
        transform: Optional callable applied to the RGB image in
            ``__getitem__``.
        max_folders: Maximum number of entries of ``root_dir`` to scan
            (``None`` scans all of them). Defaults to 5000.
    """

    def __init__(self, root_dir, image_folder, transform=None, max_folders=5000):
        self.root_dir = root_dir
        self.image_folder = image_folder
        self.transform = transform
        self.max_folders = max_folders
        self.data = self.load_data()

    def load_data(self):
        """Scan ``root_dir`` and return the list of usable samples.

        Returns:
            A list of dicts with keys ``'text'`` (the parsed content of
            ``tweets.json``) and ``'image_path'`` (path of the matching image).
        """
        data = []

        # os.listdir returns entries in arbitrary order; sort so that the
        # subset selected by max_folders is the same on every run.
        folder_names = sorted(os.listdir(self.root_dir))[:self.max_folders]

        for folder_name in folder_names:
            json_path = os.path.join(self.root_dir, folder_name, 'tweets.json')

            if not os.path.exists(json_path):
                print(f"JSON file not found: {json_path}")
                continue

            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    tweets_data_list = json.load(f)
            except json.JSONDecodeError as json_error:
                print(f"Error decoding JSON in file {json_path}: {json_error}")
                continue
            except (UnicodeDecodeError, OSError) as read_error:
                # A single unreadable file must not abort the whole scan.
                print(f"Error reading file {json_path}: {read_error}")
                continue

            # The image is named after the last '-'-separated part of the folder name.
            tweet_number = folder_name.split('-')[-1]
            image_path = os.path.join(self.image_folder, f'{tweet_number}.jpg')

            if os.path.exists(image_path) and self.is_valid_image(image_path):
                data.append({'text': tweets_data_list, 'image_path': image_path})

        return data

    def is_valid_image(self, file_path):
        """Return True if PIL can open and verify the file, False otherwise."""
        try:
            # Try to open the image file with PIL; the context manager makes
            # sure the underlying file handle is closed again.
            with Image.open(file_path) as img:
                img.verify()
            return True
        except Exception as e:
            print(f"Invalid image file: {file_path}. Error: {e}")
            return False

    def __len__(self):
        """Return the number of usable samples found by ``load_data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at ``idx``.

        Returns:
            A dict with keys ``'text'`` and ``'image'`` (the RGB image, passed
            through ``transform`` when one was given), or ``None`` when the
            image cannot be opened. Callers must handle the ``None`` case.

        Raises:
            IndexError: If ``idx`` is out of range.
        """
        sample = self.data[idx]
        text = sample['text']
        image_path = sample['image_path']

        try:
            # convert() returns a new, fully loaded image, so the source file
            # can be closed as soon as the conversion is done.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert('RGB')

            if self.transform:
                image = self.transform(image)

            return {'text': text, 'image': image}

        except OSError as e:
            print(f"Error opening image file {image_path}: {e}")
            return None

The real-news data (`gossipcop_real`) is loaded.

In [None]:

# Locations of the real-news article folders and of the images.
root_folder = 'gossipcop_real'
image_folder = 'gossipcop_images'

# Resize every image to 224x224 and convert it to a tensor.
transform = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor()]
)

# Build the dataset of real articles and a shuffling loader that yields one sample per batch.
dataset = CustomDataset(
    root_dir=root_folder,
    image_folder=image_folder,
    transform=transform,
)
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)

The fake-news data (`gossipcop_fake`) is loaded.

In [None]:
# Example usage: build the dataset and loader for the fake-news articles.
root_folder = 'gossipcop_fake'
image_folder = 'gossipcop_images'

# Resize every image to 224x224 and convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

dataset2 = CustomDataset(root_folder, image_folder, transform=transform)
# The loader must wrap dataset2 (the fake data); wrapping `dataset` would
# silently iterate the real-news samples a second time.
dataloader2 = DataLoader(dataset2, batch_size=1, shuffle=True)

In [None]:
import numpy as np

# Collect, for every real-news sample, the text of its first tweet and its image.
texts = []
images = []

for batch in dataset:
    # CustomDataset.__getitem__ returns None when an image cannot be opened;
    # skip such samples so texts and images stay aligned.
    if batch is None:
        continue

    text = batch['text']
    image = batch['image']

    # Use the first tweet's text, or an empty string when there are no tweets.
    if 'tweets' in text and text['tweets']:
        texts.append(text['tweets'][0]['text'])
    else:
        texts.append("")

    images.append(image)


texts = np.array(texts)
images = np.array(images)

In [None]:
# Collect, for every fake-news sample, the text of its first tweet and its image.
texts2 = []
images2 = []
for batch2 in dataset2:
    # CustomDataset.__getitem__ returns None when an image cannot be opened;
    # skip such samples so texts2 and images2 stay aligned.
    if batch2 is None:
        continue

    text2 = batch2['text']
    image2 = batch2['image']

    # Use the first tweet's text, or an empty string when there are no tweets.
    if 'tweets' in text2 and text2['tweets']:
        texts2.append(text2['tweets'][0]['text'])
    else:
        texts2.append("")

    images2.append(image2)


texts2 = np.array(texts2)
images2 = np.array(images2)

In [None]:
# Stack the real-news images followed by the fake-news images along the first axis.
ima = np.concatenate((images, images2))
print(ima.shape[0])

In [None]:
# Join the real-news texts followed by the fake-news texts, in the same order as the images.
tex = np.concatenate((texts, texts2))

print(tex.shape[0])

In [None]:
import numpy as np
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

def truncate_texts(texts, max_length=70):
    """Cut every text down to at most ``max_length`` tokens.

    Each text is split with ``word_tokenize``, the first ``max_length``
    tokens are kept and joined back together with single spaces.

    Args:
        texts: Iterable of strings.
        max_length: Maximum number of tokens kept per text.

    Returns:
        A NumPy array holding the truncated texts, in input order.
    """
    shortened = [
        " ".join(word_tokenize(document)[:max_length]) for document in texts
    ]
    return np.array(shortened)


# Truncate every combined text to the default limit of 70 tokens.
truncated_texts = truncate_texts(tex)

# Show the truncated texts for a quick visual check.
print(truncated_texts)

In [None]:
# From here on, `tex` refers to the truncated texts.
tex=truncated_texts

In [None]:
from bertopic import BERTopic
from bertopic.representation import VisualRepresentation

# Extra, image-based way of representing a topic.
visual_model = VisualRepresentation()

# BERTopic receives the representation models as a dictionary keyed by aspect name.
representation_model = {"Visual_Aspect": visual_model}

# NOTE(review): `topic_model` is rebound by the BERTopic(...) built in a later
# cell before any fit call appears in this notebook, so this instance looks unused.
topic_model = BERTopic(
    verbose=True,
    representation_model=representation_model,
)

Embeddings of the images and the texts are generated in a shared embedding space.

In [None]:
from bertopic.backend import MultiModalBackend
# Multimodal embedding backend created from the 'clip-ViT-B-32' model name, with a batch size of 32.
model = MultiModalBackend('clip-ViT-B-32', batch_size=32)

# Embed both images and documents, then average them
# NOTE(review): `ima` is a NumPy array built from the transformed image tensors;
# confirm against the BERTopic documentation that `embed` accepts this image input type.
doc_image_embeddings = model.embed(tex, ima)

In [None]:
from bertopic import BERTopic
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
# Bag-of-words vectorizer configured with the English stop-word list.
vectorizer_model = CountVectorizer(stop_words="english")

# Maximal Marginal Relevance representation (diversity 0.3, 15 words).
repr_model = MaximalMarginalRelevance(diversity=0.3, top_n_words=15)

topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    representation_model=repr_model,
    verbose=True,
    min_topic_size=50,
    top_n_words=15,
)

# Fit on the truncated texts, passing the precomputed text+image embeddings.
topics, probs = topic_model.fit_transform(tex, doc_image_embeddings)

# NOTE(review): `captions` is not defined anywhere in the visible notebook;
# confirm where it is created before this assignment runs.
captions["Topic"] = topics

In [None]:
# Show the distinct values of the "Name" column.
# NOTE(review): `df_docs` is not defined anywhere in the visible notebook — confirm its origin.
df_docs["Name"].unique()

In [None]:
# Save the fitted topic model without its embedding model.
# NOTE(review): `name` (the save target) is not defined anywhere in the visible notebook — confirm its origin.
topic_model.save(name, save_embedding_model=False)