## Read and Split Data

In [1]:
# Read data here
# Extract zip file if folder does not already exist
import json
import os
import shutil
import zipfile

# Folder the dataset is extracted into; the archive sits next to it under the
# same name with a '.zip' suffix.
folder_path = 'data/CT23_1A_checkworthy_multimodal_english_v2'
zip_file_path = folder_path + '.zip'

def zip_extration(folder_path, zip_file_path):
    """Extract ``zip_file_path`` into ``folder_path`` if the folder is missing.

    When ``folder_path`` already exists nothing is extracted. The (misspelled)
    function name is kept so existing callers continue to work.

    Args:
        folder_path: Directory the archive contents are extracted into.
        zip_file_path: Path of the zip archive to extract.

    Raises:
        FileNotFoundError: If the folder is missing and the archive does not exist.
        zipfile.BadZipFile: If the archive is not a valid zip file.
    """
    print('Zip file extraction started')
    if not os.path.exists(folder_path):
        print('Folder does not exist, extracting zip file')

        # Open the archive *before* creating the folder: if the zip is missing
        # or corrupt we must not leave an empty folder behind, otherwise every
        # later run would see the folder and silently skip the extraction.
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            os.makedirs(folder_path)
            try:
                zip_ref.extractall(folder_path)
            except BaseException:
                # Remove the folder we just created so a partial extraction is
                # not mistaken for a complete dataset on the next run.
                shutil.rmtree(folder_path, ignore_errors=True)
                raise

    print('Zip file extracted')

# Extract the archive first (skipped when the folder already exists), then
# build the paths of the train and test JSONL files inside the dataset folder.
zip_extration(folder_path, zip_file_path)

train_path = f'{folder_path}/CT23_1A_checkworthy_multimodal_english_train.jsonl'
test_path = f'{folder_path}/CT23_1A_checkworthy_multimodal_english_test.jsonl'

def split_json(data):
    """Split one record into a text-only dict and an image-only dict.

    Both dicts carry ``tweet_id``, ``tweet_url`` and ``class_label``; the text
    dict adds ``tweet_text`` and ``ocr_text``, the image dict adds
    ``image_path`` and ``image_url``.

    Args:
        data: Mapping holding all of the keys listed above.

    Returns:
        A ``(text_data, image_data)`` tuple of dicts.

    Raises:
        KeyError: If ``data`` lacks one of the expected keys.
    """
    text_keys = ('tweet_id', 'tweet_url', 'tweet_text', 'ocr_text', 'class_label')
    image_keys = ('tweet_id', 'tweet_url', 'class_label', 'image_path', 'image_url')

    text_data = {key: data[key] for key in text_keys}
    image_data = {key: data[key] for key in image_keys}
    return text_data, image_data


# Read data from the folder
def read_data(file_path):
    """Read a JSON Lines file and split every record into text and image parts.

    Each non-blank line is parsed with ``json.loads`` and handed to
    ``split_json``.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A ``(text_data, image_data)`` tuple of lists with one dict per record,
        in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks a key required by ``split_json``.
    """
    text_data = []
    image_data = []
    # Decode explicitly as UTF-8: without it open() falls back to the
    # platform's default encoding, which can fail on non-ASCII tweet text.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not records and would make json.loads raise.
            if not line.strip():
                continue
            json_obj = json.loads(line)
            text, image = split_json(json_obj)
            text_data.append(text)
            image_data.append(image)
    return text_data, image_data

# Load the training split and print the first record of each part as a quick
# sanity check of the parsing.
train_text_data, train_image_data = read_data(train_path)
print('Text:', train_text_data[0])
print('Image:', train_image_data[0])


Zip file extraction started
Zip file extracted
Image: {'tweet_id': '1222845188567003136', 'tweet_url': 'https://twitter.com/user/status/1222845188567003136', 'class_label': 'Yes', 'image_path': 'images_labeled/train/1222845188567003136.jpg', 'image_url': 'http://pbs.twimg.com/media/EPhrN-SU4AAKvGf.jpg'}


## Clean Data

In [2]:
# TODO: clean the data here (e.g. lemmatization and similar text normalization)


## Run Text Analyzer Model

In [3]:
try:
    import transformers
except ImportError:
    %pip install transformers
# Train model here
# Use RoBERTa model or GPT model from Hugging Face

  from .autonotebook import tqdm as notebook_tqdm


## Run Image Analyzer Model

In [18]:
# Train model here
# Use VIT model from Hugging Face
try:
    from transformers import ViTFeatureExtractor, ViTModel, RobertaTokenizer, RobertaModel
    import torch
    import torch.nn as nn
except ImportError:
    %pip install transformers
    %pip install torch
    %pip install pillow

from PIL import Image
# Train model here

# Text model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
text_model = RobertaModel.from_pretrained("roberta-base")

# ViT model from Hugging Face
image_processor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
image_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')


image_path = folder_path + '/' + train_image_data[0]['image_path']
image = Image.open(image_path)

text = train_text_data[0]['tweet_text'] + ' ' + train_text_data[0]['ocr_text']

# Text input and output
text_inputs = tokenizer(text, return_tensors="pt", padding=True)
text_outputs = text_model(**text_inputs)

# Image input and output
image_inputs = image_processor(images=image, return_tensors="pt")
image_outputs = image_model(pixel_values=image_inputs.pixel_values)

# Combine text and image features (e.g., concatenation)
combined_features = torch.cat([text_outputs.last_hidden_state.mean(dim=1), image_outputs.last_hidden_state.mean(dim=1)], dim=1)


# Define the classification model
classification_model = nn.Sequential(
    nn.Linear(combined_features.shape[-1], 2)  # Output size is 2 for binary classification
)

# Forward pass to obtain logits
logits = classification_model(combined_features)

# Apply softmax activation function to obtain probabilities
probabilities = nn.functional.softmax(logits, dim=-1)

# Get the predicted class
predicted_class = torch.argmax(probabilities, dim=-1)

# Convert predicted class to "Yes" or "No"
predicted_label = ["Yes", "No"][predicted_class.item()]

print("Predicted label:", predicted_label)
print("Ground Truth:", train_text_data[0]['class_label'])




Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted label: No
Ground Truth: Yes


## Concatenate The Two Models

In [None]:
# Concatenate the models here to get the image and text results together