# DeepLearning Hate Speech
This notebook contains our project implementation, instructions for running it, and accompanying explanations.

#### 1.1 Imports

In [2]:
import os
import json
import random

In [3]:
# Dataset location. Every split file sits directly inside the base directory,
# so the individual paths are derived from it instead of being spelled out.
destination_dir  = "hateful_memes/"
train_json       = destination_dir + "train.jsonl"
test_seen_json   = destination_dir + "test_seen.jsonl"
test_unseen_json = destination_dir + "test_unseen.jsonl"

In [4]:
# Topic name -> lowercase keywords associated with that topic.
# Topic names are also used as file-name prefixes when the class files are
# written, so they must not contain stray whitespace: column alignment is done
# with spaces *outside* the quotes.
keywords = {
    "Womens"   : ["woman", "she", "her", "female", "feminine", "lady", "girl", "feminist", "queen", "mother", "daughter"],
    "Africans" : ["black", "afro", "african", "ebony", "panafrican", "afroamerican", "nubian", "melanin"],
    "Muslims"  : ["muslim", "arab", "coran", "islam", "islamic", "muslimah", "hijab", "sunni", "shiite", "halal", "mosque"],
    "LGBTQ"    : ["lgbt", "gay", "homo", "lesbian", "bisexual", "transgender", "queer", "pride", "rainbow", "drag", "trans", "nonbinary"],
    "Jews"     : ["jewish", "jew", "hebrew", "israeli", "yiddish", "semitic", "kosher", "rabbi"],
    "Politics" : ["activist", "protest", "demonstration", "rally", "activism", "campaign", "cause", "resist", "justice", "rights"],
    "Asians"   : ["asian", "chinese", "oriental", "eastasian", "japanese", "korean", "vietnamese", "filipino", "anime", "manga"],
    "Disable"  : ["disabled", "retarded", "handicap", "disability", "impaired", "specialneeds", "wheelchair", "autism", "dyslexia"],
    "Homeless" : ["homeless", "poor", "beggar", "vagrant", "homelessness", "poverty", "shelter", "unemployed", "panhandler"],
    "Natives"  : ["indigenous", "nativeamerican", "firstnations", "aboriginal", "tribal", "native", "reservation", "tribe"],
    "Latinos"  : ["latino", "hispanic", "chicano", "mestizo", "latina", "latinx", "boricua", "tejano"],
}

def count_keywords(file_path, keywords, verbosity = 0):
    """
    Count, per topic, the memes of a JSON Lines file whose text contains a topic keyword.

    The file is read line by line; blank lines are skipped and every other line
    must be a JSON object with a 'text' field ('id' and 'label' are read for the
    memes that match a keyword). The text is split on whitespace and each word is
    lower-cased and compared for exact equality with the keywords. A word is
    attributed to the first topic, in dictionary order, that lists it, and a meme
    is counted at most once per topic.

    Args:
        file_path: str, path to the JSON Lines file
        keywords: dict, maps a topic name to the list of keywords to search for
        verbosity: int, a per-topic summary is printed when greater than 0
    Output:
        keyword_info: dict, maps each topic to a dict with the keys
            'count'     : number of distinct memes matching the topic
            'ids'       : ids of those memes, in file order
            'harm_rate' : label_1 / count rounded to 2 decimals (0 when count is 0)
            'label_0'   : number of matching memes whose label is 0
            'label_1'   : number of matching memes whose label is 1
    """
    # Initialize a dictionary to store the counts, ids, and label counts
    keyword_info = {topic: {'count': 0, 'ids': [], 'harm_rate': 0, 'label_0': 0, 'label_1': 0} for topic in keywords}

    # Sets make the membership tests constant-time; the public 'ids' lists are
    # still filled in file order, so the returned structure is unchanged.
    keyword_sets = {topic: set(topic_keywords) for topic, topic_keywords in keywords.items()}
    seen_ids = {topic: set() for topic in keywords}

    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue
            obj = json.loads(line)

            # Check each word of the meme text against the keywords
            for word in obj['text'].split():
                word = word.lower()
                for topic, topic_keywords in keyword_sets.items():
                    if word not in topic_keywords:
                        continue
                    meme_id = obj['id']
                    if meme_id not in seen_ids[topic]:
                        # First keyword hit of this meme for this topic
                        seen_ids[topic].add(meme_id)
                        info = keyword_info[topic]
                        info['ids'].append(meme_id)
                        info['count'] += 1
                        # Count the labels
                        if obj['label'] == 0:
                            info['label_0'] += 1
                        elif obj['label'] == 1:
                            info['label_1'] += 1
                    break  # A word belongs to the first topic that lists it

    # Compute the harm rate
    for info in keyword_info.values():
        # Guard against division by zero for topics without any match
        if info['count'] == 0:
            info['harm_rate'] = 0
        else:
            info['harm_rate'] = round(info['label_1'] / info['count'], 2)

    if verbosity > 0:
        # basename works for any directory, not only a 14-character prefix
        print("File:", os.path.basename(file_path))
        for topic, info in keyword_info.items():
            print(f"{topic.ljust(8)}:  {str(info['count']).ljust(3)}  Harm Rate:  {info['harm_rate']:.2f}  Label 0:  {str(info['label_0']).ljust(3)}  Label 1:  {info['label_1']}")
        print("\n")
    return keyword_info

# Print the per-topic statistics of the training split and of the unseen test
# split (the seen test split is left disabled). The trailing ';' keeps the
# notebook from echoing the returned dictionary.
count_keywords(file_path=train_json, keywords=keywords, verbosity=1);
# count_keywords(file_path=test_seen_json, keywords=keywords, verbosity=1);
count_keywords(file_path=test_unseen_json, keywords=keywords, verbosity=1);

File: train.jsonl
Womens  :  524  Harm Rate:  0.39  Label 0:  321  Label 1:  203
Africans:  372  Harm Rate:  0.69  Label 0:  116  Label 1:  256
Muslims :  332  Harm Rate:  0.72  Label 0:  94   Label 1:  238
LGBTQ   :  150  Harm Rate:  0.75  Label 0:  37   Label 1:  113
Jews    :  100  Harm Rate:  0.70  Label 0:  30   Label 1:  70
Politics:  62   Harm Rate:  0.40  Label 0:  37   Label 1:  25
Asians  :  60   Harm Rate:  0.63  Label 0:  22   Label 1:  38
Disable :  46   Harm Rate:  0.80  Label 0:  9    Label 1:  37
Homeless:  31   Harm Rate:  0.39  Label 0:  19   Label 1:  12
Natives :  15   Harm Rate:  0.27  Label 0:  11   Label 1:  4
Latinos :  1    Harm Rate:  1.00  Label 0:  0    Label 1:  1


File: test_unseen.jsonl
Womens  :  102  Harm Rate:  0.43  Label 0:  58   Label 1:  44
Africans:  54   Harm Rate:  0.59  Label 0:  22   Label 1:  32
Muslims :  47   Harm Rate:  0.49  Label 0:  24   Label 1:  23
LGBTQ   :  22   Harm Rate:  0.50  Label 0:  11   Label 1:  11
Jews    :  35   Harm Rat

A topic becomes a class only if at least 300 memes of the training set mention it. Thus, only the topics "Womens", "Africans" and "Muslims" are kept as classes.

In [7]:
def _write_jsonl(path, objects):
    """Write every object of `objects` to `path`, one JSON document per line."""
    parent = os.path.dirname(path)
    if parent:
        # Create the destination directory on demand
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        for obj in objects:
            f.write(json.dumps(obj) + '\n')


def create_class_files(file_path, keyword_info, destination_dir, mode, threshold=None, seed=None):
    """
    This function creates .jsonl files for each topic in the keyword_info dictionary that is big enough.
    It combines the memes associated with the topic (original labels kept) with an
    equal number of randomly chosen memes not associated with the topic (label forced to 0).
    args:
        file_path: str, path to the original JSON Lines file
        keyword_info: dict, maps each topic to a dict providing at least 'count' and 'ids'
        destination_dir: str, directory to save the new files (created if missing)
        mode: str, "train" writes an 80/20 split to '{topic}_train.jsonl' and
            '{topic}_val.jsonl'; any other value writes a single '{topic}_{mode}.jsonl'
        threshold: int or None, minimum 'count' for a topic to get files;
            None selects 300 for mode "train" and 46 otherwise
        seed: int or None, seeds a private random generator for reproducible
            sampling and shuffling; None uses the global `random` state
    returns:
        None
    """
    if threshold is None:
        threshold = 300 if mode == "train" else 46
    rng = random if seed is None else random.Random(seed)

    # Load all objects from the original file into a list (blank lines skipped)
    with open(file_path, 'r', encoding='utf-8') as f:
        all_objects = [json.loads(line) for line in f if line.strip()]

    for topic, info in keyword_info.items():
        # Only topics with enough memes become a class
        if info['count'] < threshold:
            continue

        topic_ids = set(info['ids'])
        topic_objects = [obj for obj in all_objects if obj['id'] in topic_ids]
        non_topic_pool = [obj for obj in all_objects if obj['id'] not in topic_ids]

        # Never ask for more negatives than exist (topic covering most of the file)
        sample_size = min(len(topic_objects), len(non_topic_pool))
        # Relabel *copies*: the dicts in all_objects are shared between topics, so
        # changing them in place would overwrite the true label of memes that
        # belong to a topic processed later.
        non_topic_objects = [{**obj, 'label': 0} for obj in rng.sample(non_topic_pool, sample_size)]

        # Combine the topic and non-topic objects
        combined_objects = topic_objects + non_topic_objects
        rng.shuffle(combined_objects)

        if mode == "train":
            # 80% for training, 20% for validation
            train_size = int(0.8 * len(combined_objects))
            train_objects = combined_objects[:train_size]
            validation_objects = combined_objects[train_size:]

            file_name_train = os.path.join(destination_dir, f'{topic}_train.jsonl')
            file_name_val = os.path.join(destination_dir, f'{topic}_val.jsonl')

            _write_jsonl(file_name_train, train_objects)
            print(f"File written: {file_name_train}, {len(train_objects)} memes")
            _write_jsonl(file_name_val, validation_objects)
            print(f"File written: {file_name_val}, {len(validation_objects)} memes\n")
        else:
            file_name = os.path.join(destination_dir, f'{topic}_{mode}.jsonl')
            _write_jsonl(file_name, combined_objects)
            print(f"File written: {file_name}, {len(combined_objects)} memes")

# Build the per-topic class files: a train/validation pair from the training
# split, and a single test file from the unseen test split.
create_class_files(
    file_path=train_json,
    keyword_info=count_keywords(file_path=train_json, keywords=keywords, verbosity=0),
    destination_dir=destination_dir,
    mode="train",
);
create_class_files(
    file_path=test_unseen_json,
    keyword_info=count_keywords(file_path=test_unseen_json, keywords=keywords, verbosity=0),
    destination_dir=destination_dir,
    mode="test",
);

File written: hateful_memes/Womens_train.jsonl, 838 memes
File written: hateful_memes/Womens_val.jsonl, 210 memes

File written: hateful_memes/Africans_train.jsonl, 595 memes
File written: hateful_memes/Africans_val.jsonl, 149 memes

File written: hateful_memes/Muslims_train.jsonl, 531 memes
File written: hateful_memes/Muslims_val.jsonl, 133 memes

File written: hateful_memes/Womens_test.jsonl, 204 memes
File written: hateful_memes/Africans_test.jsonl, 108 memes
File written: hateful_memes/Muslims_test.jsonl, 94 memes


In [8]:
# Keyword lists restricted to the three topics for which class files were written.
refined_keywords = dict(
    Womens=["woman", "she", "her", "female", "feminine", "lady", "girl", "feminist", "queen", "mother", "daughter"],
    Africans=["black", "afro", "african", "ebony", "panafrican", "afroamerican", "nubian", "melanin"],
    Muslims=["muslim", "arab", "coran", "islam", "islamic", "muslimah", "hijab", "sunni", "shiite", "halal", "mosque"],
)

# Inspect how the three retained topics are distributed inside the generated
# class files. Only the "Muslims" files are checked; uncomment a group below
# to inspect another class.

# count_keywords(file_path="hateful_memes/Womens_train.jsonl", keywords=refined_keywords, verbosity=1);
# count_keywords(file_path="hateful_memes/Womens_val.jsonl", keywords=refined_keywords, verbosity=1);
# count_keywords(file_path="hateful_memes/Womens_test.jsonl", keywords=refined_keywords, verbosity=1);

# count_keywords(file_path="hateful_memes/Africans_train.jsonl", keywords=refined_keywords, verbosity=1);
# count_keywords(file_path="hateful_memes/Africans_val.jsonl", keywords=refined_keywords, verbosity=1);
# count_keywords(file_path="hateful_memes/Africans_test.jsonl", keywords=refined_keywords, verbosity=1);

count_keywords(file_path="hateful_memes/Muslims_train.jsonl", keywords=refined_keywords, verbosity=1);
count_keywords(file_path="hateful_memes/Muslims_val.jsonl", keywords=refined_keywords, verbosity=1);
count_keywords(file_path="hateful_memes/Muslims_test.jsonl", keywords=refined_keywords, verbosity=1);

FileNotFoundError: [Errno 2] No such file or directory: 'hateful_memes/Womens.jsonl'

In [6]:
# from PIL import Image
# import requests
# from transformers import CLIPProcessor, CLIPModel

# model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# image = Image.open("hateful_memes/img/01235.png")

# #show the image
# #image.show()

# inputs = processor(text=["hateful meme", "not hateful meme"], images=[image], return_tensors="pt", padding=True)

# outputs = model(**inputs)
# logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
# probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

# # Get the list of texts
# texts = ["hateful meme", "not hateful meme"]

# # Convert the probabilities tensor to a list
# probs_list = probs.tolist()[0]

# # Print the probabilities associated with each text
# for i in range(len(texts)):
#     print(f"The probability that the image is '{texts[i]}' is {probs_list[i]}")

In [7]:
# from PIL import Image
# from transformers import CLIPProcessor, CLIPModel

# # Initialize the model and processor
# model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# # Load the image
# image = Image.open("hateful_memes/img/01235.png")

# # Retrieve the caption for the meme
# # This is a placeholder function. Replace it with your actual method to fetch the caption.
# def get_meme_caption(meme_id):
#     # Example: return "This is a caption for a meme"
#     return "when you're feeling horny asf but your habibi is on periods let's try a goat"

# # Example meme ID
# meme_id = "01235"
# caption = get_meme_caption(meme_id)

# # Combine the caption with the predefined descriptions
# input_texts = ["hateful meme with caption: when you're feeling horny asf but your habibi is on periods let's try a goat", "not hateful meme with caption: when you're feeling horny asf but your habibi is on periods let's try a goat"]

# # Process the inputs
# inputs = processor(text=input_texts, images=[image], return_tensors="pt", padding=True)

# # Perform the classification
# outputs = model(**inputs)
# logits_per_image = outputs.logits_per_image
# probs = logits_per_image.softmax(dim=1)

# # Convert the probabilities tensor to a list
# probs_list = probs.tolist()[0]

# # Print the probabilities associated with each text
# texts = input_texts
# for i in range(len(texts)):
#     print(f"The probability that the image is '{texts[i]}' is {probs_list[i]}")

In [8]:
# import requests
# from PIL import Image
# from transformers import AutoProcessor, Blip2ForConditionalGeneration

# processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
# model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)

# device = "mps"
# model.to(device)

# inputs = processor(image, return_tensors="pt").to(device, torch.float16)

# generated_ids = model.generate(**inputs, max_new_tokens=20)
# generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
# print(generated_text)

In [9]:
# from PIL import Image
# import requests
# from transformers import AutoProcessor, FlavaModel

# model = FlavaModel.from_pretrained("facebook/flava-full")
# processor = AutoProcessor.from_pretrained("facebook/flava-full")

# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)

# image.show()

# inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)

# outputs = model(**inputs)

# from sklearn.metrics.pairwise import cosine_similarity

# # Your existing code...
# # ...
# image_embeddings = outputs.image_embeddings
# text_embeddings = outputs.text_embeddings
# multimodal_embeddings = outputs.multimodal_embeddings

# outputs.image_embeddings.shape
# torch.Size([1, 197, 768])

# text_embeddings.shape
# torch.Size([1, 7, 768])

# multimodal_embeddings.shape
# torch.Size([1, 205, 768])

# # Calculate the mean of the embeddings across the sequence length dimension
# mean_image_embeddings = image_embeddings.mean(dim=1)
# mean_text_embeddings = text_embeddings.mean(dim=1)

# # Calculate the cosine similarity
# similarity = cosine_similarity(mean_image_embeddings.detach().numpy(), mean_text_embeddings.detach().numpy())

# print(f"Cosine similarity between the text and the image: {similarity[0][0]}")