# DeepLearning Hate Speech
This notebook contains our project implementation, the instructions to run the notebook and explanations.

#### 1.1 Imports

In [1]:
import os
import json
import random

In [2]:
# Define the directory paths
destination_dir  = "hateful_memes/"
train_json       = "hateful_memes/train.jsonl"
test_seen_json   = "hateful_memes/test_seen.jsonl"
test_unseen_json = "hateful_memes/test_unseen.jsonl"

In [3]:
keywords = {
    "Africans" : ["black", "white", "afro", "african", "ebony", "panafrican", "afroamerican", "nubian", "melanin", "slave", "slavery", "field", "cotton"],
    "Womens"   : ["woman", "she", "her", "female", "feminine", "lady", "girl", "feminist", "queen", "mother", "daughter"],
    "Muslims"  : ["muslim", "arab", "coran", "islam", "islamic", "muslimah", "hijab", "sunni", "shiite", "halal", "mosque", "goat", "habibi"],
    "LGBTQ"    : ["lgbt", "gay", "homo", "lesbian", "bisexual", "transgender", "queer", "pride", "rainbow", "drag", "trans", "nonbinary"],
    "Jews"     : ["jewish", "jew", "hebrew", "israeli", "yiddish", "semitic", "kosher", "rabbi"],
    "Politics" : ["activist", "protest", "demonstration", "rally", "activism", "campaign", "cause", "resist", "justice", "rights"],
    "Asians"   : ["asian", "chinese", "oriental", "eastasian", "japanese", "korean", "vietnamese", "filipino", "anime", "manga"],
    "Disable"  : ["disabled", "retarded", "handicap", "disability", "impaired", "specialneeds", "wheelchair", "autism", "dyslexia"],
    "Homeless" : ["homeless", "poor", "beggar", "vagrant", "homelessness", "poverty", "shelter", "unemployed", "panhandler"],
    "Natives " : ["indigenous", "nativeamerican", "firstnations", "aboriginal", "tribal", "native", "reservation", "tribe"],
    "Latinos " : ["latino", "hispanic", "chicano", "mestizo", "latina", "latinx", "boricua", "tejano"],
}

def count_keywords(file_path, keywords, verbosity = 0):
    """
    This funtion reads a JSON file line by line, and counts the number of occurrences of keywords related to a topic in the text field of the JSON object.
    It also stores the ids of the memes that contain a keyword, counts the number of labels 0 and 1 for each meme found and computes the harm rate.
    Args:
        file_path: str, path to the JSON file
        keywords: dict, dictionary of keywords to search for in the specified file
        verbosity: int, level of verbosity for output
    Output:
        keyword_info: dict, dictionary containing the counts, ids, and label counts for each keyword
    """
    # Initialize a dictionary to store the counts, ids, and label counts
    keyword_info = {key: {'count': 0, 'ids': [], 'harm_rate': 0, 'label_0': 0, 'label_1': 0} for key in keywords.keys()}
    
    # Open the JSON file
    with open(file_path, 'r') as f:
        # Iterate over the lines in the file
        for line in f:
            # Load the JSON object from the line
            obj = json.loads(line)
            # Split the text into words
            words = obj['text'].split()

            # Check each word against the keywords
            for word in words:
                for topic, topic_keywords in keywords.items():
                    if word.lower() in topic_keywords:
                        # Append the id to the corresponding topic in the ids dictionary
                        if obj['id'] not in keyword_info[topic]['ids']:
                            keyword_info[topic]['ids'].append(obj['id'])
                            keyword_info[topic]['count'] += 1
                            # Count the labels
                            if obj['label'] == 0:
                                keyword_info[topic]['label_0'] += 1
                            elif obj['label'] == 1:
                                keyword_info[topic]['label_1'] += 1
                        break  # Stop checking other keywords once a keyword is found
    # Compute the harm rate
    for topic in keyword_info.keys():
        #check if count is zero to avoid division by zero
        if keyword_info[topic]['count'] == 0:
            harm_rate = 0
        else:
            harm_rate = round(keyword_info[topic]['label_1'] / len(open(file_path).readlines()), 4)
            harm_rate = harm_rate * 100
        keyword_info[topic]['harm_rate'] = harm_rate
    if verbosity > 0:
        print("File:", file_path[14:])
        for topic, info in keyword_info.items():
            print(f"{topic.ljust(8)}:  {str(info['count']).ljust(3)}  Harm Rate:  {info['harm_rate']:.2f}%  Label 0:  {str(info['label_0']).ljust(3)}  Label 1:  {info['label_1']}")
    return keyword_info

count_keywords(train_json, keywords, verbosity = 1);
count_keywords(test_seen_json, keywords, verbosity = 1);
count_keywords(test_unseen_json, keywords, verbosity = 1);

File: train.jsonl
Africans:  688  Harm Rate:  5.28%  Label 0:  239  Label 1:  449
Womens  :  524  Harm Rate:  2.39%  Label 0:  321  Label 1:  203
Muslims :  448  Harm Rate:  3.40%  Label 0:  159  Label 1:  289
LGBTQ   :  150  Harm Rate:  1.33%  Label 0:  37   Label 1:  113
Jews    :  100  Harm Rate:  0.82%  Label 0:  30   Label 1:  70
Politics:  62   Harm Rate:  0.29%  Label 0:  37   Label 1:  25
Asians  :  60   Harm Rate:  0.45%  Label 0:  22   Label 1:  38
Disable :  46   Harm Rate:  0.44%  Label 0:  9    Label 1:  37
Homeless:  31   Harm Rate:  0.14%  Label 0:  19   Label 1:  12
Natives :  15   Harm Rate:  0.05%  Label 0:  11   Label 1:  4
Latinos :  1    Harm Rate:  0.01%  Label 0:  0    Label 1:  1
File: test_seen.jsonl
Africans:  71   Harm Rate:  4.90%  Label 0:  22   Label 1:  49
Womens  :  50   Harm Rate:  2.90%  Label 0:  21   Label 1:  29
Muslims :  45   Harm Rate:  3.00%  Label 0:  15   Label 1:  30
LGBTQ   :  15   Harm Rate:  1.10%  Label 0:  4    Label 1:  11
Jews    :  18

We set the minimum number of memes for a topic to become a class to 300. Thus, only the topics "Womens", "African" and "Muslims" are kept as class.

In [4]:
def create_class_files(file_path, keyword_info, destination_dir, mode):
    """
    This function creates .jsonl files for each topic in the keyword_info dictionary that is big enough.
    It combines the memes associated with the topic with an equal number of memes not associated with the topic.
    args:
        file_path: str, path to the original file
        keyword_info: dict, dictionary containing the keyword information
        destination_dir: str, directory to save the new files
        mode: str, 'train' or 'test'
    returns:
        None
    """
    if mode == "train":
        threshold = 400
    else:
        threshold = 46

    # Load all objects from the original file into a list
    with open(file_path, 'r') as f:
        all_objects = [json.loads(line) for line in f]

    # A set to keep track of ids already used as non-topic objects
    used_non_topic_ids = set()

    # List to store all combined objects from different topics
    all_combined_objects = []

    # List to store objects for the concatenated file
    concatenated_objects = []

    # Iterate over each topic in keyword_info
    for topic, info in keyword_info.items():
        # Check if the count of the topic is greater than the threshold
        if info['count'] >= threshold:
            # Get all objects associated with this topic
            topic_objects = [obj for obj in all_objects if obj['id'] in info['ids']]
            # Get all objects not associated with this topic and not already used as non-topic objects
            non_topic_objects = [obj for obj in all_objects if obj['id'] not in info['ids'] and obj['id'] not in used_non_topic_ids]
            # Randomly select a similar amount of non-topic objects
            non_topic_objects = random.sample(non_topic_objects, len(topic_objects))
            # Force the label of these non-topic objects to 0 for topic-specific files
            for obj in non_topic_objects:
                obj['label'] = 0
                # Add the id to the set of used non-topic ids
                used_non_topic_ids.add(obj['id'])
            # Combine the topic and non-topic objects
            combined_objects = topic_objects + non_topic_objects
            random.shuffle(combined_objects)
            all_combined_objects.extend(combined_objects)
            concatenated_objects.extend(topic_objects)  # Add only original topic objects
            concatenated_objects.extend(non_topic_objects)  # Add only original non-topic objects
            if mode == "train":
                # 80% for training, 20% for validation
                train_size = int(0.8 * len(combined_objects))
                train_objects = combined_objects[:train_size]
                validation_objects = combined_objects[train_size:]
                # Create new file names with the destination directory
                file_name_train = os.path.join(destination_dir, f'{topic}_train.jsonl')
                file_name_val = os.path.join(destination_dir, f'{topic}_val.jsonl')
                # Write the combined objects to a new .jsonl file
                with open(file_name_train, 'w') as f:
                    for obj in train_objects:
                        f.write(json.dumps(obj) + '\n')
                print(f"File written: {file_name_train[14:]}, {len(train_objects)} memes")
                with open(file_name_val, 'w') as f:
                    for obj in validation_objects:
                        f.write(json.dumps(obj) + '\n')
                print(f"File written: {file_name_val[14:]}, {len(validation_objects)} memes\n")
            else:
                # Create new file names with the destination directory
                file_name = os.path.join(destination_dir, f'{topic}_{mode}.jsonl')
                # Write the combined objects to a new .jsonl file
                with open(file_name, 'w') as f:
                    for obj in combined_objects:
                        f.write(json.dumps(obj) + '\n')
                print(f"File written: {file_name[14:]}, {len(combined_objects)} memes")
    
    # Write all concatenated objects to additional train and validation files without modifying labels
    unique_combined_objects = list({obj['id']: obj for obj in concatenated_objects}.values())  # Ensure no duplicates
    random.shuffle(unique_combined_objects)
    if mode == "train":
        train_size = int(0.8 * len(unique_combined_objects))
        train_objects = unique_combined_objects[:train_size]
        validation_objects = unique_combined_objects[train_size:]

        concatenated_train_file_name = os.path.join(destination_dir, f'Base_train.jsonl')
        concatenated_val_file_name = os.path.join(destination_dir, f'Base_val.jsonl')

        with open(concatenated_train_file_name, 'w') as f:
            for obj in train_objects:
                f.write(json.dumps(obj) + '\n')
        print(f"File written: {concatenated_train_file_name[14:]}, {len(train_objects)} memes")

        with open(concatenated_val_file_name, 'w') as f:
            for obj in validation_objects:
                f.write(json.dumps(obj) + '\n')
        print(f"File written: {concatenated_val_file_name[14:]}, {len(validation_objects)} memes")
    else:
        concatenated_file_name = os.path.join(destination_dir, f'Base_{mode}.jsonl')
        with open(concatenated_file_name, 'w') as f:
            for obj in unique_combined_objects:
                f.write(json.dumps(obj) + '\n')
        print(f"File written: {concatenated_file_name[14:]}, {len(unique_combined_objects)} memes")

create_class_files(train_json, count_keywords(train_json, keywords, verbosity = 0), destination_dir, "train");
create_class_files(test_unseen_json, count_keywords(test_unseen_json, keywords, verbosity = 0), destination_dir, "test");

File written: Africans_train.jsonl, 1100 memes
File written: Africans_val.jsonl, 276 memes

File written: Womens_train.jsonl, 838 memes
File written: Womens_val.jsonl, 210 memes

File written: Muslims_train.jsonl, 716 memes
File written: Muslims_val.jsonl, 180 memes

File written: Base_train.jsonl, 2414 memes
File written: Base_val.jsonl, 604 memes
File written: Africans_test.jsonl, 208 memes
File written: Womens_test.jsonl, 204 memes
File written: Muslims_test.jsonl, 122 memes
File written: Base_test.jsonl, 504 memes


In [5]:
refined_keywords = { "Africans" : ["black", "white", "afro", "african", "ebony", "panafrican", "afroamerican", "nubian", "melanin", "slave", "slavery", "field", "cotton"],
                     "Womens"  : ["woman", "she", "her", "female", "feminine", "lady", "girl", "feminist", "queen", "mother", "daughter"],
                     "Muslims" : ["muslim", "arab", "coran", "islam", "islamic", "muslimah", "hijab", "sunni", "shiite", "halal", "mosque", "goat", "habibi"], }

count_keywords("hateful_memes/Base_train.jsonl", refined_keywords, verbosity = 1);
count_keywords("hateful_memes/Base_test.jsonl", refined_keywords, verbosity = 1);

count_keywords("hateful_memes/Africans_train.jsonl", {"Africans":refined_keywords["Africans"]}, verbosity = 1);
count_keywords("hateful_memes/Africans_val.jsonl", {"Africans":refined_keywords["Africans"]}, verbosity = 1);
count_keywords("hateful_memes/Africans_test.jsonl", {"Africans":refined_keywords["Africans"]}, verbosity = 1);

count_keywords("hateful_memes/Womens_train.jsonl", {"Womens":refined_keywords["Womens"]}, verbosity = 1);
count_keywords("hateful_memes/Womens_val.jsonl", {"Womens":refined_keywords["Womens"]}, verbosity = 1);
count_keywords("hateful_memes/Womens_test.jsonl", {"Womens":refined_keywords["Womens"]}, verbosity = 1);

count_keywords("hateful_memes/Muslims_train.jsonl", {"Muslims":refined_keywords["Muslims"]}, verbosity = 1);
count_keywords("hateful_memes/Muslims_val.jsonl", {"Muslims":refined_keywords["Muslims"]}, verbosity = 1);
count_keywords("hateful_memes/Muslims_test.jsonl", {"Muslims":refined_keywords["Muslims"]}, verbosity = 1);

File: Base_train.jsonl
Africans:  546  Harm Rate:  12.68%  Label 0:  240  Label 1:  306
Womens  :  429  Harm Rate:  5.88%  Label 0:  287  Label 1:  142
Muslims :  348  Harm Rate:  8.08%  Label 0:  153  Label 1:  195
File: Base_test.jsonl
Africans:  104  Harm Rate:  10.91%  Label 0:  49   Label 1:  55
Womens  :  102  Harm Rate:  8.33%  Label 0:  60   Label 1:  42
Muslims :  61   Harm Rate:  5.16%  Label 0:  35   Label 1:  26
File: Africans_train.jsonl
Africans:  548  Harm Rate:  32.36%  Label 0:  192  Label 1:  356
File: Africans_val.jsonl
Africans:  140  Harm Rate:  33.70%  Label 0:  47   Label 1:  93
File: Africans_test.jsonl
Africans:  104  Harm Rate:  29.33%  Label 0:  43   Label 1:  61
File: Womens_train.jsonl
Womens  :  417  Harm Rate:  16.47%  Label 0:  279  Label 1:  138
File: Womens_val.jsonl
Womens  :  107  Harm Rate:  20.95%  Label 0:  63   Label 1:  44
File: Womens_test.jsonl
Womens  :  102  Harm Rate:  20.59%  Label 0:  60   Label 1:  42
File: Muslims_train.jsonl
Muslims : 

In [6]:
# from PIL import Image
# import requests
# from transformers import CLIPProcessor, CLIPModel

# model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# image = Image.open("hateful_memes/img/01235.png")

# #show the image
# #image.show()

# inputs = processor(text=["hateful meme", "not hateful meme"], images=[image], return_tensors="pt", padding=True)

# outputs = model(**inputs)
# logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
# probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

# # Get the list of texts
# texts = ["hateful meme", "not hateful meme"]

# # Convert the probabilities tensor to a list
# probs_list = probs.tolist()[0]

# # Print the probabilities associated with each text
# for i in range(len(texts)):
#     print(f"The probability that the image is '{texts[i]}' is {probs_list[i]}")

In [7]:
# from PIL import Image
# import requests
# from transformers import AutoProcessor, FlavaModel

# model = FlavaModel.from_pretrained("facebook/flava-full")
# processor = AutoProcessor.from_pretrained("facebook/flava-full")

# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)

# image.show()

# inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)

# outputs = model(**inputs)

# from sklearn.metrics.pairwise import cosine_similarity

# # Your existing code...
# # ...
# image_embeddings = outputs.image_embeddings
# text_embeddings = outputs.text_embeddings
# multimodal_embeddings = outputs.multimodal_embeddings

# outputs.image_embeddings.shape
# torch.Size([1, 197, 768])

# text_embeddings.shape
# torch.Size([1, 7, 768])

# multimodal_embeddings.shape
# torch.Size([1, 205, 768])

# # Calculate the mean of the embeddings across the sequence length dimension
# mean_image_embeddings = image_embeddings.mean(dim=1)
# mean_text_embeddings = text_embeddings.mean(dim=1)

# # Calculate the cosine similarity
# similarity = cosine_similarity(mean_image_embeddings.detach().numpy(), mean_text_embeddings.detach().numpy())

# print(f"Cosine similarity between the text and the image: {similarity[0][0]}")