In [None]:
!pip install pyunpack
!pip install patool



In [None]:
# imports
import os
import torch
import torch.nn as nn
import json
import csv
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image, read_file

In [None]:
# mount google drive files
from google.colab import drive
drive.mount('/content/drive')
dataset_path = 'drive/MyDrive/DL_project/dataset_out'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# from pyunpack import Archive
# Archive('drive/MyDrive/DL_project/dataset.7z').extractall('drive/MyDrive/DL_project/')
from zipfile import ZipFile
with ZipFile('drive/MyDrive/DL_project/dataset_out/dataset/img_txt.zip', 'r') as zipObj:
  zipObj.extractall('drive/MyDrive/DL_project/dataset_out/dataset')
# unzip_without_overwrite('drive/MyDrive/DL_project/dataset.7z','drive/MyDrive/DL_project/')

In [None]:
# check to see if we have good number of pictures
dir_path = r'drive/MyDrive/DL_project/dataset_out/dataset/img_resized'
# print(os.listdir(dir_path))
count = 0
for item in os.listdir(dir_path):
    # check if current path is a file
    count += 1
print('File count:', count)

File count: 70374


In [None]:
anns = json.load(open("drive/MyDrive/DL_project/dataset_out/dataset/MMHS150K_GT.json","r"))
print("Loaded anns: " + str(len(anns)))

majority_not_hate = 0
majority_hate = 0
majority_racist = 0
majority_sexist = 0
majority_homo = 0
majority_religion = 0
majority_other = 0

for k,v in anns.items():
    labels = []
    label_num = []
    # print(len(v["labels_str"]))
    for label in v["labels_str"]:
        if "Not" in label:
            label_num.append(0)
        elif "Racist" in label:
            label_num.append(1)
        elif "Sexist" in label:
            label_num.append(2)
        elif "Homo" in label:
            label_num.append(3)
        elif "Religion" in label:
            label_num.append(4)
        elif "Other" in label:
            label_num.append(5)
        else:
            print("Error with: " + label)
            label = "Error"

    if label_num.count(0) > 1:
        majority_not_hate+=1
    else:
        majority_hate+=1
        if label_num.count(1) > 1:
            majority_racist+=1
        elif label_num.count(2) > 1:
            majority_sexist+=1
        elif label_num.count(3) > 1:
            majority_homo+=1
        elif label_num.count(4) > 1:
            majority_religion+=1
        elif label_num.count(5) > 1:
            majority_other+=1

print("Total Tweets Majority Voting: Not Hate: " + str(majority_not_hate) + ", Hate: " + str(majority_hate) + ", Racist: " + str(majority_racist) + ", Sexist: " + str(majority_sexist) + ", Homophobe: " + str(majority_homo) + ", Religion: " + str(majority_religion) + ", Other: " + str(majority_other))

Loaded anns: 149823
Total Tweets Majority Voting: Not Hate: 112845, Hate: 36978, Racist: 11925, Sexist: 3495, Homophobe: 3870, Religion: 163, Other: 5811


In [74]:
image_path = os.path.join(dataset_path, 'dataset/img_resized')
text_path = os.path.join(dataset_path, 'dataset/img_txt')
GT_path = os.path.join(dataset_path, 'dataset/MMHS150K_GT.csv')

In [None]:
def majority_element(arr):
    count_zeros = arr.count(0)
    count_ones = arr.count(1)

    if count_zeros > count_ones:
        return 0
    elif count_ones > count_zeros:
        return 1
    else:
        return None  # No majority element

def hateful_or_not(labels):
    labels_list = labels.copy()
    for label_id in range(len(labels_list)):
        if labels_list[label_id] != 0:
            labels_list[label_id] = 1

    return majority_element(labels_list)

def create_csv_labels(json_file, csv_file):
    with open(json_file, 'r') as file:
        data = json.load(file)

    with open(csv_file, 'w', newline='') as csvfile:
        fieldnames = ['user_id', 'labels', 'hateful_label', 'text']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Iterate over each user ID in the JSON file
        for user_id, user_data in data.items():
            labels = user_data.get('labels', [])
            text = user_data.get('tweet_text', [])
            hateful_label = hateful_or_not(labels)
            # Write data to CSV file
            writer.writerow({'user_id': user_id, 'labels': labels, 'hateful_label': hateful_label, 'text': text})

In [None]:
# uncomment the first time
create_csv_labels(dataset_path + '/dataset/MMHS150K_GT.json', GT_path)

In [76]:
class MMHS_150KDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, GT_path, image_path, transform=None):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.GT_path = GT_path
        self.GT_data = pd.read_csv(GT_path)
        self.idx_list = []
        # self.root_dir = root_dir
        self.image_path = image_path
        # self.text_path = text_path
        self.transform = transform

        self.refine_images()

    def __len__(self):
        return self.len_samples

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        ID = self.GT_data.iloc[idx, 0]
        # txt_path = os.path.join(self.text_path, str(ID) + '.json').replace("\\","/")
        img_path = os.path.join(self.image_path, str(ID) + '.jpg').replace("\\","/")
        print(img_path)
        # print(txt_path)

        try:
          image = read_image(img_path)
        except:
          pass
        # f = open(text_path, 'r')
        # data = json.load(f)
        # text = data['img_text']
        # text = f.read()

        # text = read_file(txt_path)

        text = self.GT_data.iloc[idx, 3]

        label = self.GT_data.iloc[idx, 2]

        # sample = {'ID': ID, 'text': text, 'label': label}
        sample = {'ID': ID, 'text': text, 'image': image, 'label': label}

        if self.transform:
            sample = self.transform(sample)

        return sample

    def refine_images(self):
      GT_path_cleared = os.path.join(dataset_path, 'dataset/MMHS150K_GT_cleared.csv')
      cmpt = 0
      with open(GT_path, 'r') as readFile:
        lines = readFile.readlines()
        for row in lines[1:]:
          idx = row[0:19]
          img_path = os.path.join(self.image_path, str(idx) + '.jpg').replace("\\","/")
          # look if the image exists, otherwise delete the idx from the csv
          if not (os.path.isfile(img_path)):
            lines.remove(row)
            cmpt = cmpt + 1

      with open(GT_path_cleared, 'w') as writeFile:
        for line in lines:
          writeFile.write(line)

      self.len_samples = len(lines)
      self.GT_data =  pd.read_csv(GT_path_cleared)
      # print(f"nbr_missing = {cmpt}")
      # print(f"len = {len(self.GT_data.iloc[:, 0])}")
      # print(f"linelen = {len(lines)}")

# example = MMHS_150KDataset(GT_path, image_path)
# example[10000]

In [77]:
from torch.utils.data.sampler import SubsetRandomSampler

dataset = MMHS_150KDataset(GT_path, image_path)
batch_size = 16
validation_split = .1
test_split = .5 # corresponds to half ot the validation set
shuffle_dataset = True
random_seed= 42

# Creating data indices for training and validation splits:
dataset_size = len(dataset)
print(dataset_size)
indices = list(range(dataset_size))
validation_split = int(np.floor(validation_split * dataset_size))
test_split = int(np.floor(test_split * validation_split))
print(validation_split, test_split)
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices, test_indices = indices[validation_split:], indices[test_split:validation_split], indices[:test_split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)
test_sampler = SubsetRandomSampler(test_indices)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                sampler=valid_sampler)
test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                sampler=test_sampler)

70328
7032 3516


In [None]:
# class MMHS150KDataset(Dataset):
#     """MMHS150K dataset."""

#     def __init__(self, label_id_path, txt_path, images_path, transform=None):
#         """
#         Arguments:
#             label_id_path (string): Path to the MMHS150K_GT.json which contains
#                                     the IDs and labels corresponding
#             txt_path (string): Path to the text file with annotations.
#             images_path (string): Directory with all the images.
#             transform (callable, optional): Optional transform to be applied
#                 on a sample.
#         """
#         self.label_id_path = label_id_path
#         self.txt_path = txt_path
#         self.images_path = images_path
#         self.transform = transform