In [1]:
# imports
import os
import torch
import torch.nn as nn
import json
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image, read_file

In [2]:
dataset_path = ''

In [3]:
# check to see if we have good number of pictures
dir_path = r'dataset/img_resized'
# print(os.listdir(dir_path))
count = 0
for item in os.listdir(dir_path):
    # check if current path is a file
    count += 1
print('File count:', count)

File count: 150000


In [4]:
anns = json.load(open("dataset/MMHS150K_GT.json","r"))
print("Loaded anns: " + str(len(anns)))

majority_not_hate = 0
majority_hate = 0
majority_racist = 0
majority_sexist = 0
majority_homo = 0
majority_religion = 0
majority_other = 0

for k,v in anns.items():
    labels = []
    label_num = []
    # print(len(v["labels_str"]))
    for label in v["labels_str"]:
        if "Not" in label:
            label_num.append(0)
        elif "Racist" in label:
            label_num.append(1)
        elif "Sexist" in label:
            label_num.append(2)
        elif "Homo" in label:
            label_num.append(3)
        elif "Religion" in label:
            label_num.append(4)
        elif "Other" in label:
            label_num.append(5)
        else:
            print("Error with: " + label)
            label = "Error"

    if label_num.count(0) > 1:
        majority_not_hate+=1
    else:
        majority_hate+=1
        if label_num.count(1) > 1:
            majority_racist+=1
        elif label_num.count(2) > 1:
            majority_sexist+=1
        elif label_num.count(3) > 1:
            majority_homo+=1
        elif label_num.count(4) > 1:
            majority_religion+=1
        elif label_num.count(5) > 1:
            majority_other+=1

print("Total Tweets Majority Voting: Not Hate: " + str(majority_not_hate) + ", Hate: " + str(majority_hate) + ", Racist: " + str(majority_racist) + ", Sexist: " + str(majority_sexist) + ", Homophobe: " + str(majority_homo) + ", Religion: " + str(majority_religion) + ", Other: " + str(majority_other))

Loaded anns: 149823
Total Tweets Majority Voting: Not Hate: 112845, Hate: 36978, Racist: 11925, Sexist: 3495, Homophobe: 3870, Religion: 163, Other: 5811


In [5]:
image_path = os.path.join(dataset_path, 'dataset/img_resized')
text_path = os.path.join(dataset_path, 'dataset/img_txt')
GT_path = os.path.join(dataset_path, 'dataset/MMHS150K_GT.csv')

In [6]:
from preprocessing import create_csv_labels
# uncomment the first time
create_csv_labels(os.path.join(dataset_path, 'dataset/MMHS150K_GT.json'), GT_path)

In [11]:
class MMHS_150KDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, GT_path, image_path, transform=None):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.GT_path = GT_path
        self.GT_data = pd.read_csv(GT_path)
        self.idx_list = []
        # self.root_dir = root_dir
        self.image_path = image_path
        # self.text_path = text_path
        self.transform = transform

        self.refine_images()

    def __len__(self):
        return self.len_samples

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        ID = self.GT_data.iloc[idx, 0]
        # txt_path = os.path.join(self.text_path, str(ID) + '.json').replace("\\","/")
        img_path = os.path.join(self.image_path, str(ID) + '.jpg').replace("\\","/")
        print(img_path)
        # print(txt_path)

        try:
          image = read_image(img_path)
        except:
          pass
        # f = open(text_path, 'r')
        # data = json.load(f)
        # text = data['img_text']
        # text = f.read()

        # text = read_file(txt_path)

        text = self.GT_data.iloc[idx, 3]

        label = self.GT_data.iloc[idx, 2]

        # sample = {'ID': ID, 'text': text, 'label': label}
        sample = {'ID': ID, 'text': text, 'image': image, 'label': label}

        if self.transform:
            sample = self.transform(sample)

        return sample

    def refine_images(self):
      GT_path_cleared = os.path.join(dataset_path, 'dataset/MMHS150K_GT_cleared.csv')
      cmpt = 0
      with open(GT_path, 'r') as readFile:
        lines = readFile.readlines()
        for row in lines[1:]:
          idx = row[0:19]
          img_path = os.path.join(self.image_path, str(idx) + '.jpg').replace("\\","/")
          # look if the image exists, otherwise delete the idx from the csv
          if not (os.path.isfile(img_path)):
            lines.remove(row)
            cmpt = cmpt + 1

      with open(GT_path_cleared, 'w') as writeFile:
        for line in lines:
          writeFile.write(line)

      self.len_samples = len(lines)
      self.GT_data =  pd.read_csv(GT_path_cleared)
      # print(f"nbr_missing = {cmpt}")
      # print(f"len = {len(self.GT_data.iloc[:, 0])}")
      # print(f"linelen = {len(lines)}")

# example = MMHS_150KDataset(GT_path, image_path)
# example[1000]

dataset/img_resized/1110145718201651200.jpg


{'ID': 1110145718201651200,
 'text': "it's time to work out mah nigga❤️ <url>",
 'image': tensor([[[ 10,  10,   7,  ..., 178, 190, 189],
          [  7,   9,   7,  ..., 163, 178, 178],
          [  4,   7,   6,  ..., 174, 179, 181],
          ...,
          [ 17,  15,  14,  ..., 208, 217, 215],
          [ 19,  18,  17,  ..., 212, 211, 209],
          [ 24,  23,  21,  ..., 212, 203, 201]],
 
         [[ 32,  32,  31,  ..., 156, 166, 165],
          [ 29,  31,  31,  ..., 141, 154, 154],
          [ 26,  29,  30,  ..., 152, 155, 157],
          ...,
          [ 17,  15,  14,  ..., 150, 159, 157],
          [ 19,  18,  17,  ..., 154, 153, 151],
          [ 24,  23,  21,  ..., 154, 145, 143]],
 
         [[ 45,  45,  43,  ..., 145, 154, 153],
          [ 42,  44,  43,  ..., 130, 142, 142],
          [ 39,  42,  42,  ..., 141, 143, 145],
          ...,
          [ 17,  15,  14,  ...,  77,  86,  84],
          [ 19,  18,  17,  ...,  81,  80,  78],
          [ 24,  23,  21,  ...,  81,  72,  7

In [8]:
from torch.utils.data.sampler import SubsetRandomSampler

dataset = MMHS_150KDataset(GT_path, image_path)
batch_size = 16
validation_split = .1
test_split = .5 # corresponds to half ot the validation set
shuffle_dataset = True
random_seed= 42

# Creating data indices for training and validation splits:
dataset_size = len(dataset)
print(dataset_size)
indices = list(range(dataset_size))
validation_split = int(np.floor(validation_split * dataset_size))
test_split = int(np.floor(test_split * validation_split))
print(validation_split, test_split)
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices, test_indices = indices[validation_split:], indices[test_split:validation_split], indices[:test_split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)
test_sampler = SubsetRandomSampler(test_indices)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                sampler=valid_sampler)
test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                sampler=test_sampler)

149824
14982 7491


In [9]:
# class MMHS150KDataset(Dataset):
#     """MMHS150K dataset."""

#     def __init__(self, label_id_path, txt_path, images_path, transform=None):
#         """
#         Arguments:
#             label_id_path (string): Path to the MMHS150K_GT.json which contains
#                                     the IDs and labels corresponding
#             txt_path (string): Path to the text file with annotations.
#             images_path (string): Directory with all the images.
#             transform (callable, optional): Optional transform to be applied
#                 on a sample.
#         """
#         self.label_id_path = label_id_path
#         self.txt_path = txt_path
#         self.images_path = images_path
#         self.transform = transform