In [10]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import Dataset
import os
from PIL import Image
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split, WeightedRandomSampler
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score 
import shutil
import pandas as pd
import json

# Step 1: Preprocessing and Data Loading

In [18]:
# --- Run configuration -------------------------------------------------------
batch_size = 5  # samples per DataLoader batch
# Fractions of the dataset assigned to the train / validation / test splits.
train_split, val_split, test_split = 0.8, 0.1, 0.1
dataset_path = ''  # Set your dataset path here

# Every input and output location is resolved relative to `dataset_path`.
image_path, img_text_path, json_path, GT_path, split_save_path = (
    os.path.join(dataset_path, relative_location)
    for relative_location in (
        'dataset/img_resized',          # directory holding the "<id>.jpg" images
        'dataset/img_txt',
        'dataset/MMHS150K_GT.json',
        'dataset/MMHS150K_Custom.csv',  # CSV with `user_id` / `hateful_label` columns
        'dataset/splits',               # where the split JSON files are written
    )
)
# Make sure the folder that receives the saved split files exists.
os.makedirs(split_save_path, exist_ok=True)

# Custom Dataset Class
class CustomDataset(Dataset):
    """Map-style dataset that yields ``(image file path, label)`` pairs.

    The image itself is never opened here; ``__getitem__`` only builds the
    path ``<image_dir>/<image id>.jpg`` and passes the label through.
    """

    def __init__(self, data, image_dir):
        # `data`: indexable collection of (image id, label) pairs.
        # `image_dir`: directory that the returned paths are rooted in.
        self.image_dir = image_dir
        self.data = data

    def __len__(self):
        """Number of (image id, label) pairs in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(path to "<image id>.jpg", label)`` for the item at ``idx``."""
        stem, label = self.data[idx]
        file_name = f"{stem!s}.jpg"  # ids may be ints, so convert via str()
        return os.path.join(self.image_dir, file_name), label

# Load the ground-truth CSV; `user_id` is used as the image file stem and
# `hateful_label` as the class label.
df = pd.read_csv(GT_path)
image_filenames = df['user_id'].tolist()
labels = df['hateful_label'].tolist()
dataset = list(zip(image_filenames, labels))

dataset_size = len(dataset)
print(f"Dataset size: {dataset_size}")

# Split dataset into training, validation, and test sets.
# The test set takes whatever is left after the two truncated sizes so the
# three lengths always add up to `dataset_size`.
train_size = int(train_split * dataset_size)
val_size = int(val_split * dataset_size)
test_size = dataset_size - train_size - val_size

# Seed the split: without a fixed generator every run produces a different
# partition, so re-running the notebook would overwrite the saved split files
# and copy a different set of images into the train/val/test folders.
split_seed = 42
train_set, val_set, test_set = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Save the dataset splits in a serializable format
def save_split(dataset_split, filename):
    """Write one split to ``split_save_path/filename`` as JSON.

    Each ``(image id, label)`` item of ``dataset_split`` is stored as a pair
    whose label has been cast to a plain ``int`` before dumping.
    """
    records = []
    for image_id, label in dataset_split:
        records.append((image_id, int(label)))

    target = os.path.join(split_save_path, filename)
    with open(target, 'w') as handle:
        json.dump(records, handle)

# Persist all three splits, one JSON file each.
for _split_items, _json_name in (
    (train_set, 'train_set.json'),
    (val_set, 'val_set.json'),
    (test_set, 'test_set.json'),
):
    save_split(_split_items, _json_name)

# Load the dataset splits
def load_split(filename):
    """Read ``split_save_path/filename`` (JSON) and return a list of ``(image id, label)`` tuples."""
    source = os.path.join(split_save_path, filename)
    with open(source, 'r') as handle:
        entries = json.load(handle)

    # JSON has no tuple type, so every pair comes back as a list; rebuild tuples.
    pairs = []
    for image_id, label in entries:
        pairs.append((image_id, label))
    return pairs

# Replace the in-memory splits with the versions read back from disk.
train_set, val_set, test_set = (
    load_split(json_name)
    for json_name in ('train_set.json', 'val_set.json', 'test_set.json')
)

# Create data loaders for the splits
def create_dataloader(dataset_split, image_dir, batch_size, shuffle=True, sampler=None):
    """Wrap a split in a ``CustomDataset`` and return a ``DataLoader`` over it.

    Args:
        dataset_split: indexable collection of ``(image id, label)`` pairs.
        image_dir: directory the dataset builds image paths from.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the data. Ignored (treated as
            ``False``) when ``sampler`` is given.
        sampler: optional sampler that decides the order in which samples are drawn.

    Returns:
        A ``DataLoader`` over ``CustomDataset(dataset_split, image_dir)``.
    """
    if sampler is not None:
        # DataLoader treats `sampler` and `shuffle=True` as mutually exclusive
        # and raises ValueError; since `shuffle` defaults to True here, passing
        # only a sampler would otherwise always fail.
        shuffle = False
    wrapped = CustomDataset(dataset_split, image_dir)
    return DataLoader(wrapped, batch_size=batch_size, shuffle=shuffle, sampler=sampler)

# Training batches are shuffled (helper default); the evaluation loaders keep a
# fixed order so validation/test passes iterate the data identically each time.
train_loader = create_dataloader(train_set, image_path, batch_size)
validation_loader = create_dataloader(val_set, image_path, batch_size, shuffle=False)
test_loader = create_dataloader(test_set, image_path, batch_size, shuffle=False)

# Calculate class weights for the training set
total_counts = [label for _, label in train_set]  # the training labels themselves
total_not_hate = total_counts.count(0)
total_hate = total_counts.count(1)
total_samples = len(train_set)

# Fail with a clear message instead of a bare ZeroDivisionError below.
if total_not_hate == 0 or total_hate == 0:
    raise ValueError(
        "Training split must contain both classes to compute class weights "
        f"(not_hate={total_not_hate}, hate={total_hate})."
    )

# Inverse of the frequency of each class (index 0 = not hate, index 1 = hate)
class_weights = [total_samples / total_not_hate, total_samples / total_hate]

# One sampling weight per training item, looked up by its label
weights_train = [class_weights[label] for _, label in train_set]
sampler_train = WeightedRandomSampler(weights_train, len(weights_train))

# Create data loader for balanced training set. It goes through the same
# helper as the other loaders; shuffle is passed as False explicitly because a
# DataLoader cannot combine a sampler with shuffle=True.
train_loader_balanced = create_dataloader(
    train_set, image_path, batch_size, shuffle=False, sampler=sampler_train
)

# Copy images into their per-split directories (the source files are left in place)
def move_images(dataset_split, split_name):
    """Copy every image of a split into ``<dataset_path>/dataset/<split_name>``.

    Despite the name, files are copied with ``shutil.copy``; the originals in
    ``image_path`` are left untouched. The destination directory is created
    when it does not exist yet.
    """
    destination = os.path.join(dataset_path, 'dataset', split_name)
    os.makedirs(destination, exist_ok=True)
    for stem, _label in dataset_split:
        source_file = os.path.join(image_path, f"{stem!s}.jpg")  # ids are stored without extension
        shutil.copy(source_file, destination)

# Populate the train/val/test folders now that the splits and samplers exist
for _split_items, _split_label in (
    (train_set, 'train'),
    (val_set, 'val'),
    (test_set, 'test'),
):
    move_images(_split_items, _split_label)

Dataset size: 59245


In [None]:
dataset_path = ''  # Update this path
# NOTE(review): despite its name, `train_folder` currently points at the 'val'
# folder; presumably this cell is re-run once per split with the folder name
# edited by hand — confirm before relying on it.
train_folder = os.path.join(dataset_path, 'val')
csv_path = os.path.join(dataset_path, 'dataset/MMHS150K_Custom.csv')

# Create directories for hate and not_hate
hate_folder = os.path.join(train_folder, 'hate')
not_hate_folder = os.path.join(train_folder, 'not_hate')
os.makedirs(hate_folder, exist_ok=True)
os.makedirs(not_hate_folder, exist_ok=True)

# Read the CSV file
df = pd.read_csv(csv_path)

# Build the "<id>.jpg" -> label lookup once, instead of filtering the whole
# DataFrame again for every image. `setdefault` keeps the first row when an id
# appears more than once.
label_by_image_name = {}
for _image_id, _image_label in zip(df['user_id'].values, df['hateful_label'].values):
    label_by_image_name.setdefault(f"{str(_image_id)}.jpg", _image_label)

# Image IDs in the format they are saved with (i.e., with '.jpg' appended)
image_names_in_csv = set(label_by_image_name)

# Iterate over all images in the train folder
for image_name in os.listdir(train_folder):
    # Skip the hate and not_hate folders if they exist in train_folder
    if image_name in ['hate', 'not_hate']:
        continue

    # Named `image_file` on purpose: reusing `image_path` here would overwrite
    # the image-directory variable that the data-loading cell relies on.
    image_file = os.path.join(train_folder, image_name)

    # Leave any other sub-directory alone (os.remove cannot delete a directory).
    if os.path.isdir(image_file):
        continue

    if image_name in image_names_in_csv:
        label = label_by_image_name[image_name]

        # Move the image to the corresponding folder
        if label == 1:
            shutil.move(image_file, os.path.join(hate_folder, image_name))
        else:
            shutil.move(image_file, os.path.join(not_hate_folder, image_name))
    else:
        # WARNING: destructive — files not listed in the CSV are deleted from disk.
        os.remove(image_file)
        print(f"Removed image {image_name} as it is not in the CSV file.")

print("Processing complete: Images have been moved to 'hate' and 'not_hate' folders or removed if not listed in the CSV.")

# Step 2: Model Building

In [21]:
!pip install ultralytics




[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
from pathlib import Path

# Root folder handed to the trainer as `data`. Set the HATEFUL_DATASET_DIR
# environment variable to point at the dataset on another machine; the original
# author's local path is kept as the fallback so existing runs are unaffected.
DATA_DIR = Path(
    os.environ.get(
        'HATEFUL_DATASET_DIR',
        r'C:\Users\Sama Wael\OneDrive\Desktop\Epfl\MA2\Deep Learning\DL_Project\data\hateful_dataset',
    )
)

In [23]:
import ultralytics
from ultralytics import YOLO

# Print the Ultralytics environment report (version, Python/torch, hardware).
ultralytics.checks()

# Load Model
# NOTE(review): the model is created from a `.yaml` spec rather than a weights
# file — presumably this trains from scratch; confirm that is intended.
model = YOLO('yolov8-cls.yaml')

# Train the Model on the folder in DATA_DIR: 15 epochs on 64-pixel inputs.
train_options = dict(data=DATA_DIR, epochs=15, imgsz=64)
results = model.train(**train_options)

Ultralytics YOLOv8.2.18  Python-3.11.5 torch-2.3.0+cpu CPU (Intel Core(TM) i9-10980HK 2.40GHz)
Setup complete  (16 CPUs, 31.8 GB RAM, 934.8/952.5 GB disk)
YOLOv8-cls summary: 99 layers, 2719288 parameters, 2719288 gradients, 4.4 GFLOPs
Ultralytics YOLOv8.2.18  Python-3.11.5 torch-2.3.0+cpu CPU (Intel Core(TM) i9-10980HK 2.40GHz)
[34m[1mengine\trainer: [0mtask=classify, mode=train, model=yolov8-cls.yaml, data=C:\Users\Sama Wael\OneDrive\Desktop\Epfl\MA2\Deep Learning\DL_Project\data\hateful_dataset, epochs=15, time=None, patience=100, batch=16, imgsz=64, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=train11, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=





[34m[1mtrain: [0mScanning C:\Users\Sama Wael\OneDrive\Desktop\Epfl\MA2\Deep Learning\DL_Project\data\hateful_dataset\train... 47396 images, 0 corrupt: 100%|██████████| 47396/47396 [00:00<?, ?it/s]
[34m[1mval: [0mScanning C:\Users\Sama Wael\OneDrive\Desktop\Epfl\MA2\Deep Learning\DL_Project\data\hateful_dataset\val... 5924 images, 0 corrupt: 100%|██████████| 5924/5924 [00:00<?, ?it/s]

[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m SGD(lr=0.01, momentum=0.9) with parameter groups 26 weight(decay=0.0), 27 weight(decay=0.0005), 27 bias(decay=0.0)
Image sizes 64 train, 64 val
Using 0 dataloader workers
Logging results to [1mruns\classify\train11[0m
Starting training for 15 epochs...

      Epoch    GPU_mem       loss  Instances       Size



       1/15         0G     0.6158         16         64:  18%|█▊        | 528/2963 [01:36<08:04,  5.02it/s]