In [1]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import Dataset
import os
from PIL import Image
import pandas as pd
from torch.utils.data.sampler import WeightedRandomSampler
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score 

from preprocessing import create_csv_labels
from custom_dataset import CustomDataset
from vilbert_adapt import CustomBert
from utils import *

In [2]:
on_colab = False
create_csv = False

# step 1: preprocessing and data loading

In [3]:
if on_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    dataset_path = 'drive/MyDrive/DL_project'
else:
    dataset_path = ''

# Load dataset
image_path = os.path.join(dataset_path, 'dataset/img_resized')
img_text_path = os.path.join(dataset_path, 'dataset/img_txt')
json_path = os.path.join(dataset_path, 'dataset/MMHS150K_GT.json')
GT_path = os.path.join(dataset_path, 'dataset/MMHS150K_Custom.csv')

In [4]:
# Create cleaned csv file
if create_csv:
    filename = os.path.join(dataset_path, "dataset/MMHS1150K_Custom.csv")
    create_csv_labels(json_path, filename, img_text_path)
    GT_path = filename

In [4]:
# Define transformations for image preprocessing
data_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalizing using ImageNet statistics
])

dataset = CustomDataset(GT_path, image_path, img_text_path, transform=data_transforms)




In [5]:
# visual inspection
# dataset[3]

now


(tensor([  101,  1026, 23325, 15900,  1028, 24761,  6508,  6904, 13871,  4140,
          1026, 24471,  2140,  1028,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [7]:
# # Define hyperparameters -------------------------------------------------------
# import numpy as np
# batch_size = 5

# # ------------------------------------------------------------------------------

# # Split dataset into training, validation, and test sets
# dataset_size = len(dataset)
# print(dataset_size)
# train_set, test_set, val_set = torch.utils.data.dataset.random_split(dataset, [0.8, 0.1, 0.1])
# # train_set = np.floor(train_set)
# # test_set = round(test_set)
# # val_set = round(val_set)

# print(train_set) 
# print(test_set) 
# print(val_set)

# # Create data loader for training set
# not_hate_indices = []
# hate_indices = []
# for idx in range(len(train_set)):
#     if train_set[idx][5] == 1:
#         hate_indices.append(idx)
#     else:
#         not_hate_indices.append(idx)

# num_not_hate = len(not_hate_indices)
# num_hate = len(hate_indices)
# total_samples = num_not_hate + num_hate

# # Create a WeightedRandomSampler to balance the training dataset
# class_weights = [1-num_hate/total_samples, 1-num_not_hate/ total_samples]  # Inverse of number of samples per class

# weights = []
# for idx in range(len(train_set)):
#     try:
#         label = dataset[idx][5]
#         according_weights = class_weights[int(label)]
#         weights.append(according_weights)
#     except:
#         print(f"Error with idx: {idx}")
#         print(f"Label: {dataset[idx][5]}")

# # weights = [class_weights[int(dataset[idx]['label'])] for idx in train_indices]
# sampler = WeightedRandomSampler(weights, len(weights))

# # Create data loader for balanced training set
# train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, sampler=sampler)

# # Create data loaders for validation and test sets
# validation_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)
# test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)


6
<torch.utils.data.dataset.Subset object at 0x00000285C7B86110>
<torch.utils.data.dataset.Subset object at 0x00000285C7D5FAD0>
<torch.utils.data.dataset.Subset object at 0x00000285C7D5C890>
now
now
now




now
now
now
now
now
now
now
here


In [9]:
import os
import shutil
import torch
from torch.utils.data import Dataset, DataLoader, random_split, WeightedRandomSampler
import json

# Define hyperparameters
batch_size = 5
dataset_path = ''  # Set your dataset path here

image_path = os.path.join(dataset_path, 'dataset/img_resized')
split_save_path = os.path.join(dataset_path, 'dataset/splits')
os.makedirs(split_save_path, exist_ok=True)

# Assuming a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Load your dataset
dataset = [os.path.join(image_path, fname) for fname in os.listdir(image_path)]
labels = [0 if 'not_hate' in fname else 1 for fname in os.listdir(image_path)]
dataset = list(zip(dataset, labels))

dataset_size = len(dataset)
print(f"Dataset size: {dataset_size}")

# Split dataset into training, validation, and test sets
train_size = int(0.8 * dataset_size)
val_size = int(0.1 * dataset_size)
test_size = dataset_size - train_size - val_size
train_set, val_set, test_set = random_split(dataset, [train_size, val_size, test_size])

# Function to move images to respective directories
def move_images(dataset_split, split_name):
    split_dir = os.path.join(dataset_path, split_name)
    os.makedirs(split_dir, exist_ok=True)
    for img_path, label in dataset_split:
        shutil.copy(img_path, split_dir)

# Move images to respective directories
move_images(train_set, 'train')
move_images(val_set, 'val')
move_images(test_set, 'test')

# Save the dataset splits in a serializable format
def save_split(dataset_split, filename):
    serializable_split = [(img_path, int(label)) for img_path, label in dataset_split]
    with open(os.path.join(split_save_path, filename), 'w') as f:
        json.dump(serializable_split, f)

save_split(train_set, 'train_set.json')
save_split(val_set, 'val_set.json')
save_split(test_set, 'test_set.json')

# Load the dataset splits
def load_split(filename):
    with open(os.path.join(split_save_path, filename), 'r') as f:
        loaded_split = json.load(f)
    return [(img_path, label) for img_path, label in loaded_split]

train_set = load_split('train_set.json')
val_set = load_split('val_set.json')
test_set = load_split('test_set.json')

# Create data loaders for the splits
class CustomImageDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, label = self.data[idx]
        # Here, you would load and process the image
        # For now, return a placeholder tuple (img_path, label)
        return img_path, label

def create_dataloader(dataset_split, batch_size, shuffle=True, sampler=None):
    dataset = CustomImageDataset(dataset_split)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, sampler=sampler)

train_loader = create_dataloader(train_set, batch_size)
validation_loader = create_dataloader(val_set, batch_size)
test_loader = create_dataloader(test_set, batch_size)

# Create a WeightedRandomSampler to balance the training dataset
not_hate_indices = [i for i, (_, label) in enumerate(train_set) if label == 0]
hate_indices = [i for i, (_, label) in enumerate(train_set) if label == 1]

num_not_hate = len(not_hate_indices)
num_hate = len(hate_indices)
total_samples = num_not_hate + num_hate

class_weights = [1 - num_hate / total_samples, 1 - num_not_hate / total_samples]  # Inverse of number of samples per class
weights = [class_weights[label] for _, label in train_set]
sampler = WeightedRandomSampler(weights, len(weights))

# Save weights
with open(os.path.join(split_save_path, 'weights.json'), 'w') as f:
    json.dump(weights, f)

# Load weights
with open(os.path.join(split_save_path, 'weights.json'), 'r') as f:
    weights = json.load(f)

sampler = WeightedRandomSampler(weights, len(weights))

# Create data loader for balanced training set
train_loader_balanced = DataLoader(CustomImageDataset(train_set), batch_size=batch_size, sampler=sampler)

# Use train_loader_balanced for training

Dataset size: 150000


In [13]:
import os
import shutil
import pandas as pd

# Define paths
dataset_path = ''  # Update this path
train_folder = os.path.join(dataset_path, 'test')
csv_path = os.path.join(dataset_path, 'dataset/MMHS150K_Custom.csv')

# Create directories for hate and not_hate
hate_folder = os.path.join(train_folder, 'hate')
not_hate_folder = os.path.join(train_folder, 'not_hate')
os.makedirs(hate_folder, exist_ok=True)
os.makedirs(not_hate_folder, exist_ok=True)

# Read the CSV file
df = pd.read_csv(csv_path)

# Convert image IDs to the format they are saved with (i.e., add '.jpg')
image_names_in_csv = {f"{str(image_id)}.jpg" for image_id in df['user_id'].values}

# Iterate over all images in the train folder
for image_name in os.listdir(train_folder):
    # Skip the hate and not_hate folders if they exist in train_folder
    if image_name in ['hate', 'not_hate']:
        continue

    image_path = os.path.join(train_folder, image_name)

    if image_name in image_names_in_csv:
        # Get the label for the image from the CSV (by removing the '.jpg' part)
        image_id = image_name[:-4]  # Removing '.jpg'
        label = df[df['user_id'] == int(image_id)]['hateful_label'].values[0]
        print(label)

        # Move the image to the corresponding folder
        if label == 1:
            shutil.move(image_path, os.path.join(hate_folder, image_name))
        else:
            shutil.move(image_path, os.path.join(not_hate_folder, image_name))
    else:
        # Remove the image if it is not in the CSV
        os.remove(image_path)
        print(f"Removed image {image_name} as it is not in the CSV file.")

print("Processing complete: Images have been moved to 'hate' and 'not_hate' folders or removed if not listed in the CSV.")

1
Removed image 1023950570598686720.jpg as it is not in the CSV file.
1
0
Removed image 1023983638105477120.jpg as it is not in the CSV file.
1
Removed image 1024002338103517193.jpg as it is not in the CSV file.
1
Removed image 1024013508554326017.jpg as it is not in the CSV file.
Removed image 1024020103212552198.jpg as it is not in the CSV file.
Removed image 1024020887002132480.jpg as it is not in the CSV file.
1
Removed image 1024030883723001856.jpg as it is not in the CSV file.
0
0
Removed image 1024040183421972481.jpg as it is not in the CSV file.
0
Removed image 1024059571793526786.jpg as it is not in the CSV file.
Removed image 1024061302497529857.jpg as it is not in the CSV file.
Removed image 1024072353557487617.jpg as it is not in the CSV file.
Removed image 1024073949972504576.jpg as it is not in the CSV file.
0
Removed image 1024083507558019073.jpg as it is not in the CSV file.
1
Removed image 1024090834541727745.jpg as it is not in the CSV file.
Removed image 102409157572

# step 2: Model building

In [1]:
!pip install ultralytics
!pip install comet_ml

Collecting ultralytics
  Downloading ultralytics-8.2.18-py3-none-any.whl.metadata (40 kB)
     ---------------------------------------- 0.0/40.7 kB ? eta -:--:--
     ---------- ----------------------------- 10.2/40.7 kB ? eta -:--:--
     -------------------------------------- 40.7/40.7 kB 324.0 kB/s eta 0:00:00
Collecting py-cpuinfo (from ultralytics)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)
Collecting thop>=0.1.1 (from ultralytics)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Collecting seaborn>=0.11.0 (from ultralytics)
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading ultralytics-8.2.18-py3-none-any.whl (757 kB)
   ---------------------------------------- 0.0/757.2 kB ? eta -:--:--
   -------- ------------------------------- 153.6/757.2 kB 4.6 MB/s eta 0:00:01
   ---------------------------------------- 757.2/757.2 kB 9.6 MB/s eta 0:00:00
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Downl


[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting comet_ml
  Downloading comet_ml-3.42.0-py3-none-any.whl.metadata (4.0 kB)
Collecting everett<3.2.0,>=1.0.1 (from everett[ini]<3.2.0,>=1.0.1->comet_ml)
  Downloading everett-3.1.0-py2.py3-none-any.whl.metadata (17 kB)
Collecting python-box<7.0.0 (from comet_ml)
  Downloading python_box-6.1.0-cp311-cp311-win_amd64.whl.metadata (7.8 kB)
Collecting requests-toolbelt>=0.8.0 (from comet_ml)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting semantic-version>=2.8.0 (from comet_ml)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting sentry-sdk>=1.1.0 (from comet_ml)
  Downloading sentry_sdk-2.2.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting simplejson (from comet_ml)
  Downloading simplejson-3.19.2-cp311-cp311-win_amd64.whl.metadata (3.2 kB)
Collecting wurlitzer>=1.0.2 (from comet_ml)
  Downloading wurlitzer-3.1.0-py3-none-any.whl.metadata (2.5 kB)
Collecting dulwich!=0.20.33,>=0.20.6 (from comet_ml)
  Downl


[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
DATA_DIR = 'data/hateful_dataset'  

In [3]:
import os
from ultralytics import YOLO
import comet_ml
from tqdm.notebook import tqdm
import ultralytics
ultralytics.checks()

comet_ml.init(project_name="train_comet")
exp = comet_ml.Experiment(project_name="coin_300epochs_imgsz324Purple")

#Load Model
# model = YOLO('/content/gdrive/MyDrive/ImageAnalysis/yolov8-cls.yaml')  ## Builds a New Model from Scratch
model = YOLO('DL_Project/yolov8-cls.yaml')

#Use Model
results = model.train(data = DATA_DIR, epochs = 1, imgsz = 324)   ## Train the Model

!scp -r /content/runs '/content/drive/MyDrive/ImageAnalysis/training_results'   ## For saving the results on Google Drive - Adjust based on the path

exp.end()

KeyboardInterrupt: 

# step 4: Model evaluation