# Shoes images clustering

The goal of this notebook is to group shoes shown in pictures into large clusters by their style, with the help of the crowd.

We are going to use a simple dataset consisting of 82 images from the Toloka tutorial (Drutsa et al. 2020).

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Toloka/crowdclustering/blob/main/shoes_clustering.ipynb)

Prepare environment and import all we'll need.

In [None]:
!pip3 install toloka-kit==0.1.26
!pip3 install pandas
!pip3 install ipyplot

In [None]:
import os
import time
import logging

from datetime import datetime, timedelta
from sys import stdout
from time import sleep
from typing import Dict, List, Optional, Set
from random import sample, shuffle
from getpass import getpass

import ipyplot
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from PIL import Image
from urllib.request import urlopen

import toloka.client as toloka

from crowdkit.aggregation import DawidSkene
import json
from collections import Counter

from exam_check import exam_check
from calculate_quality import calculate_quality
from crowd_clustering_aggregation import clustering_aggregation, Prior, AggregationAssignment

logging.basicConfig(
    format='[%(levelname)s] %(name)s: %(message)s',
    level=logging.INFO,
    stream=stdout,
)

Create a toloka-client instance. All API calls will go through it. More about the OAuth token in the Toloka-Kit [Learn the basics example](https://github.com/Toloka/toloka-kit/tree/main/examples/0.getting_started/0.learn_the_basics)

In [None]:
# Build the API client from an interactively entered OAuth token (getpass keeps it out of the notebook output);
# the second argument selects the Toloka environment.
toloka_client = toloka.TolokaClient(getpass('Enter your token:'), 'PRODUCTION')  # Or switch to 'SANDBOX'
# Log the requester returned by the API for this token.
logging.info(toloka_client.get_requester())

## Creating Training and Exam project

As our task is rather uncommon for crowdsourcing platforms we need our workers to train before proceeding to main pools. So, this is how it is going to work:

* We create a Training and Exam project with a training pool for workers to train and a main pool to check workers' performance;
* After training, workers are going to pass a main pool at least 3 times as an exam;
* After the first exam, a worker is assigned the Shoes Clustering Skill; the skill value is calculated as the percentage of correctly chosen clusters divided by the number of exams expected (3 in our case);
* Only those workers who have been assigned the Shoes Clustering Skill with a 100% skill value are allowed to work on the main project.

Let's start by creating the Training and Exam project from a JSON file with the help of auxiliary functions:

In [None]:
def create_project_from_file(project_config_path: str, create: bool = True) -> toloka.Project:
    """Load a project definition from a JSON file and optionally register it.

    Args:
        project_config_path: Path to the JSON file describing the project.
        create: When true, the project is sent to Toloka through the
            module-level ``toloka_client`` and the client's result is
            returned; when false, the locally parsed object is returned.

    Returns:
        The created project (``create=True``) or the parsed, not yet
        created project (``create=False``).
    """
    with open(project_config_path) as config_file:
        parsed_project = toloka.Project.from_json(config_file.read())

    if not create:
        return parsed_project
    return toloka_client.create_project(parsed_project)

In [None]:
!git clone https://github.com/Toloka/crowdclustering.git

In [None]:
cd crowdclustering

In [None]:
training_exam_project = create_project_from_file('configs/shoes/training_exam/project.json')

Then we create the training and add training tasks:

In [None]:
def create_training_from_file(training_config_path: str, project_id: str, create: bool = True) -> toloka.Training:
    """Load a training definition from a JSON file and attach it to a project.

    Args:
        training_config_path: Path to the JSON file describing the training.
        project_id: ID written into the training's ``project_id`` field.
        create: When true, the training is sent to Toloka through the
            module-level ``toloka_client``; when false, the locally parsed
            object is returned.

    Returns:
        The created training (``create=True``) or the parsed, not yet
        created training (``create=False``).
    """
    with open(training_config_path) as config_file:
        parsed_training = toloka.Training.from_json(config_file.read())
    parsed_training.project_id = project_id

    return toloka_client.create_training(parsed_training) if create else parsed_training

In [None]:
training = create_training_from_file('configs/shoes/training_exam/training.json', training_exam_project.id)

In [None]:
def create_tasks_from_directory(training_tasks_path: str, training_id: str,
                                create: bool = True) -> List[toloka.Task]:
    """Load every ``*.json`` task in a directory and assign it to a pool.

    Args:
        training_tasks_path: Directory that is scanned (non-recursively)
            for task JSON files.
        training_id: ID written into each task's ``pool_id`` field.
        create: When true, the tasks are uploaded through the module-level
            ``toloka_client`` and the result of ``create_tasks`` is
            returned; when false, the list of parsed tasks is returned.

    Returns:
        The result of ``toloka_client.create_tasks`` (``create=True``) or
        the list of parsed tasks (``create=False``).
    """
    loaded_tasks = []

    for entry_name in os.listdir(training_tasks_path):
        entry_path = os.path.join(training_tasks_path, entry_name)
        # Skip sub-directories and anything that is not a JSON config.
        if not (os.path.isfile(entry_path) and entry_path.endswith('.json')):
            continue

        with open(entry_path) as task_file:
            parsed_task = toloka.Task.from_json(task_file.read())
        parsed_task.pool_id = training_id
        loaded_tasks.append(parsed_task)

    if not create:
        return loaded_tasks
    return toloka_client.create_tasks(loaded_tasks)

In [None]:
tasks = create_tasks_from_directory('configs/shoes/training_exam/training_tasks/', training.id)

And finally create pool with tasks to be an exam for workers:

In [None]:
def create_pool_from_file(pool_config_path: str, project_id: str,
                          training_id: Optional[str] = None, create: bool = True) -> toloka.Pool:
    """Load a pool definition from a JSON file and bind it to a project.

    Args:
        pool_config_path: Path to the JSON file describing the pool.
        project_id: ID written into the pool's ``project_id`` field.
        training_id: If given, written into the pool's training requirement
            (``quality_control.training_requirement.training_pool_id``).
        create: When true, the pool is sent to Toloka through the
            module-level ``toloka_client``; when false, the locally
            prepared object is returned.

    Returns:
        The created pool (``create=True``) or the prepared, not yet created
        pool (``create=False``).
    """
    with open(pool_config_path) as config_file:
        prepared_pool = toloka.Pool.from_json(config_file.read())

    prepared_pool.project_id = project_id
    if training_id is not None:
        prepared_pool.quality_control.training_requirement.training_pool_id = training_id
    # The expiry date from the config is replaced: the pool expires a week from now.
    prepared_pool.will_expire = datetime.now() + timedelta(days=7)

    return toloka_client.create_pool(prepared_pool) if create else prepared_pool

In [None]:
exam_pool = create_pool_from_file('configs/shoes/training_exam/exam_pool.json', training_exam_project.id, training.id)

In [None]:
def create_task_suites_from_directory(task_suites_path: str, pool_id: str) -> List[toloka.TaskSuite]:
    """Load every ``*.json`` task suite in a directory and upload it to a pool.

    Args:
        task_suites_path: Directory that is scanned (non-recursively) for
            task-suite JSON files.
        pool_id: ID written into each task suite's ``pool_id`` field.

    Returns:
        The result of ``toloka_client.create_task_suites`` for the loaded
        task suites (called with ``allow_defaults=True``).
    """
    task_suites = []

    for task_suite_file_name in os.listdir(task_suites_path):
        task_suite_file_path = os.path.join(task_suites_path, task_suite_file_name)
        # Skip sub-directories and anything that is not a JSON config.
        if not (os.path.isfile(task_suite_file_path) and task_suite_file_path.endswith('.json')):
            continue

        with open(task_suite_file_path) as task_suite_config_file:
            json_string = task_suite_config_file.read()

        # The files describe task suites, so they are parsed as TaskSuite
        # (not Task) to match what create_task_suites expects.
        task_suite = toloka.TaskSuite.from_json(json_string)
        task_suite.pool_id = pool_id
        task_suites.append(task_suite)

    return toloka_client.create_task_suites(task_suites, allow_defaults=True)

In [None]:
task_suites = create_task_suites_from_directory('configs/shoes/training_exam/exam_tasks/', exam_pool.id)

Last thing we do on this Training and Exam project preparation is skill creation:

In [None]:
skill = toloka_client.create_skill(name='Shoes clustering')

Our Training and Exam project is ready! But we won't open the training and exam pools yet, as we have a paid exam. It is recommended to open paid exams exactly when they are needed and close them when they aren't needed anymore.

## Creating a main project

Let's proceed on creating a main project.

In [None]:
main_project = create_project_from_file('configs/shoes/project/project.json')

In [None]:
main_pool = create_pool_from_file('configs/shoes/project/pool.json', main_project.id, create=False)

There is one thing we have to do manually: in a worker's filter we need to change skill id to the one we've created

In [None]:
# Point the first worker-filter condition at the skill created above.
# NOTE(review): the [0]/[0] and [3]/[0] indices are tied to the layout of pool.json — confirm if that config changes.
main_pool.filter.and_[0].or_[0].key=skill.id
# Same substitution for the quality-control rule whose action takes a skill_id parameter.
main_pool.quality_control.configs[3].rules[0].action.parameters.skill_id = skill.id
# Send the patched pool to Toloka and keep the returned object.
main_pool=toloka_client.create_pool(main_pool)

Now we create task suites:

In [None]:
# Load the dataset; the first column of every row is used as the image identifier.
# NOTE(review): the file has a .tsv extension but is read with read_csv's default separator and
# header handling — confirm it is a single-column file with a header row.
data = pd.read_csv('configs/shoes/project/dataset.tsv')
images = set(x[0] for x in data.values)

objects_per_hit_number = 10 # number of objects in each HIT
objects_number = len(images) # number of objects
expecting_hits = min(objects_per_hit_number, 21) # expecting number of HITs to which a data item belongs

# Number of "seed" images per HIT (at least 1); the rest of the HIT is filled with random images.
initial_items = max(objects_per_hit_number // expecting_hits, 1)
# Total number of HITs to generate.
unique_hits = objects_number * expecting_hits // objects_per_hit_number
print('Initial items:', initial_items, '\nUnique HITs:', unique_hits)

In [None]:
def hit_samples(images: Set[str], unique_hits: int, objects_number: int,
                objects_per_hit_number: int, initial_items: int) -> List[np.ndarray]:
    """Build HITs by splitting the images into groups and padding each group.

    The images are split into ``min(objects_number, unique_hits)`` nearly
    equal groups. Each group is then extended with
    ``objects_per_hit_number - initial_items`` images drawn at random
    (without replacement) from the images that are not already in it.

    Args:
        images: Set of all image identifiers.
        unique_hits: Requested number of HITs.
        objects_number: Total number of objects; caps the number of HITs.
        objects_per_hit_number: Target number of images in one HIT.
        initial_items: Number of images each HIT is expected to start with;
            used only to compute how many random images are added.

    Returns:
        A list with one numpy array of image identifiers per HIT.

    Raises:
        ValueError: If more random images are requested than are available
            outside a group (raised by ``random.sample``).
    """
    extra_items = objects_per_hit_number - initial_items
    groups = np.array_split(list(images), min(objects_number, unique_hits))

    hits = []
    for group in groups:
        # random.sample requires a sequence (sets are rejected since
        # Python 3.11); sorting also makes the draw reproducible under a
        # fixed random seed.
        candidates = sorted(images - set(group))
        hits.append(np.append(group, sample(candidates, extra_items)))
    return hits

hits = hit_samples(images, unique_hits, objects_number, objects_per_hit_number, initial_items)

In [None]:
# Wrap every generated HIT into a task suite holding a single task;
# the whole list of images of the HIT is passed as the task's 'images' input.
task_suites = [
    toloka.task_suite.TaskSuite(
        pool_id=main_pool.id,
        tasks=[
            toloka.task.Task(input_values={'images': hit.tolist()})
        ]
    )
    for hit in hits
]

# Upload the task suites to the main pool and keep the API result.
task_suites = toloka_client.create_task_suites(task_suites, allow_defaults=True)

# Receiving responses

So, we have finished all preparations, now is the time to start labelling.

We are going to open all our pools: training, exam and main pool. All pools will stay open until the main pool is finished.

In [None]:
# Delay between two status checks of the main pool.
period: timedelta = timedelta(seconds=60)

# Open the training, the exam pool and the main pool.
training = toloka_client.open_training(training.id)
exam_pool = toloka_client.open_pool(exam_pool.id)
main_pool = toloka_client.open_pool(main_pool.id)

# Poll until the main pool is no longer open.
while main_pool.is_open():
    # presumably processes new exam submissions and updates the skill — see the exam_check module
    exam_check(toloka_client, exam_pool, skill)
    # Request the completion percentage of the main pool and wait for the analytics operation to finish.
    op = toloka_client.get_analytics([toloka.analytics_request.CompletionPercentagePoolAnalytics(subject_id=main_pool.id)])
    percentage = toloka_client.wait_operation(op).details['value'][0]['result']['value']
    logging.info(f'Pool {main_pool.id} - {percentage}%')

    sleep(period.total_seconds())
    # Refresh the pool object so that is_open() sees the current status.
    main_pool = toloka_client.get_pool(main_pool.id)

# One more exam_check call after the main pool has closed, then close the training and the exam pool.
exam_check(toloka_client, exam_pool, skill)
training = toloka_client.close_training(training.id)
exam_pool = toloka_client.close_pool(exam_pool.id)

## Aggregation

After getting our data labelled, we need to aggregate it using our implementation of the algorithm described in [Crowdclustering by Ryan Gomes et al.](http://vision.caltech.edu/~gomes/papers/crowd_clust_final.pdf) and [Incremental Learning of Nonparametric Bayesian Mixture Models by Ryan Gomes et al.](http://www.vision.caltech.edu/gomes/papers/gomes_cvpr_08.pdf). Our implementation is based on the [original Matlab implementation](http://www.vision.caltech.edu/gomes/software.html).

It will take quite a while for aggregation process to converge.

In [None]:
# Download the assignments of the main pool as a DataFrame and keep only the listed columns.
assignments_raw = toloka_client.get_assignments_df(pool_id=main_pool.id)[
        ['INPUT:images', 'OUTPUT:result', 'GOLDEN:result', 'ASSIGNMENT:link', 'ASSIGNMENT:assignment_id',
         'ASSIGNMENT:worker_id', 'ASSIGNMENT:status']]
# NOTE(review): the meaning of the four Prior arguments is defined in crowd_clustering_aggregation — confirm there.
prior = Prior(1, 5, 10, 1)
# Run the aggregation on the 'INPUT:images' column; cluster_dict is later iterated as {cluster id: images}.
cluster_dict, id_to_img, clustering_result = clustering_aggregation(assignments_raw, 'INPUT:images', prior)

Finally, let's output results of clustering by crowd:

In [None]:
def draw_cluster(images):
    """Download and display the images of one cluster, 30 images per figure.

    The images are shown in a grid with 5 columns. A new figure is created
    for every page of up to 30 images, and the grid of each figure is sized
    for the images on that page.

    Args:
        images: Sequence of image URLs; it is sliced, so it must support
            ``len`` and slicing.
    """
    step = 30      # images per figure
    columns = 5    # images per row

    for start in range(0, len(images), step):
        part = images[start:start + step]
        # Ceil-divide so the grid has exactly as many rows as this page needs.
        rows = -(-len(part) // columns)

        plt.figure(figsize=(100, 100))
        for position, image_name in enumerate(part, start=1):
            ax = plt.subplot(rows, columns, position)
            # Image.open is lazy: draw while the response is still open,
            # then let the context manager close the connection.
            with urlopen(image_name) as response:
                ax.imshow(Image.open(response))
            ax.axis('off')
        plt.show()

In [None]:
# Print a centred header with the cluster key, then draw the images of that cluster.
for i, cluster in cluster_dict.items():
    print(('-------------' + str(i) + '-------------').center(100))
    draw_cluster(cluster)

# Quality evaluation

After getting the results, we can evaluate the quality of the clusters. To do this we use an approach called <i>Intruders</i>, described in https://proceedings.neurips.cc/paper/2009/hash/f92586a25bb3145facd64ab20fd554ff-Abstract.html.

The idea is to make HITs in which one image comes from one cluster and all the others come from another cluster. We ask a worker to find this extra image; if it is found, then the objects from these two clusters are quite different from each other. The proportion of correct answers is the "quality" in this case.

In [None]:
def make_intruder_hits(cluster_dict, N_HITS=100, images_on_page=8) -> List[List[str]]:
    """Generate "find the intruder" HITs from a clustering.

    Every HIT contains ``images_on_page - 1`` images drawn without
    replacement from one cluster plus a single image (the intruder) drawn
    from a different cluster, in shuffled order. The main cluster is picked
    uniformly among clusters that have enough images; the intruder cluster
    is picked uniformly among all other clusters.

    Args:
        cluster_dict: Mapping from a cluster key to the collection of
            images in that cluster.
        N_HITS: Number of HITs to generate.
        images_on_page: Number of images in every HIT, intruder included.

    Returns:
        A list of ``N_HITS`` HITs, each a list of images.

    Raises:
        ValueError: If there are fewer than two clusters, if no cluster has
            at least ``images_on_page - 1`` images, or if the chosen
            intruder cluster is empty.
    """
    if N_HITS <= 0:
        return []

    representatives_needed = images_on_page - 1
    # random.sample requires a sequence; dict views are rejected since Python 3.11.
    cluster_ids = list(cluster_dict)
    if len(cluster_ids) < 2:
        raise ValueError('At least two clusters are required to build intruder HITs')

    # Pre-select clusters that are large enough instead of re-drawing in a
    # loop, which would never terminate when no such cluster exists.
    eligible_ids = [
        cluster_id for cluster_id in cluster_ids
        if len(cluster_dict[cluster_id]) >= representatives_needed
    ]
    if not eligible_ids:
        raise ValueError(
            f'No cluster has at least {representatives_needed} images to fill a HIT'
        )

    hits_list = []
    for _ in range(N_HITS):
        cluster = sample(eligible_ids, 1)[0]
        other_ids = [cluster_id for cluster_id in cluster_ids if cluster_id != cluster]
        intruder_cluster = sample(other_ids, 1)[0]

        representatives = sample(list(cluster_dict[cluster]), representatives_needed)
        intruder = sample(list(cluster_dict[intruder_cluster]), 1)

        hit = representatives + intruder
        shuffle(hit)
        hits_list.append(hit)
    return hits_list

In [None]:
# Number of intruder HITs to generate.
N_HITS = 100
# Images shown in one HIT, including the single intruder.
images_on_page = 8
intruder_hits = make_intruder_hits(cluster_dict, N_HITS, images_on_page)

## Create main project and training

In [None]:
intruders_project = create_project_from_file('configs/shoes/quality_evaluation/project/project.json')

In [None]:
intruders_training = create_training_from_file(
    'configs/shoes/quality_evaluation/project/training.json', intruders_project.id
)

In [None]:
tasks = create_tasks_from_directory('configs/shoes/quality_evaluation/training_tasks/', intruders_training.id)

In [None]:
intruders_pool = create_pool_from_file(
    'configs/shoes/quality_evaluation/project/pool.json', intruders_project.id, intruders_training.id
)

In [None]:
# Wrap every intruder HIT into a task suite holding a single task;
# the list of images of the HIT is passed as the task's 'images' input.
task_suites = [
    toloka.task_suite.TaskSuite(
        pool_id=intruders_pool.id,
        tasks=[
            toloka.task.Task(input_values={'images': hit})
        ]
    )
    for hit in intruder_hits
]

# Upload the task suites to the intruders pool and keep the API result.
task_suites = toloka_client.create_task_suites(task_suites, allow_defaults=True)

## Receiving responses

In [None]:
# Delay between two status checks of the intruders pool.
period: timedelta = timedelta(seconds=60)

# Open the training and the pool of the quality-evaluation project.
intruders_training = toloka_client.open_training(intruders_training.id)
intruders_pool = toloka_client.open_pool(intruders_pool.id)

# Poll until the intruders pool is no longer open.
while intruders_pool.is_open():
    # Request the completion percentage of the pool and wait for the analytics operation to finish.
    op = toloka_client.get_analytics(
        [toloka.analytics_request.CompletionPercentagePoolAnalytics(subject_id=intruders_pool.id)]
    )
    percentage = toloka_client.wait_operation(op).details['value'][0]['result']['value']
    logging.info(f'Pool {intruders_pool.id} - {percentage}%')

    sleep(period.total_seconds())
    # Refresh the pool object so that is_open() sees the current status.
    intruders_pool = toloka_client.get_pool(intruders_pool.id)

# Close the training; close_pool is also called on the pool although the loop only exits once it is not open.
intruders_training = toloka_client.close_training(intruders_training.id)
intruders_pool = toloka_client.close_pool(intruders_pool.id)

In [None]:
# Download the assignments of the intruders pool as a DataFrame and keep only the listed columns.
# Note: this rebinds assignments_raw, replacing the main-pool assignments loaded earlier.
assignments_raw = toloka_client.get_assignments_df(pool_id=intruders_pool.id)[
        ['INPUT:images', 'OUTPUT:answer', 'GOLDEN:answer', 'ASSIGNMENT:link', 'ASSIGNMENT:assignment_id',
         'ASSIGNMENT:worker_id', 'ASSIGNMENT:status']]

In [None]:
calculate_quality(assignments_raw, cluster_dict, intruder_hits)