# Imports

In [135]:
%reset -f

In [136]:
import shutil
import json
from pathlib import Path

from tqdm import tqdm
import pandas as pd

import requests
from bardapi import Bard, SESSION_HEADERS

In [137]:
PATH_PREFIX = '../'

PATHS = [
    PATH_PREFIX + 'multiling-multichoice-rc-master/dataset/Bulgarian',
    PATH_PREFIX + 'multiling-multichoice-rc-master/dataset/Croatian',
    PATH_PREFIX + 'multiling-multichoice-rc-master/dataset/Chinese',
    PATH_PREFIX + 'multiling-multichoice-rc-master/dataset/Hungarian',
    PATH_PREFIX + 'multiling-multichoice-rc-master/dataset/Polish',
    PATH_PREFIX + 'multiling-multichoice-rc-master/dataset/English',
]

QUESTIONS_COLS = ['id', 'answer_key', 'question_snapshot', 'question_number', 'grade', 'subject', 'language', 'date']

# Load data

In [138]:
def preprocess(filepath: str) -> str:
    if 'multimodal_dataset' in filepath:
        filepath = filepath[1:]
        filepath = filepath.replace('multimodal_dataset\\', 'dataset\\')
        filepath = filepath.replace('\\', '/')
    
    if filepath.startswith('data'):
        filepath = 'multiling-multichoice-rc-master/dataset/Chinese/' + filepath
    
    if 'Croatian/exams_to_annotate' in filepath:
        filepath = filepath.replace('exams_to_annotate', 'annotated')
    
    filepath = filepath.replace('JEE-Advanced/', '')

    filepath = PATH_PREFIX + filepath
    return filepath


paths = [Path(path) for path in PATHS]
assert all([path.exists() for path in paths]
            ), f'ERROR: Not all paths are present.'

annotations = []
questions = []

for path in paths:
    jsons = list(path.rglob('*.json'))
    # Because we use the annotation files
    # here there is an implicit filter
    # that removes the questions of type "instruction"
    exam_annotations = list(filter(lambda json_path: 'annotation' in json_path.stem, jsons))
    annotations.extend(exam_annotations)
    for exam in exam_annotations:
        questions.extend(json.loads(exam.read_bytes()))

for annotation_file in annotations:
    current_annotations = json.loads(annotation_file.read_bytes())
    for current_annotation in current_annotations:
        current_annotation['question']['question_snapshot'] = preprocess(current_annotation['question']['question_snapshot'])
        filepath = current_annotation['question']['question_snapshot']
        assert Path(filepath).exists(), f'ERROR: Path {filepath} does not exist!'

print('All images exist!')

df_questions = pd.json_normalize(questions)
df_questions.columns = QUESTIONS_COLS
df_questions['question_snapshot'] = df_questions['question_snapshot'].apply(preprocess)

assert df_questions['question_snapshot'].apply(lambda filepath: Path(filepath).exists()).all(), 'ERROR: Some image paths do not exist!'

df_questions

All images exist!


Unnamed: 0,id,answer_key,question_snapshot,question_number,grade,subject,language,date
0,6d2efd56-6bed-43ec-af58-653d0dac578d,А,../multiling-multichoice-rc-master/dataset/Bul...,1,12,Physics,Bulgarian,2019-05-23
1,da5dd867-163c-4400-a316-57e32ba072e0,Г,../multiling-multichoice-rc-master/dataset/Bul...,2,12,Physics,Bulgarian,2019-05-23
2,10d911fb-fca1-4a16-aad2-87254b13d52b,А,../multiling-multichoice-rc-master/dataset/Bul...,3,12,Physics,Bulgarian,2019-05-23
3,58495517-d5fa-4391-aefa-77405e9586d1,А,../multiling-multichoice-rc-master/dataset/Bul...,4,12,Physics,Bulgarian,2019-05-23
4,559bcfe6-e72a-4fb5-b0c9-5c07fd91a507,В,../multiling-multichoice-rc-master/dataset/Bul...,5,12,Physics,Bulgarian,2019-05-23
...,...,...,...,...,...,...,...,...
17415,e88b9858-72e6-4733-9fbe-84a30e79cf0d,B,../multiling-multichoice-rc-master/dataset/Eng...,26,12,Mathematics,English,2013
17416,a441aa2a-2cb8-4576-ab40-41ac48f33778,A,../multiling-multichoice-rc-master/dataset/Eng...,27,12,Mathematics,English,2013
17417,a4be2ba5-9d50-42a2-8943-47eebe3a7a7d,B,../multiling-multichoice-rc-master/dataset/Eng...,28,12,Mathematics,English,2013
17418,910bdfef-52e1-4f0e-bdf4-d6aa145974e3,D,../multiling-multichoice-rc-master/dataset/Eng...,29,12,Mathematics,English,2013


In [139]:
pd.concat([
    df_questions['language'].value_counts(dropna=False),
    df_questions['language'].value_counts(dropna=False, normalize=True).round(4) * 100,
], axis=1)

Unnamed: 0_level_0,count,proportion
language,Unnamed: 1_level_1,Unnamed: 2_level_1
Croatian,3973,22.81
Hungarian,3801,21.82
Chinese,2635,15.13
Polish,2511,14.41
Bulgarian,2132,12.24
German,819,4.7
English,480,2.76
French,439,2.52
Spanish,299,1.72
Serbian,227,1.3


In [140]:
# Use the path to each cropped image
# to form the path of the bounding box file.
# Because the filenames of the cropped images are sorted,
# the i-th row of the bounding box file corresponds to the i-th image.
# That should mean that if you zip the two, you'll get the type of the image.

def get_type(cropped_image_path: str) -> str:
    cropped_image_path = Path(cropped_image_path)
    parent = cropped_image_path.parent

    if 'cropped_images' not in parent.name:
        return parent.name
    
    bbox_path = parent.parent / 'bbox'
    if not bbox_path.exists():
        bbox_path = parent.parent / 'bboxes'
    assert bbox_path.exists(), f'ERROR: Path {bbox_path} does not exist!'

    parts = cropped_image_path.stem.split('_')
    assert len(parts) == 4, f'ERROR: Length of "parts" has to be 4, but is {len(parts)}!'

    _, bbox_num, _, orig_line_num = parts
    bbox_num, orig_line_num = int(bbox_num), int(orig_line_num)

    bbox_path_page = bbox_path / f'page_{bbox_num}.txt'
    if not bbox_path_page.exists():
        bbox_path_page = bbox_path / f'page_0{bbox_num}.txt'
    assert bbox_path_page.exists(), f'ERROR: Cannot find {bbox_path_page}!'

    with open(bbox_path_page) as f:
        lines = f.readlines()
    
    n_lines = len(lines)
    line_num = orig_line_num
    if n_lines <= orig_line_num - 2:
        line_num = orig_line_num // n_lines - 1
    elif n_lines <= orig_line_num:
        line_num = orig_line_num - 2
    assert n_lines > line_num, f'ERROR: Line numbers do not math {n_lines=} <= {line_num=} for {str(cropped_image_path)=}!'

    question_type_idx = lines[line_num][0]
    question_type = 'text' if question_type_idx == '0' else 'image_text'
    return question_type


df_questions['type'] = 'NA'
idxs = df_questions.query('language != "Chinese"').index
df_questions.loc[idxs, 'type'] = df_questions.loc[idxs, 'question_snapshot'].apply(get_type)
df_questions['type'].value_counts(dropna=False)

type
text          11928
image_text     2857
NA             2635
Name: count, dtype: int64

In [141]:
pd.crosstab(df_questions['language'], df_questions['type'], margins=True, dropna=False)

type,NA,image_text,text,All
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bulgarian,0,501,1631,2132
Chinese,2635,0,0,2635
Croatian,0,713,3260,3973
English,0,123,357,480
French,0,66,373,439
German,0,174,645,819
Hungarian,0,695,3106,3801
Italian,0,4,40,44
Polish,0,421,2090,2511
Romanian,0,0,5,5


In [142]:
df_questions.groupby(by=['language', 'grade', 'type']).agg(number_questions=('id', 'count'))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,number_questions
language,grade,type,Unnamed: 3_level_1
Bulgarian,4,image_text,41
Bulgarian,4,text,456
Bulgarian,12,image_text,460
Bulgarian,12,text,1175
Chinese,4,,2635
Croatian,12,image_text,713
Croatian,12,text,3260
English,12,image_text,123
English,12,text,357
French,12,image_text,66


In [143]:
df_questions.groupby(by=['language', 'subject']).agg(number_questions=('id', 'count'))

Unnamed: 0_level_0,Unnamed: 1_level_0,number_questions
language,subject,Unnamed: 2_level_1
Bulgarian,Chemistry,665
Bulgarian,Man and Nature,233
Bulgarian,Man and Society,264
Bulgarian,Physics,970
Chinese,Biology,281
...,...,...
Serbian,Geography,6
Serbian,Physics,185
Slovakian,Chemistry,46
Spanish,Geography,19


# Normalize the subjects

## Initial frequency analysis

In [144]:
df_questions['subject'].value_counts(dropna=False).sort_index()

subject
Accounting                                                           40
Administration                                                       37
Agriculture                                                         652
Assisting the dentist and keeping the office ready for work          80
Biology                                                             855
                                                                   ... 
Sociology                                                           295
Tourism                                                              76
Use of vehicles, machines, devices and tools used in agriculture     40
customer service in administration units                             40
organizing and conducting the motor vehicle service process          40
Name: count, Length: 81, dtype: int64

In [145]:
df_questions.groupby(by='language')['subject'].nunique()

language
Bulgarian     4
Chinese       6
Croatian     17
English       3
French        3
German        5
Hungarian     8
Italian       2
Polish       55
Romanian      1
Russian       1
Serbian       4
Slovakian     1
Spanish       2
Name: subject, dtype: int64

## Aggregate subjects

In [146]:
subjects_polish = df_questions.query('language == "Polish"')['subject'].unique().tolist()
subjects_polish[:5]

['Operation of machines and equipment for earthworks and roadworks',
 'Carrying out agricultural production',
 'Installation and operation of computer systems, peripheral devices and networks',
 'Providing help and organizing support for disabled people',
 'Installation, commissioning and maintenance of ICT devices and networks']

In [147]:
subjects_to_combine = {
    'Agriculture': {},
    'Biology': {'Man and Nature'},
    'Business & Economics': {'Administration', 'Economics'},
    'Chemistry': {},
    'Citizenship': {}, # Specific for Vietnamese
    'Fine Arts': {},
    'Forestry': {}, # studies the craft of managing, using, conserving, and repairing forests, woodlands, and associated resources around them such as water sources and soil.
    'Geography': {},
    'Geology': {}, # Economical Geology, Marine Geology, Geomorphology, and Geophysics.
    'History': {},
    'Informatics': {},
    'Islamic Studies': {}, # refers to the academic studies of Islam, Quran excerpts, and Muslim morality. This a subject studied in the Qatari educational system during both middle and high school.
    'Landscaping': {'Gardening'},
    'Philosophy': {},
    'Physics': {},
    'Politics': {'Politics', 'Politics and Economics'},
    'Professional': set(subjects_polish),
    'Psychology': {'Psychology', 'Psycology'},
    'Religion': {'Religion', 'Religious Education', 'Religious Studies'},
    'Science': {'Math', 'Mathematics'},
    'Social': {}, # subject, similarly to Science, combines questions from political, cultural, historical and geographical studies.
    'Sociology': {'Man and Society'},
    'Ethics': {},
    'Tourism': {},
}

df_questions['subject_aggregated'] = df_questions['subject'].copy()
for aggregation, subjects in subjects_to_combine.items():
    df_questions.loc[df_questions['subject'].isin(subjects_to_combine[aggregation]), 'subject_aggregated'] = aggregation

pd.concat([
    df_questions['subject_aggregated'].value_counts(dropna=False).sort_index(),
    df_questions['subject_aggregated'].value_counts(dropna=False, normalize=True).sort_index().round(4) * 100,
], axis=1)

Unnamed: 0_level_0,count,proportion
subject_aggregated,Unnamed: 1_level_1,Unnamed: 2_level_1
Agriculture,652,3.74
Biology,1088,6.25
Business & Economics,1149,6.6
Chemistry,2437,13.99
Ethics,180,1.03
Fine Arts,48,0.28
Geography,1259,7.23
History,709,4.07
Informatics,188,1.08
Landscaping,27,0.15


In [148]:
print(f'Aggregations for which there are no actual subjects: {set(subjects_to_combine.keys()) - set(df_questions["subject_aggregated"].unique())}')
print(f'Actual subjects for which there are no aggregations (should be empty): {set(df_questions["subject_aggregated"].unique()) - set(subjects_to_combine.keys())}')

Aggregations for which there are no actual subjects: {'Forestry', 'Islamic Studies', 'Geology', 'Social', 'Citizenship'}
Actual subjects for which there are no aggregations (should be empty): set()


In [149]:
pd.concat([
    df_questions['subject_aggregated'].value_counts(dropna=False).sort_index(),
    df_questions['subject_aggregated'].value_counts(dropna=False, normalize=True).sort_index().round(4) * 100,
], axis=1)

Unnamed: 0_level_0,count,proportion
subject_aggregated,Unnamed: 1_level_1,Unnamed: 2_level_1
Agriculture,652,3.74
Biology,1088,6.25
Business & Economics,1149,6.6
Chemistry,2437,13.99
Ethics,180,1.03
Fine Arts,48,0.28
Geography,1259,7.23
History,709,4.07
Informatics,188,1.08
Landscaping,27,0.15


In [150]:
# Number of subjects per language
df_questions.groupby(by=['language', 'grade'])['subject_aggregated'].nunique()

language   grade
Bulgarian  4         2
           12        2
Chinese    4         6
Croatian   12       13
English    12        3
French     12        3
German     12        5
Hungarian  12        7
Italian    12        2
Polish     12        1
Romanian   12        1
Russian    12        1
Serbian    12        4
Slovakian  12        1
Spanish    12        2
Name: subject_aggregated, dtype: int64

## Group aggregated subjects

In [151]:
aggregations_to_combine = {
    'Natural Science': {'Biology', 'Geology', 'Chemistry', 'Physics', 'Science'}, # the study of natural phenomena
    'Social Sciences': {'Geography', 'History', 'Psychology', 'Politics', 'Social', 'Citizenship', 'Philosophy', 'Business & Economics', 'Sociology', 'Ethics'}, # the study of human behavior and societies
    'Other': {'Agriculture', 'Fine Arts', 'Forestry', 'Informatics', 'Islamic Studies', 'Landscaping', 'Professional', 'Religion', 'Tourism'}, # Applied Studies, Arts, Religion, etc.
}

df_questions['subject_aggregated_grouped'] = df_questions['subject_aggregated'].copy()
for superset, subjects in aggregations_to_combine.items():
    df_questions.loc[df_questions['subject_aggregated'].isin(aggregations_to_combine[superset]), 'subject_aggregated_grouped'] = superset

pd.concat([
    df_questions['subject_aggregated_grouped'].value_counts(dropna=False).sort_index(),
    df_questions['subject_aggregated_grouped'].value_counts(dropna=False, normalize=True).sort_index().round(4) * 100,
], axis=1)

Unnamed: 0_level_0,count,proportion
subject_aggregated_grouped,Unnamed: 1_level_1,Unnamed: 2_level_1
Natural Science,9333,53.58
Other,3663,21.03
Social Sciences,4424,25.4


# Create `test` sample

In [152]:
df_questions

Unnamed: 0,id,answer_key,question_snapshot,question_number,grade,subject,language,date,type,subject_aggregated,subject_aggregated_grouped
0,6d2efd56-6bed-43ec-af58-653d0dac578d,А,../multiling-multichoice-rc-master/dataset/Bul...,1,12,Physics,Bulgarian,2019-05-23,text,Physics,Natural Science
1,da5dd867-163c-4400-a316-57e32ba072e0,Г,../multiling-multichoice-rc-master/dataset/Bul...,2,12,Physics,Bulgarian,2019-05-23,image_text,Physics,Natural Science
2,10d911fb-fca1-4a16-aad2-87254b13d52b,А,../multiling-multichoice-rc-master/dataset/Bul...,3,12,Physics,Bulgarian,2019-05-23,image_text,Physics,Natural Science
3,58495517-d5fa-4391-aefa-77405e9586d1,А,../multiling-multichoice-rc-master/dataset/Bul...,4,12,Physics,Bulgarian,2019-05-23,image_text,Physics,Natural Science
4,559bcfe6-e72a-4fb5-b0c9-5c07fd91a507,В,../multiling-multichoice-rc-master/dataset/Bul...,5,12,Physics,Bulgarian,2019-05-23,image_text,Physics,Natural Science
...,...,...,...,...,...,...,...,...,...,...,...
17415,e88b9858-72e6-4733-9fbe-84a30e79cf0d,B,../multiling-multichoice-rc-master/dataset/Eng...,26,12,Mathematics,English,2013,text,Science,Natural Science
17416,a441aa2a-2cb8-4576-ab40-41ac48f33778,A,../multiling-multichoice-rc-master/dataset/Eng...,27,12,Mathematics,English,2013,text,Science,Natural Science
17417,a4be2ba5-9d50-42a2-8943-47eebe3a7a7d,B,../multiling-multichoice-rc-master/dataset/Eng...,28,12,Mathematics,English,2013,text,Science,Natural Science
17418,910bdfef-52e1-4f0e-bdf4-d6aa145974e3,D,../multiling-multichoice-rc-master/dataset/Eng...,29,12,Mathematics,English,2013,text,Science,Natural Science


## Remove non-eligible triplets

In [153]:
num = 20
group = ['language', 'subject_aggregated', 'type']

df_num_per_group_check = df_questions.groupby(by=group).agg(number_questions=('id', 'count'))
df_num_per_group_check['has_at_least_num'] = df_num_per_group_check['number_questions'] >= num

df_num_per_group_check

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,number_questions,has_at_least_num
language,subject_aggregated,type,Unnamed: 3_level_1,Unnamed: 4_level_1
Bulgarian,Biology,image_text,25,True
Bulgarian,Biology,text,208,True
Bulgarian,Chemistry,image_text,120,True
Bulgarian,Chemistry,text,545,True
Bulgarian,Physics,image_text,340,True
...,...,...,...,...
Slovakian,Chemistry,image_text,4,False
Slovakian,Chemistry,text,42,True
Spanish,Geography,text,19,False
Spanish,Physics,image_text,89,True


In [154]:
df_num_per_group_check['has_at_least_num'].value_counts(dropna=False)

has_at_least_num
True     65
False    16
Name: count, dtype: int64

In [155]:
# Eligible triplets
df_num_per_group_check.query('has_at_least_num').drop('has_at_least_num', axis=1).sort_values(by='number_questions', ascending=False).reset_index()

Unnamed: 0,language,subject_aggregated,type,number_questions
0,Polish,Professional,text,2090
1,Hungarian,Physics,text,1010
2,Hungarian,Business & Economics,text,727
3,Chinese,Geography,,678
4,Chinese,Science,,678
...,...,...,...,...
60,Bulgarian,Biology,image_text,25
61,Italian,Geography,text,24
62,French,Geography,text,24
63,Serbian,Chemistry,text,23


In [156]:
# Noneligible triplets
df_num_per_group_check.query('~has_at_least_num').drop('has_at_least_num', axis=1).sort_values(by='number_questions', ascending=False).reset_index()

Unnamed: 0,language,subject_aggregated,type,number_questions
0,Spanish,Geography,text,19
1,Bulgarian,Sociology,image_text,16
2,Italian,Physics,text,16
3,German,Chemistry,text,11
4,Russian,Geography,text,9
5,Serbian,Chemistry,image_text,7
6,Serbian,Business & Economics,text,6
7,Serbian,Geography,text,6
8,French,Business & Economics,image_text,5
9,Romanian,Geography,text,5


In [157]:
df_eligible = df_num_per_group_check.query('has_at_least_num')
eligible = df_eligible.index.tolist()
print(f'{df_eligible.shape=}')
print(f'{eligible=}')

df_eligible.shape=(65, 2)
eligible=[('Bulgarian', 'Biology', 'image_text'), ('Bulgarian', 'Biology', 'text'), ('Bulgarian', 'Chemistry', 'image_text'), ('Bulgarian', 'Chemistry', 'text'), ('Bulgarian', 'Physics', 'image_text'), ('Bulgarian', 'Physics', 'text'), ('Bulgarian', 'Sociology', 'text'), ('Chinese', 'Biology', 'NA'), ('Chinese', 'Chemistry', 'NA'), ('Chinese', 'Geography', 'NA'), ('Chinese', 'History', 'NA'), ('Chinese', 'Physics', 'NA'), ('Chinese', 'Science', 'NA'), ('Croatian', 'Biology', 'image_text'), ('Croatian', 'Biology', 'text'), ('Croatian', 'Chemistry', 'image_text'), ('Croatian', 'Chemistry', 'text'), ('Croatian', 'Ethics', 'text'), ('Croatian', 'Fine Arts', 'image_text'), ('Croatian', 'Geography', 'image_text'), ('Croatian', 'Geography', 'text'), ('Croatian', 'History', 'image_text'), ('Croatian', 'History', 'text'), ('Croatian', 'Informatics', 'image_text'), ('Croatian', 'Informatics', 'text'), ('Croatian', 'Philosophy', 'text'), ('Croatian', 'Physics', 'image_te

## Full test sample

In [158]:
df_questions_test_sample_full = df_questions.set_index(group)
df_questions_test_sample_full = df_questions_test_sample_full.iloc[df_questions_test_sample_full.index.isin(eligible)]
df_questions_test_sample_full = df_questions_test_sample_full.reset_index(drop=False)
df_questions_test_sample_full

Unnamed: 0,language,subject_aggregated,type,id,answer_key,question_snapshot,question_number,grade,subject,date,subject_aggregated_grouped
0,Bulgarian,Physics,text,6d2efd56-6bed-43ec-af58-653d0dac578d,А,../multiling-multichoice-rc-master/dataset/Bul...,1,12,Physics,2019-05-23,Natural Science
1,Bulgarian,Physics,image_text,da5dd867-163c-4400-a316-57e32ba072e0,Г,../multiling-multichoice-rc-master/dataset/Bul...,2,12,Physics,2019-05-23,Natural Science
2,Bulgarian,Physics,image_text,10d911fb-fca1-4a16-aad2-87254b13d52b,А,../multiling-multichoice-rc-master/dataset/Bul...,3,12,Physics,2019-05-23,Natural Science
3,Bulgarian,Physics,image_text,58495517-d5fa-4391-aefa-77405e9586d1,А,../multiling-multichoice-rc-master/dataset/Bul...,4,12,Physics,2019-05-23,Natural Science
4,Bulgarian,Physics,image_text,559bcfe6-e72a-4fb5-b0c9-5c07fd91a507,В,../multiling-multichoice-rc-master/dataset/Bul...,5,12,Physics,2019-05-23,Natural Science
...,...,...,...,...,...,...,...,...,...,...,...
17298,English,Science,text,e88b9858-72e6-4733-9fbe-84a30e79cf0d,B,../multiling-multichoice-rc-master/dataset/Eng...,26,12,Mathematics,2013,Natural Science
17299,English,Science,text,a441aa2a-2cb8-4576-ab40-41ac48f33778,A,../multiling-multichoice-rc-master/dataset/Eng...,27,12,Mathematics,2013,Natural Science
17300,English,Science,text,a4be2ba5-9d50-42a2-8943-47eebe3a7a7d,B,../multiling-multichoice-rc-master/dataset/Eng...,28,12,Mathematics,2013,Natural Science
17301,English,Science,text,910bdfef-52e1-4f0e-bdf4-d6aa145974e3,D,../multiling-multichoice-rc-master/dataset/Eng...,29,12,Mathematics,2013,Natural Science


## Split for sampling (remove Chinese)

In [159]:
df_to_sample = df_eligible.query('number_questions > 50')
triplets_sample = df_to_sample.index.tolist()

df_ready = df_eligible.query('number_questions <= 50')
triplets_ready = df_ready.index.tolist()

## Remove Chinese
# triplets_sample = [el for el in triplets_sample if el[0] != 'Chinese']
# triplets_ready = [el for el in triplets_ready if el[0] != 'Chinese']

print('Triplets that are ready:')
print(f'{df_ready.shape=}')
print(f'{triplets_ready=}')

print('\nTriplets that need sampling:')
print(f'{df_to_sample.shape=}')
print(f'{triplets_sample=}')

Triplets that are ready:
df_ready.shape=(12, 2)
triplets_ready=[('Bulgarian', 'Biology', 'image_text'), ('Croatian', 'Fine Arts', 'image_text'), ('Croatian', 'Informatics', 'image_text'), ('French', 'Geography', 'text'), ('German', 'Geography', 'text'), ('German', 'Tourism', 'text'), ('Hungarian', 'Business & Economics', 'image_text'), ('Hungarian', 'Landscaping', 'text'), ('Hungarian', 'Tourism', 'text'), ('Italian', 'Geography', 'text'), ('Serbian', 'Chemistry', 'text'), ('Slovakian', 'Chemistry', 'text')]

Triplets that need sampling:
df_to_sample.shape=(53, 2)
triplets_sample=[('Bulgarian', 'Biology', 'text'), ('Bulgarian', 'Chemistry', 'image_text'), ('Bulgarian', 'Chemistry', 'text'), ('Bulgarian', 'Physics', 'image_text'), ('Bulgarian', 'Physics', 'text'), ('Bulgarian', 'Sociology', 'text'), ('Chinese', 'Biology', 'NA'), ('Chinese', 'Chemistry', 'NA'), ('Chinese', 'Geography', 'NA'), ('Chinese', 'History', 'NA'), ('Chinese', 'Physics', 'NA'), ('Chinese', 'Science', 'NA'), ('Croa

## Sample

In [160]:
df_questions_test_sample_more_than_50 = df_questions.set_index(group)
df_questions_test_sample_more_than_50 = df_questions_test_sample_more_than_50.iloc[df_questions_test_sample_more_than_50.index.isin(triplets_sample)]
df_questions_test_sample_more_than_50 = df_questions_test_sample_more_than_50.reset_index(drop=False)
df_questions_test_sample_more_than_50

Unnamed: 0,language,subject_aggregated,type,id,answer_key,question_snapshot,question_number,grade,subject,date,subject_aggregated_grouped
0,Bulgarian,Physics,text,6d2efd56-6bed-43ec-af58-653d0dac578d,А,../multiling-multichoice-rc-master/dataset/Bul...,1,12,Physics,2019-05-23,Natural Science
1,Bulgarian,Physics,image_text,da5dd867-163c-4400-a316-57e32ba072e0,Г,../multiling-multichoice-rc-master/dataset/Bul...,2,12,Physics,2019-05-23,Natural Science
2,Bulgarian,Physics,image_text,10d911fb-fca1-4a16-aad2-87254b13d52b,А,../multiling-multichoice-rc-master/dataset/Bul...,3,12,Physics,2019-05-23,Natural Science
3,Bulgarian,Physics,image_text,58495517-d5fa-4391-aefa-77405e9586d1,А,../multiling-multichoice-rc-master/dataset/Bul...,4,12,Physics,2019-05-23,Natural Science
4,Bulgarian,Physics,image_text,559bcfe6-e72a-4fb5-b0c9-5c07fd91a507,В,../multiling-multichoice-rc-master/dataset/Bul...,5,12,Physics,2019-05-23,Natural Science
...,...,...,...,...,...,...,...,...,...,...,...
16912,English,Science,text,e88b9858-72e6-4733-9fbe-84a30e79cf0d,B,../multiling-multichoice-rc-master/dataset/Eng...,26,12,Mathematics,2013,Natural Science
16913,English,Science,text,a441aa2a-2cb8-4576-ab40-41ac48f33778,A,../multiling-multichoice-rc-master/dataset/Eng...,27,12,Mathematics,2013,Natural Science
16914,English,Science,text,a4be2ba5-9d50-42a2-8943-47eebe3a7a7d,B,../multiling-multichoice-rc-master/dataset/Eng...,28,12,Mathematics,2013,Natural Science
16915,English,Science,text,910bdfef-52e1-4f0e-bdf4-d6aa145974e3,D,../multiling-multichoice-rc-master/dataset/Eng...,29,12,Mathematics,2013,Natural Science


In [161]:
df_questions_test_sample_part2 = df_questions_test_sample_more_than_50.groupby(by=group).sample(n=50, random_state=42).reset_index(drop=True)
df_questions_test_sample_part2

Unnamed: 0,language,subject_aggregated,type,id,answer_key,question_snapshot,question_number,grade,subject,date,subject_aggregated_grouped
0,Bulgarian,Biology,text,fa8b2d6f-5ffe-4fcd-8e6e-3c6f10e16246,б,../multiling-multichoice-rc-master/dataset/Bul...,4,4,Man and Nature,2011-05-10,Natural Science
1,Bulgarian,Biology,text,20c70435-a53e-4aeb-b997-d508a7b15d8f,В,../multiling-multichoice-rc-master/dataset/Bul...,4,4,Man and Nature,2012-10-03,Natural Science
2,Bulgarian,Biology,text,378b4e57-5e9d-49bd-9c17-4bc07dc5be70,в,../multiling-multichoice-rc-master/dataset/Bul...,6,4,Man and Nature,2010-05-12,Natural Science
3,Bulgarian,Biology,text,e40a7130-8a93-4f7f-92e4-1bca8c180f71,В,../multiling-multichoice-rc-master/dataset/Bul...,11,4,Man and Nature,2012-05-14,Natural Science
4,Bulgarian,Biology,text,54b8b2e4-5918-4375-b17b-5fe29b3d01b3,а,../multiling-multichoice-rc-master/dataset/Bul...,12,4,Man and Nature,2011-05-10,Natural Science
...,...,...,...,...,...,...,...,...,...,...,...
2645,Spanish,Physics,text,7e4ff589-7699-4d51-b93b-57c3624eb922,B,../multiling-multichoice-rc-master/dataset/Hun...,13,12,Physics,2008-11-03,Natural Science
2646,Spanish,Physics,text,29dc1421-7acf-4751-bb4d-088e1908f92f,C,../multiling-multichoice-rc-master/dataset/Hun...,11,12,Physics,2012-05-17,Natural Science
2647,Spanish,Physics,text,2624af36-12e1-4b1b-bf23-8442f91bdd64,C,../multiling-multichoice-rc-master/dataset/Hun...,13,12,Physics,2017-05-22,Natural Science
2648,Spanish,Physics,text,f9203c52-3f86-4027-914d-1cf383505c3f,A,../multiling-multichoice-rc-master/dataset/Hun...,20,12,Physics,2006-05-15,Natural Science


In [162]:
df_questions_test_sample_ready = df_questions.set_index(group)
df_questions_test_sample_ready = df_questions_test_sample_ready.iloc[df_questions_test_sample_ready.index.isin(triplets_ready)]
df_questions_test_sample_ready = df_questions_test_sample_ready.reset_index(drop=False)
df_questions_test_sample_ready

Unnamed: 0,language,subject_aggregated,type,id,answer_key,question_snapshot,question_number,grade,subject,date,subject_aggregated_grouped
0,Bulgarian,Biology,image_text,7028d47d-c221-4021-b3d2-969eb4eea85b,В,../multiling-multichoice-rc-master/dataset/Bul...,9,4,Man and Nature,2014-05-12,Natural Science
1,Bulgarian,Biology,image_text,138519c5-1952-4286-81ba-329e5dfb8cea,Б,../multiling-multichoice-rc-master/dataset/Bul...,10,4,Man and Nature,2014-05-12,Natural Science
2,Bulgarian,Biology,image_text,f0d208e5-9a6d-4b26-9bdc-29f309040fed,А,../multiling-multichoice-rc-master/dataset/Bul...,15,4,Man and Nature,2014-05-12,Natural Science
3,Bulgarian,Biology,image_text,46ec0375-f51e-4d8d-bfc2-8255c85f4a4e,А,../multiling-multichoice-rc-master/dataset/Bul...,16,4,Man and Nature,2014-05-12,Natural Science
4,Bulgarian,Biology,image_text,44cfa1bc-2536-4def-823a-dff1682118a0,В,../multiling-multichoice-rc-master/dataset/Bul...,6,4,Man and Nature,2012-10-03,Natural Science
...,...,...,...,...,...,...,...,...,...,...,...
381,Hungarian,Tourism,text,0dd9a8ed-94f8-49ac-9ccb-9edc65ced1b3,D,../multiling-multichoice-rc-master/dataset/Hun...,4,12,Tourism,2018-10-19,Other
382,Hungarian,Tourism,text,9bf58153-883c-4130-a439-15b8fe994941,C,../multiling-multichoice-rc-master/dataset/Hun...,5,12,Tourism,2018-10-19,Other
383,Hungarian,Tourism,text,680dd8f2-c67b-45b4-b22c-58b9463c9d56,B,../multiling-multichoice-rc-master/dataset/Hun...,6,12,Tourism,2018-10-19,Other
384,Hungarian,Tourism,text,06ab008d-2f33-4ba7-b857-572cdaf38771,C,../multiling-multichoice-rc-master/dataset/Hun...,7,12,Tourism,2018-10-19,Other


## Merge to obtain final test sample

In [163]:
df_test = pd.concat([
    df_questions_test_sample_ready,
    df_questions_test_sample_part2
], ignore_index=True)
df_test

Unnamed: 0,language,subject_aggregated,type,id,answer_key,question_snapshot,question_number,grade,subject,date,subject_aggregated_grouped
0,Bulgarian,Biology,image_text,7028d47d-c221-4021-b3d2-969eb4eea85b,В,../multiling-multichoice-rc-master/dataset/Bul...,9,4,Man and Nature,2014-05-12,Natural Science
1,Bulgarian,Biology,image_text,138519c5-1952-4286-81ba-329e5dfb8cea,Б,../multiling-multichoice-rc-master/dataset/Bul...,10,4,Man and Nature,2014-05-12,Natural Science
2,Bulgarian,Biology,image_text,f0d208e5-9a6d-4b26-9bdc-29f309040fed,А,../multiling-multichoice-rc-master/dataset/Bul...,15,4,Man and Nature,2014-05-12,Natural Science
3,Bulgarian,Biology,image_text,46ec0375-f51e-4d8d-bfc2-8255c85f4a4e,А,../multiling-multichoice-rc-master/dataset/Bul...,16,4,Man and Nature,2014-05-12,Natural Science
4,Bulgarian,Biology,image_text,44cfa1bc-2536-4def-823a-dff1682118a0,В,../multiling-multichoice-rc-master/dataset/Bul...,6,4,Man and Nature,2012-10-03,Natural Science
...,...,...,...,...,...,...,...,...,...,...,...
3031,Spanish,Physics,text,7e4ff589-7699-4d51-b93b-57c3624eb922,B,../multiling-multichoice-rc-master/dataset/Hun...,13,12,Physics,2008-11-03,Natural Science
3032,Spanish,Physics,text,29dc1421-7acf-4751-bb4d-088e1908f92f,C,../multiling-multichoice-rc-master/dataset/Hun...,11,12,Physics,2012-05-17,Natural Science
3033,Spanish,Physics,text,2624af36-12e1-4b1b-bf23-8442f91bdd64,C,../multiling-multichoice-rc-master/dataset/Hun...,13,12,Physics,2017-05-22,Natural Science
3034,Spanish,Physics,text,f9203c52-3f86-4027-914d-1cf383505c3f,A,../multiling-multichoice-rc-master/dataset/Hun...,20,12,Physics,2006-05-15,Natural Science


## Subsample 10 from group

In [164]:
df_test_mini = df_test.groupby(by=group).sample(n=10, random_state=42).reset_index(drop=True)
df_test_mini

Unnamed: 0,language,subject_aggregated,type,id,answer_key,question_snapshot,question_number,grade,subject,date,subject_aggregated_grouped
0,Bulgarian,Biology,image_text,223f1e74-14b0-4d8a-bc0b-f2240aa1c027,Б,../multiling-multichoice-rc-master/dataset/Bul...,1,4,Man and Nature,2013-05-15,Natural Science
1,Bulgarian,Biology,image_text,558bcd64-8770-47d8-9bb0-ecbd084ba07c,В,../multiling-multichoice-rc-master/dataset/Bul...,20,4,Man and Nature,2012-05-14,Natural Science
2,Bulgarian,Biology,image_text,7028d47d-c221-4021-b3d2-969eb4eea85b,В,../multiling-multichoice-rc-master/dataset/Bul...,9,4,Man and Nature,2014-05-12,Natural Science
3,Bulgarian,Biology,image_text,da86ffde-ecb3-4f61-a02c-d1155023ce32,Б,../multiling-multichoice-rc-master/dataset/Bul...,8,4,Man and Nature,2017-05-15,Natural Science
4,Bulgarian,Biology,image_text,a4e787d6-2a25-4c1b-aa12-adc704c683f8,А,../multiling-multichoice-rc-master/dataset/Bul...,12,4,Man and Nature,2018-05-17,Natural Science
...,...,...,...,...,...,...,...,...,...,...,...
645,Spanish,Physics,text,27f90aa7-297a-4763-b4e7-5eccbd4f2448,A,../multiling-multichoice-rc-master/dataset/Hun...,5,12,Physics,2018-05-22,Natural Science
646,Spanish,Physics,text,733583aa-74de-451a-84fc-443a2144b9ac,B,../multiling-multichoice-rc-master/dataset/Hun...,3,12,Physics,2006-05-15,Natural Science
647,Spanish,Physics,text,d41cd2d5-5767-4d99-a905-758b3033342d,C,../multiling-multichoice-rc-master/dataset/Hun...,9,12,Physics,2013-05-16,Natural Science
648,Spanish,Physics,text,6591ed40-2b27-4c4d-b7bc-faed1ce34a43,A,../multiling-multichoice-rc-master/dataset/Hun...,9,12,Physics,2007-05-14,Natural Science


## Checks

In [165]:
df_test.groupby(by=group).agg(number_questions=('id', 'count'))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,number_questions
language,subject_aggregated,type,Unnamed: 3_level_1
Bulgarian,Biology,image_text,25
Bulgarian,Biology,text,50
Bulgarian,Chemistry,image_text,50
Bulgarian,Chemistry,text,50
Bulgarian,Physics,image_text,50
...,...,...,...
Serbian,Physics,image_text,50
Serbian,Physics,text,50
Slovakian,Chemistry,text,42
Spanish,Physics,image_text,50


In [None]:
df_test_mini.groupby(by=group).agg(number_questions=('id', 'count'))

# Create folder structure

## Test sample

In [None]:
path_test_dataset = Path('./test_dataset')
path_test_dataset

In [None]:
if path_test_dataset.exists():
    shutil.rmtree(path_test_dataset)
path_test_dataset.mkdir()


def get_annotation_file(cropped_image: Path, language: str, subject: str) -> dict | None:
    for annotation_file in annotations:
        current_annotations = json.loads(annotation_file.read_bytes())
        for idx, current_annotation in enumerate(current_annotations):
            if current_annotation['info']['language'] == language \
                and current_annotation['info']['subject'] == subject:
                if str(cropped_image) == preprocess(current_annotation['question']['question_snapshot']):
                    return current_annotations[idx]
    return None


types = ['image_text', 'text']
languages = df_test['language'].unique()

for language in tqdm(languages):
    path_lang_folder = path_test_dataset / language
    path_lang_folder.mkdir()
    
    subjects_agg = df_test.query(f'language == "{language}"')['subject_aggregated'].unique()

    for subject_agg in subjects_agg:
        path_subj_folder = path_lang_folder / subject_agg
        path_subj_folder.mkdir()

        for q_type in types:
            annotations_q_type = []
            path_type_folder = path_subj_folder / q_type
            path_type_folder_ims = path_subj_folder / q_type / 'images'
            path_type_folder_ims_annotations = path_subj_folder / q_type / 'annotations.json'
            path_type_folder.mkdir()
            path_type_folder_ims.mkdir()

            query = f'language == "{language}" and subject_aggregated == "{subject_agg}" and type == "{q_type}"'
            df_subset = df_test.query(query)

            for idx, row in df_subset.iterrows():
                src = Path(row['question_snapshot'])
                subject = row['subject']
                dst = path_type_folder_ims / f'{idx:02}_{src.name}'

                assert src.exists(), f'ERROR: File {src} not found!'

                if '541_page_02_cropped_05' in str(dst):
                    print(src)

                shutil.copyfile(src, dst)

                annotation = get_annotation_file(src, language, subject)
                assert annotation is not None, f'ERROR: No annotation file for {src}!'
                annotations_q_type.append(annotation)

            with open(path_type_folder_ims_annotations, 'w') as f:
                json.dump(annotations_q_type, f, ensure_ascii=False)

## Test sample mini

In [None]:
path_test_dataset_mini = Path('./test_dataset_mini')
path_test_dataset_mini

In [None]:
if path_test_dataset_mini.exists():
    shutil.rmtree(path_test_dataset_mini)
path_test_dataset_mini.mkdir()


def get_annotation_file(cropped_image: Path, language: str, subject: str) -> dict | None:
    for annotation_file in annotations:
        current_annotations = json.loads(annotation_file.read_bytes())
        for idx, current_annotation in enumerate(current_annotations):
            if current_annotation['info']['language'] == language \
                and current_annotation['info']['subject'] == subject:
                if str(cropped_image) == preprocess(current_annotation['question']['question_snapshot']):
                    return current_annotations[idx]
    return None


types = ['image_text', 'text']
languages = df_test_mini['language'].unique()

for language in tqdm(languages):
    path_lang_folder = path_test_dataset_mini / language
    path_lang_folder.mkdir()
    
    subjects_agg = df_test_mini.query(f'language == "{language}"')['subject_aggregated'].unique()

    for subject_agg in subjects_agg:
        path_subj_folder = path_lang_folder / subject_agg
        path_subj_folder.mkdir()

        for q_type in types:
            annotations_q_type = []
            path_type_folder = path_subj_folder / q_type
            path_type_folder_ims = path_subj_folder / q_type / 'images'
            path_type_folder_ims_annotations = path_subj_folder / q_type / 'annotations.json'
            path_type_folder.mkdir()
            path_type_folder_ims.mkdir()

            query = f'language == "{language}" and subject_aggregated == "{subject_agg}" and type == "{q_type}"'
            df_subset = df_test_mini.query(query)

            for idx, row in df_subset.iterrows():
                src = Path(row['question_snapshot'])
                subject = row['subject']
                dst = path_type_folder_ims / f'{idx:02}_{src.name}'

                assert src.exists(), f'ERROR: File {src} not found!'

                shutil.copyfile(src, dst)

                annotation = get_annotation_file(src, language, subject)
                assert annotation is not None, f'ERROR: No annotation file for {src}!'
                annotations_q_type.append(annotation)

            with open(path_type_folder_ims_annotations, 'w') as f:
                json.dump(annotations_q_type, f, ensure_ascii=False)

# Send to BARD

In [167]:
session = requests.Session()

# __Secure-1PSID token
token1 = "dQhzAC701XnEp4srLnf3Pk0EHjl8tq-Og4JKQ14EvYSZ0RomEP-Ggsq4F1IH8goNIhPqBw."

# __Secure-1PSIDCC token
token2 = "ABTWhQHXt63d4S9fxRW9IxNP3seeLhhVjiLxILYATtkbO8j_htbF_5Vp6uq7gRFkppbUwziAgrja"

# __Secure-1PSIDTS token
token3 = "sidts-CjEBPVxjSmQx_3nNH7vfRtjJzWlIq0NOMzGlzv801vGdL52Evqrzj0IV_3EZ59-r-56YEAA"

session.cookies.set("__Secure-1PSID", token1)
session.cookies.set("__Secure-1PSIDCC", token2)
session.cookies.set("__Secure-1PSIDTS", token3)

session.headers = SESSION_HEADERS

bard = Bard(token=token1, session=session, timeout=300)
# result = bard.get_answer("How is the weather today in Seoul?")
# print(result) 

START_INDEX = 1803
NUM_TO_PASS_TO_BARD = 89
PROMPT = "The image has an multiple choice question with 3 to 5 choices. Provide the correct answer precisely in JSON format as follows: {'answer': 'xxx'}. Replace 'xxx' with the appropriate letter: 'A' if the first choice is correct, 'B' if the second choice is correct, 'C' if the third choice is correct, 'D' if the fourth choice is correct, or 'E' if the fifth choice is correct."

df_bard = df_test.iloc[range(START_INDEX, START_INDEX + NUM_TO_PASS_TO_BARD)]
df_bard['content'] = 'NA'

bard_answers = []
for idx, row in tqdm(df_bard.iterrows()):
    with open(row['question_snapshot'], 'rb') as image:
        bard_answer = bard.ask_about_image(PROMPT, image.read())
        # print(bard_answer['content'][:10])
        bard_answers.append(bard_answer)
        df_bard.at[idx, 'content'] = bard_answer['content']
df_bard

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bard['content'] = 'NA'
89it [17:11, 11.60s/it]


Unnamed: 0,language,subject_aggregated,type,id,answer_key,question_snapshot,question_number,grade,subject,date,subject_aggregated_grouped,content
1714,Croatian,Psychology,text,26dbf0c3-fb58-4fda-aff6-8c9afde2dfb1,C,../multiling-multichoice-rc-master/dataset/Cro...,5,12,Psychology,2014-06-10,Social Sciences,The correct answer to the multiple choice ques...
1715,Croatian,Psychology,text,dec74646-4d00-4bb7-ad6f-28115b26f011,B,../multiling-multichoice-rc-master/dataset/Cro...,3,12,Psychology,2019-06,Social Sciences,{'answer': 'B'}
1716,Croatian,Psychology,text,ce63a9c1-e210-47f4-b887-98c8719f695c,C,../multiling-multichoice-rc-master/dataset/Cro...,18,12,Psychology,2017-06,Social Sciences,"The correct answer is A, visoka toplina i viso..."
1717,Croatian,Psychology,text,e8f8e669-2d31-4b12-b9d8-3fe99335d26a,A,../multiling-multichoice-rc-master/dataset/Cro...,10,12,Psychology,2016-04-06,Social Sciences,{'answer': 'A'}
1718,Croatian,Psychology,text,5b0c4ec7-5c4b-4b9c-ad34-91fa67807e66,C,../multiling-multichoice-rc-master/dataset/Cro...,13,12,Psychology,2017-06,Social Sciences,{'answer': 'D'}
...,...,...,...,...,...,...,...,...,...,...,...,...
1798,Croatian,Sociology,text,70fe836f-2dba-467f-bebc-1dcbcf6c3b60,D,../multiling-multichoice-rc-master/dataset/Cro...,23,12,Sociology,2015-07-01,Social Sciences,The correct answer is **A**.\n\n**JSON format:...
1799,Croatian,Sociology,text,a4da5c61-c462-4065-ace7-670e49cc3e27,B,../multiling-multichoice-rc-master/dataset/Cro...,20,12,Sociology,2016-06-30,Social Sciences,The correct answer to the multiple choice ques...
1800,Croatian,Sociology,text,d0a44e6e-ce7a-4afd-9926-e44d23e52397,C,../multiling-multichoice-rc-master/dataset/Cro...,27,12,Sociology,2015-07-01,Social Sciences,The correct answer to the multiple choice ques...
1801,Croatian,Sociology,text,b75447be-6c92-4d46-b87c-e67ad80ee93c,D,../multiling-multichoice-rc-master/dataset/Cro...,10,12,Sociology,2018-09,Social Sciences,The answer to the multiple choice question in ...


In [168]:
df_bard.to_csv('BARD_035_gemini.csv', index=False)