In [None]:
import json
from itertools import islice

def read_partial_json(file_path, num_lines=10):
    with open(file_path, 'r') as f:
        # Read the first `num_lines` lines to inspect the structure
        partial_content = "".join(list(islice(f, num_lines)))
    
    # Try to load it as JSON to see if it's valid and inspect it
    try:
        parsed_json = json.loads(partial_content)
        print(json.dumps(parsed_json, indent=4))
    except json.JSONDecodeError:
        print("The selected lines are not a complete or valid JSON snippet.")
        print(partial_content)

# Use the function with the path to your JSON file
read_partial_json('/home/emilie/projects/val.json', num_lines=20)


In [None]:
{
    "info": {
        "year": 2021,
        "verion": 1,
        "description": "iNaturalist Species Classification Dataset Testing Split.",
        "contributor": "Grant Van Horn and the Visipedia Team.",
        "url": "https://github.com/visipedia/inat_comp",
        "date_created": "2021-03-01 12:34:38"
    },
    "images": [
        {
            "id": 2686843,
            "width": 284,
            "height": 222,
            "file_name": "val/03938_Animalia_Chordata_Aves_Passeriformes_Meliphagidae_Ptilotula_penicillata/df8edd4c-fbb4-4886-8600-a429e5efac23.jpg",
            "license": 2,
            "rights_holder": "megatherium",
            "date": "2007-10-31 00:00:00+00:00",
            "latitude": -21.93073,
            "longitude": 114.12239,
            "location_uncertainty": null
        },
        {
            "id": 2686844,
            "width": 500,
            "height": 375,
            "file_name": "val/03583_Animalia_Chordata_Aves_Cuculiformes_Cuculidae_Coccyzus_erythropthalmus/fc35080c-5ace-4485-a21f-b1447f27efc7.jpg",
            "license": 1,
            "rights_holder": "rpayne",
            "date": "2011-07-15 00:00:00+00:00",
            "latitude": 44.02901,
            "longitude": -73.17711,
            "location_uncertainty": null
        }],
    "categories": [
        {
            "id": 0,
            "name": "Lumbricus terrestris",
            "common_name": "Common Earthworm",
            "supercategory": "Animalia",
            "kingdom": "Animalia",
            "phylum": "Annelida",
            "class": "Clitellata",
            "order": "Haplotaxida",
            "family": "Lumbricidae",
            "genus": "Lumbricus",
            "specific_epithet": "terrestris",
            "image_dir_name": "00000_Animalia_Annelida_Clitellata_Haplotaxida_Lumbricidae_Lumbricus_terrestris"
        },
        {
            "id": 1,
            "name": "Sabella spallanzanii",
            "common_name": "Mediterranean Fanworm",
            "supercategory": "Animalia",
            "kingdom": "Animalia",
            "phylum": "Annelida",
            "class": "Polychaeta",
            "order": "Sabellida",
            "family": "Sabellidae",
            "genus": "Sabella",
            "specific_epithet": "spallanzanii",
            "image_dir_name": "00001_Animalia_Annelida_Polychaeta_Sabellida_Sabellidae_Sabella_spallanzanii"
        }
    ],
    "annotations": [
        {
            "id": 2686843,
            "image_id": 2686843,
            "category_id": 3938
        },
        {
            "id": 2686844,
            "image_id": 2686844,
            "category_id": 3583
        },
        {
            "id": 2686845,
            "image_id": 2686845,
            "category_id": 5585
        },
        {
            "id": 2686846,
            "image_id": 2686846,
            "category_id": 4487
        }]
}


In [3]:
import pandas as pd
import json

# Step 1: Load the JSON data
with open('/home/emilie/projects/train.json', 'r') as file:
    data = json.load(file)

# Step 2: Extract images, categories, and annotations sections
images = data['images']
categories = data['categories']
annotations = data['annotations']

# Step 3: Convert each section into a pandas DataFrame
df_images = pd.DataFrame(images)
df_categories = pd.DataFrame(categories)
df_annotations = pd.DataFrame(annotations)


In [4]:
df_images.head()

Unnamed: 0,id,width,height,file_name,license,rights_holder,date,latitude,longitude,location_uncertainty
0,0,500,500,train/02912_Animalia_Chordata_Actinopterygii_S...,0,Ken-ichi Ueda,2010-07-14 20:19:00+00:00,43.83486,-71.22231,77.0
1,1,500,333,train/04831_Animalia_Chordata_Mammalia_Rodenti...,0,Michelle S. Koo,2010-07-06 22:17:00+00:00,38.86995,-120.19326,
2,2,500,375,train/05015_Animalia_Chordata_Reptilia_Squamat...,1,105615097470186309865,2009-05-04 00:00:00+00:00,35.14218,-116.10415,
3,3,500,375,train/05163_Animalia_Chordata_Reptilia_Testudi...,1,biosam,2009-05-04 00:00:00+00:00,35.09829,-116.02979,28734.0
4,4,500,375,train/04983_Animalia_Chordata_Reptilia_Squamat...,1,biosam,2009-05-05 00:00:00+00:00,35.01099,-115.47336,


In [5]:
df_images['file_name'][0]

'train/02912_Animalia_Chordata_Actinopterygii_Siluriformes_Ictaluridae_Ameiurus_nebulosus/d615f184-8af4-4c60-b9f8-3081c1607644.jpg'

In [6]:
df_categories.head()

Unnamed: 0,id,name,common_name,supercategory,kingdom,phylum,class,order,family,genus,specific_epithet,image_dir_name
0,0,Lumbricus terrestris,Common Earthworm,Animalia,Animalia,Annelida,Clitellata,Haplotaxida,Lumbricidae,Lumbricus,terrestris,00000_Animalia_Annelida_Clitellata_Haplotaxida...
1,1,Sabella spallanzanii,Mediterranean Fanworm,Animalia,Animalia,Annelida,Polychaeta,Sabellida,Sabellidae,Sabella,spallanzanii,00001_Animalia_Annelida_Polychaeta_Sabellida_S...
2,2,Serpula columbiana,Serpula columbiana,Animalia,Animalia,Annelida,Polychaeta,Sabellida,Serpulidae,Serpula,columbiana,00002_Animalia_Annelida_Polychaeta_Sabellida_S...
3,3,Spirobranchus cariniferus,Blue Tube Worm,Animalia,Animalia,Annelida,Polychaeta,Sabellida,Serpulidae,Spirobranchus,cariniferus,00003_Animalia_Annelida_Polychaeta_Sabellida_S...
4,4,Eratigena duellica,Giant House Spider,Arachnids,Animalia,Arthropoda,Arachnida,Araneae,Agelenidae,Eratigena,duellica,00004_Animalia_Arthropoda_Arachnida_Araneae_Ag...


In [34]:
df_categories['specific_epithet'].nunique()

6485

In [74]:
# Create the DF with all info useful for storing:
data = (df_annotations[['image_id', 'category_id']]
        .merge(
            df_images[['id', 'rights_holder', 'width', 'height', 'file_name']],
            right_on='id', left_on='image_id', how='inner')
        .drop('id', axis=1)
        .merge(df_categories[['id', 'phylum', 'class', 'order', 'name']],
               right_on='id', left_on='category_id', how='inner')
        .drop(['id', 'category_id'], axis=1)
        .rename(columns = {'name': 'species',
                 'rights_holder': 'user'})
)

In [75]:
data.to_csv('training_image_info.csv', index=False)

In [76]:
[', '.join(data.columns) ]

['image_id, user, width, height, file_name, phylum, class, order, species']

In [77]:
data.columns

Index(['image_id', 'user', 'width', 'height', 'file_name', 'phylum', 'class',
       'order', 'species'],
      dtype='object')

In [78]:
data

Unnamed: 0,image_id,user,width,height,file_name,phylum,class,order,species
0,0,Ken-ichi Ueda,500,500,train/02912_Animalia_Chordata_Actinopterygii_S...,Chordata,Actinopterygii,Siluriformes,Ameiurus nebulosus
1,20217,103744238038089080957,500,375,train/02912_Animalia_Chordata_Actinopterygii_S...,Chordata,Actinopterygii,Siluriformes,Ameiurus nebulosus
2,20305,sallen,500,334,train/02912_Animalia_Chordata_Actinopterygii_S...,Chordata,Actinopterygii,Siluriformes,Ameiurus nebulosus
3,21063,lselig,375,500,train/02912_Animalia_Chordata_Actinopterygii_S...,Chordata,Actinopterygii,Siluriformes,Ameiurus nebulosus
4,38313,Ray,500,375,train/02912_Animalia_Chordata_Actinopterygii_S...,Chordata,Actinopterygii,Siluriformes,Ameiurus nebulosus
...,...,...,...,...,...,...,...,...,...
2686838,2618323,Alexander Baransky,500,333,train/08880_Plantae_Tracheophyta_Magnoliopsida...,Tracheophyta,Magnoliopsida,Malpighiales,Salix triandra
2686839,2630378,Mykyta Peregrym,333,500,train/08880_Plantae_Tracheophyta_Magnoliopsida...,Tracheophyta,Magnoliopsida,Malpighiales,Salix triandra
2686840,2667669,Alexander Baransky,375,500,train/08880_Plantae_Tracheophyta_Magnoliopsida...,Tracheophyta,Magnoliopsida,Malpighiales,Salix triandra
2686841,2673270,natalia_gamova,500,375,train/08880_Plantae_Tracheophyta_Magnoliopsida...,Tracheophyta,Magnoliopsida,Malpighiales,Salix triandra


In [82]:
# Create the smallest possible DF with all info I need:
data_minimal = data.drop(['width', 'height',
       'file_name'], axis=1)

In [83]:
data_minimal

Unnamed: 0,image_id,user,phylum,class,order,species
0,0,Ken-ichi Ueda,Chordata,Actinopterygii,Siluriformes,Ameiurus nebulosus
1,20217,103744238038089080957,Chordata,Actinopterygii,Siluriformes,Ameiurus nebulosus
2,20305,sallen,Chordata,Actinopterygii,Siluriformes,Ameiurus nebulosus
3,21063,lselig,Chordata,Actinopterygii,Siluriformes,Ameiurus nebulosus
4,38313,Ray,Chordata,Actinopterygii,Siluriformes,Ameiurus nebulosus
...,...,...,...,...,...,...
2686838,2618323,Alexander Baransky,Tracheophyta,Magnoliopsida,Malpighiales,Salix triandra
2686839,2630378,Mykyta Peregrym,Tracheophyta,Magnoliopsida,Malpighiales,Salix triandra
2686840,2667669,Alexander Baransky,Tracheophyta,Magnoliopsida,Malpighiales,Salix triandra
2686841,2673270,natalia_gamova,Tracheophyta,Magnoliopsida,Malpighiales,Salix triandra


In [84]:
images_per_user = data_minimal.groupby('user')['image_id'].count()
categories_per_users = data_minimal.groupby('user')['species'].nunique()
phylum_per_user = data_minimal.groupby('user')['phylum'].nunique()
class_per_user = data_minimal.groupby('user')['class'].nunique()
order_per_user = data_minimal.groupby('user')['order'].nunique()

In [88]:
info  = pd.concat([images_per_user,phylum_per_user, class_per_user, order_per_user, categories_per_users], 
                  axis=1).sort_values('image_id', ascending = False).reset_index().rename(columns={
                    'image_id':'number_of_images',
                    'species':'number_of_unique_species',
                    'phylum':'number_of_unique_phylums',
                    'class':'number_of_unique_classes',
                    'order':'number_of_unique_orders'  
                  })

In [89]:
info

Unnamed: 0,user,number_of_images,number_of_unique_phylums,number_of_unique_classes,number_of_unique_orders,number_of_unique_species
0,James Bailey,7086,13,41,185,3102
1,QuestaGame,6764,10,32,131,1251
2,Reiner Richter,5952,8,23,91,532
3,Sam Kieschnick,5706,9,27,138,1984
4,sea-kangaroo,4829,13,44,191,2262
...,...,...,...,...,...,...
158868,loleksik22,1,1,1,1,1
158869,lolbisante,1,1,1,1,1
158870,lolbeandip,1,1,1,1,1
158871,brennen05,1,1,1,1,1


In [91]:
info.to_csv('training_images_and_categories_per_users.csv')

In [15]:
info['more_than_one_pict'] = info['image_id'] / info['category_id']

In [16]:
info.sort_values('more_than_one_pict', ascending = False)

Unnamed: 0_level_0,image_id,category_id,more_than_one_pict
rights_holder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Marc Nishimoto,102,1,102.0
frecam1973,68,1,68.0
tilapiamap,40,1,40.0
paulamacdonald,38,1,38.0
ralphyk,37,1,37.0
...,...,...,...
deebsthreebs,1,1,1.0
deebsn60,1,1,1.0
deeboswell,1,1,1.0
deeboraester,1,1,1.0


In [27]:
# do some images have more than one classes?
df_annotations.groupby('image_id')['id'].count().sort_values()

image_id
0          1
1791224    1
1791225    1
1791226    1
1791227    1
          ..
895616     1
895617     1
895618     1
895600     1
2686842    1
Name: id, Length: 2686843, dtype: int64

In [24]:
data_minimal.groupby('category_id')['image_id'].count().sort_values()

category_id
9980    152
720     152
1471    152
2065    152
3943    152
       ... 
4179    300
4178    300
4177    300
4189    300
9999    300
Name: image_id, Length: 10000, dtype: int64

In [None]:
# re-do the info csv with phylum and classes

