JSON

In [3]:
# Start from JSON held as text (a JSON string):
obj = """ 
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"], 
 "pet": null,
 "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
              {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""

In [4]:
# Parse the JSON string into Python objects (dicts, lists, None for null).
import json

result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
  {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]}

In [5]:
# Build the same Python objects from a JSON file.
# The `with` block closes the file handle as soon as parsing is done, and the
# explicit encoding makes the read independent of the platform default.
with open("../data/json_ex.json", encoding="utf-8") as json_file:
    result_open = json.load(json_file)
result_open

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
  {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]}

In [6]:
import pandas as pd

# Turn the list of sibling dicts into a table, keeping only the "name" and
# "age" keys as columns (the "pet" key is left out by the `columns` argument).
pd.DataFrame(result['siblings'], columns = ["name", "age"])

Unnamed: 0,name,age
0,Scott,25
1,Katie,33


Exemple à partir des données FoodData Central (USDA) du gouvernement américain

In [7]:
# Download the dataset from
# https://fdc.nal.usda.gov/fdc-datasets/FoodData_Central_foundation_food_json_2023-10-26.zip
# unzip it, put the file in the data directory and rename it to food_fact.json.
# The file is opened in a `with` block so the handle is closed once parsing
# ends, and it is read as UTF-8 explicitly: the records contain non-ASCII text
# (e.g. the unit 'µg') that a platform-default encoding could mis-decode.
with open("../data/food_fact.json", encoding="utf-8") as food_file:
    db = json.load(food_file)

In [8]:
# Inspect the first record of the 'FoundationFoods' list to see its structure.
db['FoundationFoods'][0]

{'foodClass': 'FinalFood',
 'description': 'Hummus, commercial',
 'foodNutrients': [{'type': 'FoodNutrient',
   'id': 2219707,
   'nutrient': {'id': 1120,
    'number': '334',
    'name': 'Cryptoxanthin, beta',
    'rank': 7460,
    'unitName': 'µg'},
   'dataPoints': 1,
   'foodNutrientDerivation': {'code': 'A',
    'description': 'Analytical',
    'foodNutrientSource': {'id': 1,
     'code': '1',
     'description': 'Analytical or derived from analytical'}},
   'median': 3.0,
   'amount': 3.0},
  {'type': 'FoodNutrient',
   'id': 2219708,
   'nutrient': {'id': 1122,
    'number': '337',
    'name': 'Lycopene',
    'rank': 7530,
    'unitName': 'µg'},
   'dataPoints': 1,
   'foodNutrientDerivation': {'code': 'A',
    'description': 'Analytical',
    'foodNutrientSource': {'id': 1,
     'code': '1',
     'description': 'Analytical or derived from analytical'}},
   'median': 0.0,
   'amount': 0.0},
  {'type': 'FoodNutrient',
   'id': 2219709,
   'nutrient': {'id': 1127,
    'number': '3

In [9]:
# Look for the foods whose description mentions eggs (case-insensitive match
# on the substring "egg"), then count how many were found.
elts_with_egg = [
    food
    for food in db["FoundationFoods"]
    if "egg" in food["description"].lower()
]
len(elts_with_egg)

9

In [10]:
# Keep the first matching record for a closer look.
raw_egg = elts_with_egg[0]
raw_egg

{'foodClass': 'FinalFood',
 'description': 'Egg, whole, raw, frozen, pasteurized',
 'foodNutrients': [{'type': 'FoodNutrient',
   'id': 2229844,
   'nutrient': {'id': 1090,
    'number': '304',
    'name': 'Magnesium, Mg',
    'rank': 5500,
    'unitName': 'mg'},
   'dataPoints': 14,
   'foodNutrientDerivation': {'code': 'A',
    'description': 'Analytical',
    'foodNutrientSource': {'id': 1,
     'code': '1',
     'description': 'Analytical or derived from analytical'}},
   'max': 12.4,
   'min': 10.1,
   'median': 11.2,
   'amount': 11.2},
  {'type': 'FoodNutrient',
   'id': 2229845,
   'nutrient': {'id': 1091,
    'number': '305',
    'name': 'Phosphorus, P',
    'rank': 5600,
    'unitName': 'mg'},
   'dataPoints': 14,
   'foodNutrientDerivation': {'code': 'A',
    'description': 'Analytical',
    'foodNutrientSource': {'id': 1,
     'code': '1',
     'description': 'Analytical or derived from analytical'}},
   'max': 208,
   'min': 178,
   'median': 188,
   'amount': 189},
  {'ty

In [40]:
# Which nutrients are described in the JSON record above?
# Each entry of "foodNutrients" holds a "nutrient" dict; collect the "name"
# of every one of them into a list.
[
    entry["nutrient"]["name"]
    for entry in raw_egg["foodNutrients"]
]

['Magnesium, Mg',
 'Phosphorus, P',
 'Manganese, Mn',
 'Potassium, K',
 'Zinc, Zn',
 'Sodium, Na',
 'Iron, Fe',
 'Energy',
 'Protein',
 'Ash',
 'Vitamin D2 (ergocalciferol)',
 'Water',
 'Calcium, Ca',
 'Total lipid (fat)',
 'Carbohydrate, by difference',
 'Energy',
 'Copper, Cu',
 'Vitamin D3 (cholecalciferol)',
 'Vitamin D (D2 + D3)',
 'Vitamin D (D2 + D3), International Units',
 'Cholesterol',
 '25-hydroxycholecalciferol',
 'Nitrogen',
 'Iodine, I']

In [58]:
import pandas as pd
# One row per entry of raw_egg["foodNutrients"]; nested dicts (e.g. the
# "nutrient" field) are kept as dict-valued cells rather than being flattened.
nutrients_info_in_egg = pd.DataFrame(raw_egg["foodNutrients"])
nutrients_info_in_egg

Unnamed: 0,type,id,nutrient,dataPoints,foodNutrientDerivation,max,min,median,amount
0,FoodNutrient,2229844,"{'id': 1090, 'number': '304', 'name': 'Magnesi...",14.0,"{'code': 'A', 'description': 'Analytical', 'fo...",12.4,10.1,11.2,11.2
1,FoodNutrient,2229845,"{'id': 1091, 'number': '305', 'name': 'Phospho...",14.0,"{'code': 'A', 'description': 'Analytical', 'fo...",208.0,178.0,188.0,189.0
2,FoodNutrient,2229846,"{'id': 1101, 'number': '315', 'name': 'Mangane...",14.0,"{'code': 'A', 'description': 'Analytical', 'fo...",0.0,0.0,0.0,0.0
3,FoodNutrient,2229847,"{'id': 1092, 'number': '306', 'name': 'Potassi...",14.0,"{'code': 'A', 'description': 'Analytical', 'fo...",125.0,109.0,117.0,117.0
4,FoodNutrient,2229848,"{'id': 1095, 'number': '309', 'name': 'Zinc, Z...",14.0,"{'code': 'A', 'description': 'Analytical', 'fo...",1.32,1.06,1.21,1.2
5,FoodNutrient,2229849,"{'id': 1093, 'number': '307', 'name': 'Sodium,...",14.0,"{'code': 'A', 'description': 'Analytical', 'fo...",137.0,100.0,121.0,121.0
6,FoodNutrient,2229850,"{'id': 1089, 'number': '303', 'name': 'Iron, F...",14.0,"{'code': 'A', 'description': 'Analytical', 'fo...",2.01,1.56,1.76,1.77
7,FoodNutrient,2229851,"{'id': 1062, 'number': '268', 'name': 'Energy'...",,"{'code': 'NC', 'description': 'Calculated', 'f...",,,,627.0
8,FoodNutrient,2229852,"{'id': 1003, 'number': '203', 'name': 'Protein...",,"{'code': 'NC', 'description': 'Calculated', 'f...",12.6,11.8,12.3,12.3
9,FoodNutrient,2229853,"{'id': 1007, 'number': '207', 'name': 'Ash', '...",14.0,"{'code': 'A', 'description': 'Analytical', 'fo...",1.28,1.0,1.18,1.16


In [61]:
# The table above is hard to read because "nutrient" is a nested dict.
# Build a DataFrame from the "nutrient" dicts alone, so that each of their
# keys becomes a column.
egg_nutrients = pd.DataFrame(
    [entry["nutrient"] for entry in raw_egg["foodNutrients"]]
)
egg_nutrients

Unnamed: 0,id,number,name,rank,unitName
0,1090,304,"Magnesium, Mg",5500,mg
1,1091,305,"Phosphorus, P",5600,mg
2,1101,315,"Manganese, Mn",6100,mg
3,1092,306,"Potassium, K",5700,mg
4,1095,309,"Zinc, Zn",5900,mg
5,1093,307,"Sodium, Na",5800,mg
6,1089,303,"Iron, Fe",5400,mg
7,1062,268,Energy,400,kJ
8,1003,203,Protein,600,g
9,1007,207,Ash,1000,g


In [62]:
# The associated quantity is missing: add it as a new "amount" column.
# The assignment aligns on the index; both frames were built from
# raw_egg["foodNutrients"] in the same order, so row i of one matches row i of
# the other.
egg_nutrients["amount"] = nutrients_info_in_egg["amount"]
egg_nutrients

Unnamed: 0,id,number,name,rank,unitName,amount
0,1090,304,"Magnesium, Mg",5500,mg,11.2
1,1091,305,"Phosphorus, P",5600,mg,189.0
2,1101,315,"Manganese, Mn",6100,mg,0.0
3,1092,306,"Potassium, K",5700,mg,117.0
4,1095,309,"Zinc, Zn",5900,mg,1.2
5,1093,307,"Sodium, Na",5800,mg,121.0
6,1089,303,"Iron, Fe",5400,mg,1.77
7,1062,268,Energy,400,kJ,627.0
8,1003,203,Protein,600,g,12.3
9,1007,207,Ash,1000,g,1.16


In [45]:
# Back to the full `db` JSON: one row per food, keeping only the three listed
# keys as columns.
info = pd.DataFrame(db["FoundationFoods"], columns=["description", "foodCategory", "scientificName"])
info

Unnamed: 0,description,foodCategory,scientificName
0,"Hummus, commercial",{'description': 'Legumes and Legume Products'},
1,"Tomatoes, grape, raw",{'description': 'Vegetables and Vegetable Prod...,Solanum lycopersicum
2,"Beans, snap, green, canned, regular pack, drai...",{'description': 'Vegetables and Vegetable Prod...,
3,"Frankfurter, beef, unheated",{'description': 'Sausages and Luncheon Meats'},
4,"Nuts, almonds, dry roasted, with salt added",{'description': 'Nut and Seed Products'},
...,...,...,...
260,"Cheese, pasteurized process cheese food or pro...",{'description': 'Dairy and Egg Products'},
261,"Cheese, provolone, sliced",{'description': 'Dairy and Egg Products'},
262,"Cheese, oaxaca, solid",{'description': 'Dairy and Egg Products'},
263,"Cheese, queso fresco, solid",{'description': 'Dairy and Egg Products'},


In [46]:
# Replace each dict in the foodCategory column by its "description" value, so
# the column holds a plain label instead of a dictionary.
# Only dict cells are unwrapped: anything else (a label already extracted by a
# previous run of this cell, or a missing value) is left untouched. This makes
# the cell safe to re-run and tolerant of records without a category.
info["foodCategory"] = info["foodCategory"].apply(
    lambda cat: cat["description"] if isinstance(cat, dict) else cat
)
info


Unnamed: 0,description,foodCategory,scientificName
0,"Hummus, commercial",Legumes and Legume Products,
1,"Tomatoes, grape, raw",Vegetables and Vegetable Products,Solanum lycopersicum
2,"Beans, snap, green, canned, regular pack, drai...",Vegetables and Vegetable Products,
3,"Frankfurter, beef, unheated",Sausages and Luncheon Meats,
4,"Nuts, almonds, dry roasted, with salt added",Nut and Seed Products,
...,...,...,...
260,"Cheese, pasteurized process cheese food or pro...",Dairy and Egg Products,
261,"Cheese, provolone, sliced",Dairy and Egg Products,
262,"Cheese, oaxaca, solid",Dairy and Egg Products,
263,"Cheese, queso fresco, solid",Dairy and Egg Products,


In [47]:
# Number of foods per category, most frequent first.
# Counting on the column itself avoids passing a Series as the `subset`
# argument of DataFrame.value_counts, which expects column labels.
info["foodCategory"].value_counts()

foodCategory
Vegetables and Vegetable Products    52
Dairy and Egg Products               42
Legumes and Legume Products          37
Fruits and Fruit Juices              31
Cereal Grains and Pasta              27
Nut and Seed Products                19
Beef Products                        12
Fats and Oils                         9
Poultry Products                      7
Sausages and Luncheon Meats           6
Pork Products                         5
Restaurant Foods                      4
Finfish and Shellfish Products        3
Beverages                             3
Baked Products                        3
Soups, Sauces, and Gravies            2
Spices and Herbs                      2
Sweets                                1
Name: count, dtype: int64

In [54]:
# Big task: gather the nutrient rows of every food into a single table.
# Each food's "foodNutrients" list becomes one DataFrame whose `id` column is
# set to the food's "fdcId" (replacing the per-row id), then all the per-food
# frames are stacked under a fresh 0..n-1 index.
nutrients = pd.concat(
    [
        pd.DataFrame(food["foodNutrients"]).assign(id=food["fdcId"])
        for food in db["FoundationFoods"]
    ],
    ignore_index=True,
)
nutrients

Unnamed: 0,type,id,nutrient,dataPoints,foodNutrientDerivation,median,amount,max,min,footnote
0,FoodNutrient,321358,"{'id': 1120, 'number': '334', 'name': 'Cryptox...",1.0,"{'code': 'A', 'description': 'Analytical', 'fo...",3.00,3.0,,,
1,FoodNutrient,321358,"{'id': 1122, 'number': '337', 'name': 'Lycopen...",1.0,"{'code': 'A', 'description': 'Analytical', 'fo...",0.00,0.0,,,
2,FoodNutrient,321358,"{'id': 1127, 'number': '343', 'name': 'Tocophe...",6.0,"{'code': 'A', 'description': 'Analytical', 'fo...",1.21,1.3,2.47,0.26,
3,FoodNutrient,321358,"{'id': 1130, 'number': '346', 'name': 'Tocotri...",6.0,"{'code': 'A', 'description': 'Analytical', 'fo...",0.00,0.0,0.00,0.00,
4,FoodNutrient,321358,"{'id': 1131, 'number': '347', 'name': 'Tocotri...",6.0,"{'code': 'A', 'description': 'Analytical', 'fo...",0.00,0.0,0.00,0.00,
...,...,...,...,...,...,...,...,...,...,...
12908,FoodNutrient,2647443,"{'id': 2066, 'number': '333', 'name': 'Vitamin...",,{'foodNutrientSource': {}},,,,,
12909,FoodNutrient,2647443,"{'id': 1114, 'number': '328', 'name': 'Vitamin...",,"{'code': 'NC', 'description': 'Calculated', 'f...",,0.0,,,
12910,FoodNutrient,2647443,"{'id': 2047, 'number': '957', 'name': 'Energy ...",,"{'code': 'NC', 'description': 'Calculated', 'f...",,351.0,,,
12911,FoodNutrient,2647443,"{'id': 2048, 'number': '958', 'name': 'Energy ...",,"{'code': 'NC', 'description': 'Calculated', 'f...",,352.0,,,


In [51]:
# Is the "type" column useful? Count its distinct values.
# Bracket access is used rather than attribute access, and the count is taken
# on the column itself instead of passing a Series as `subset` to
# DataFrame.value_counts, which expects column labels.
nutrients["type"].value_counts()

type
FoodNutrient    12913
Name: count, dtype: int64

In [55]:
# Drop the columns that are not needed for the rest of the exploration.
# errors="ignore" skips labels that are absent, so the cell can be re-run
# (the columns are already gone the second time) and also works on a dataset
# that has no "footnote" column.
nutrients = nutrients.drop(
    columns=["type", "footnote", "min", "max", "median"],
    errors="ignore",
)
nutrients

Unnamed: 0,id,nutrient,dataPoints,foodNutrientDerivation,amount
0,321358,"{'id': 1120, 'number': '334', 'name': 'Cryptox...",1.0,"{'code': 'A', 'description': 'Analytical', 'fo...",3.0
1,321358,"{'id': 1122, 'number': '337', 'name': 'Lycopen...",1.0,"{'code': 'A', 'description': 'Analytical', 'fo...",0.0
2,321358,"{'id': 1127, 'number': '343', 'name': 'Tocophe...",6.0,"{'code': 'A', 'description': 'Analytical', 'fo...",1.3
3,321358,"{'id': 1130, 'number': '346', 'name': 'Tocotri...",6.0,"{'code': 'A', 'description': 'Analytical', 'fo...",0.0
4,321358,"{'id': 1131, 'number': '347', 'name': 'Tocotri...",6.0,"{'code': 'A', 'description': 'Analytical', 'fo...",0.0
...,...,...,...,...,...
12908,2647443,"{'id': 2066, 'number': '333', 'name': 'Vitamin...",,{'foodNutrientSource': {}},
12909,2647443,"{'id': 1114, 'number': '328', 'name': 'Vitamin...",,"{'code': 'NC', 'description': 'Calculated', 'f...",0.0
12910,2647443,"{'id': 2047, 'number': '957', 'name': 'Energy ...",,"{'code': 'NC', 'description': 'Calculated', 'f...",351.0
12911,2647443,"{'id': 2048, 'number': '958', 'name': 'Energy ...",,"{'code': 'NC', 'description': 'Calculated', 'f...",352.0


In [57]:
# Look at one cell of the "nutrient" column: it still holds a nested dict.
nutrients.loc[0, "nutrient"]

{'id': 1120,
 'number': '334',
 'name': 'Cryptoxanthin, beta',
 'rank': 7460,
 'unitName': 'µg'}

In [None]:
# Chantier à poursuivre
# Idées d'exploration : 
# - finir de constituer le dataframe des nutriments, en supprimant les doublons
# - faire une jointure avec le dataframe des groupes constitués plus haut
# - représenter graphiquement la quantité présente d'un nutriment donné, pour chaque groupe
# - représenter graphiquement dans quel aliment on trouve la plus grande quantité de chaque nutriment