# Purpose of Preprocessing
- Unify nutrient units

In [4]:
import os
import json
import re

# 0 ) Import data

In [5]:
vit_path = '../output_processed/vitamins/'
min_path = '../output_processed/minerals/'
amino_path = '../output_processed/amino-acids/'

vit_list = [i for i in os.listdir(vit_path) if i !='.DS_Store']
min_list = [i for i in os.listdir(min_path) if i !='.DS_Store']
amino_list = [i for i in os.listdir(amino_path) if i !='.DS_Store']

# 1 ) get_nutri_key
- Get total nutrients name

In [6]:
def get_nutri_key(prod_path, prod_list):
    total_nutri_list = []
    
    for prod in prod_list:
        with open(prod_path+prod, encoding = 'utf-8-sig') as f:
            prod_data = json.load(f)
            prod_nutri_list = list(prod_data['nutrients'].keys())
            total_nutri_list.extend(prod_nutri_list)
            
    total_nutri_list = sorted(list(set(total_nutri_list)))
    return total_nutri_list

In [7]:
vit_nutri_list = get_nutri_key(vit_path, vit_list)
min_nutri_list = get_nutri_key(min_path, min_list)
amino_nutri_list = get_nutri_key(amino_path, amino_list)

In [8]:
total_nutri_list = vit_nutri_list + min_nutri_list + amino_nutri_list # 모든 nutrients 명이 담겨있는 리스트
total_nutri_list = list(set(total_nutri_list))
print(len(total_nutri_list))

['Niacin', 'Dietary_Fiber', 'Valine', 'Folic_acid', 'Vitamin_B12', 'Phenylalanine', 'Selenium', 'Magnesium', 'Vitamin_E', 'Vitamin_C', 'Cholesterol', 'Lysine', 'Iso_Leucine', 'Sodium', 'Methionine', 'Iron', 'Phosphorus', 'Vitamin_K', 'Zinc', 'Fat', 'Histidine', 'Leucine', 'Copper', 'Alpha_Linolenic_Acid', 'Vitamin_D', 'Vitamin_B6', 'Tyrosine', 'Threonine', 'Manganese', 'Linoleic_Acid', 'Carbohydrate', 'Potassium', 'Protein', 'Calcium', 'Vitamin_A']


In [9]:
len(total_nutri_list)

35

# 2 ) get_nutri_unit
- Get list of units of each nutrients

In [10]:
def get_nutri_unit(prod_path, prod_list, nutri_unit_dict):

    for prod in prod_list:

        with open(prod_path+prod, encoding = 'utf-8-sig') as f:
            prod_data = json.load(f)

        for nutri in list(prod_data['nutrients'].keys()):
            if nutri in list(nutri_unit_dict.keys()):
                nutri_unit_dict[nutri].append(prod_data['nutrients'][nutri][1])
            else:
                nutri_unit_dict[nutri] = [prod_data['nutrients'][nutri][1]]
    nutri_unit_dict = {i:list(set(j)) for i,j in nutri_unit_dict.items()}
    return nutri_unit_dict

In [11]:
nutri_unit_dict = dict()
nutri_unit_dict = get_nutri_unit(vit_path, vit_list, nutri_unit_dict)
nutri_unit_dict = get_nutri_unit(min_path, min_list, nutri_unit_dict)
nutri_unit_dict = get_nutri_unit(amino_path, amino_list, nutri_unit_dict)

- diff_units: Nutrients with multiple units

In [12]:
diff_units = {i:j for i,j in nutri_unit_dict.items() if len(j)!=1} # 단위 여러개인 영양소들
diff_units 

{'Fat': ['mg', 'g'],
 'Carbohydrate': ['mg', 'g미만', 'g'],
 'Vitamin_A': ['mg',
  'µg',
  'ugRAE',
  'µgRAE',
  'mcgREA',
  'IU',
  'mgRAE',
  'IU3166IU',
  'mcg',
  'mcgRAE',
  'I'],
 'Vitamin_C': ['mg', 'mg220mg', 'mg11mg', 'g'],
 'Vitamin_D': ['mg',
  'iu',
  'µg',
  'IU25mcg',
  '5',
  'g',
  'IU',
  'mcgIU',
  'mcg400IU',
  'mcg',
  '7',
  'ug',
  'I'],
 'Vitamin_E': ['mg', 'mgAT', 'mg4mg', 'a', 'IU', 'mcg', 'I'],
 'Vitamin_K': ['mg',
  'µg',
  'mcg2000mcg1500mcg11mcg181mcg43mcg',
  'μg',
  'mcg',
  'mcg90mcg80mcg'],
 'Vitamin_B6': ['mg', 'mcg', 'g'],
 'Folic_acid': ['mg',
  'ugDFE',
  'mcgDFE엽산',
  'µgDFE',
  'mcgº',
  'mcgDFE',
  'mcgDEF',
  'µg',
  'mcg엽산',
  'mcgDFE엽산400mcg',
  'mcgDFEL',
  'mgDFE',
  'mcg',
  'mcg식이엽산당량',
  'ug',
  'mcgDFE엽산240mcg'],
 'Vitamin_B12': ['mg', 'µg', 'mcgDFE', 'μg', 'mcg', 'ug'],
 'Calcium': ['mg', 'g', 'mcg', 'mg6mg'],
 'Iron': ['mg', 'mcg', 'g'],
 'Magnesium': ['mg', 'mcg', 'g'],
 'Zinc': ['mg', 'mcg'],
 'Selenium': ['mg', 'µg', 'mcg'],
 'Copper'

- same_units: Nutrients with one unit

In [13]:
same_units = {i:j for i,j in nutri_unit_dict.items() if len(j)==1} # 단위가 하나로 통일된 영양소들
same_units

{'Phosphorus': ['mg'],
 'Tyrosine': ['mg'],
 'Alpha_Linolenic_Acid': ['mg'],
 'Histidine': ['mg'],
 'Threonine': ['mg'],
 'Linoleic_Acid': ['mg']}

# 3 ) Conversion to µg 
- mcg, ug -> µg
- Leave English only

In [14]:
def to_ug(json_data):
    nutri_list = list(json_data['nutrients'].keys())
    for nutri in nutri_list:
        if 'mcg' in json_data['nutrients'][nutri][1] or 'µg' in json_data['nutrients'][nutri][1]:
            json_data['nutrients'][nutri][1] = re.sub('mcg', 'ug', json_data['nutrients'][nutri][1])
            json_data['nutrients'][nutri][1] = re.sub('µg', 'ug', json_data['nutrients'][nutri][1])
        
        else:
            json_data['nutrients'][nutri][1] = re.sub('[^a-zA-Z]', '', json_data['nutrients'][nutri][1])

# 4 ) Conversion between mg and g

### 1. Nutrients in grams

In [15]:
{i:j for i,j in nutri_unit_dict.items() if i in\
                ['Protein','Fat','Carbohydrate','Dietary_Fiber',
                'Phenylalanine','Tyrosine','Linoleic_Acid','Alpha_Linolenic_Acid']} 

{'Fat': ['mg', 'g'],
 'Carbohydrate': ['mg', 'g미만', 'g'],
 'Protein': ['mg', 'g미만', 'g'],
 'Dietary_Fiber': ['mg', 'g미만', 'g'],
 'Tyrosine': ['mg'],
 'Alpha_Linolenic_Acid': ['mg'],
 'Phenylalanine': ['mg', 'g'],
 'Linoleic_Acid': ['mg']}

### 2. Nutrients in milligrams

In [16]:
{i:j for i,j in nutri_unit_dict.items() if i in\
            ['Calcium','Iron','Magnesium','Phosphorus','Potassium','Sodium','Zinc','Copper','Manganese',
            'Vitamin_B6','Vitamin_C','Leucine','Iso_Leucine','Histidine','Lysine','Methionine',
            'Threonine','Valine','Cholesterol']} 

{'Vitamin_C': ['mg', 'mg220mg', 'mg11mg', 'g'],
 'Vitamin_B6': ['mg', 'mcg', 'g'],
 'Calcium': ['mg', 'g', 'mcg', 'mg6mg'],
 'Iron': ['mg', 'mcg', 'g'],
 'Magnesium': ['mg', 'mcg', 'g'],
 'Zinc': ['mg', 'mcg'],
 'Copper': ['mg', '2', 'mcg'],
 'Sodium': ['mg', 'mcg', 'g'],
 'Manganese': ['mg', 'mcg', '3'],
 'Potassium': ['mg', 'mcg'],
 'Phosphorus': ['mg'],
 'Cholesterol': ['mg', 'mg미만', 'g'],
 'Lysine': ['mg', 'g'],
 'Methionine': ['mg', 'g'],
 'Histidine': ['mg'],
 'Iso_Leucine': ['mg', 'g'],
 'Leucine': ['mg', 'g'],
 'Threonine': ['mg'],
 'Valine': ['mg', 'g']}

Nutrients with two units

In [17]:
{i:j for i,j in nutri_unit_dict.items() if (i in\
            ['Calcium','Iron','Magnesium','Phosphorus','Potassium','Sodium','Zinc','Copper','Manganese',
            'Vitamin_B6','Vitamin_C','Leucine','Iso_Leucine','Histidine','Lysine','Methionine',
            'Threonine','Valine','Cholesterol'])&(len(j)==2)} 

{'Zinc': ['mg', 'mcg'],
 'Potassium': ['mg', 'mcg'],
 'Lysine': ['mg', 'g'],
 'Methionine': ['mg', 'g'],
 'Iso_Leucine': ['mg', 'g'],
 'Leucine': ['mg', 'g'],
 'Valine': ['mg', 'g']}

Nutrients with more or less than two units

In [18]:
{i:j for i,j in nutri_unit_dict.items() if (i in\
            ['Calcium','Iron','Magnesium','Phosphorus','Potassium','Sodium','Zinc','Copper','Manganese',
            'Vitamin_B6','Vitamin_C','Leucine','Iso_Leucine','Histidine','Lysine','Methionine',
            'Threonine','Valine','Cholesterol'])&(len(j)!=2)} 

{'Vitamin_C': ['mg', 'mg220mg', 'mg11mg', 'g'],
 'Vitamin_B6': ['mg', 'mcg', 'g'],
 'Calcium': ['mg', 'g', 'mcg', 'mg6mg'],
 'Iron': ['mg', 'mcg', 'g'],
 'Magnesium': ['mg', 'mcg', 'g'],
 'Copper': ['mg', '2', 'mcg'],
 'Sodium': ['mg', 'mcg', 'g'],
 'Manganese': ['mg', 'mcg', '3'],
 'Phosphorus': ['mg'],
 'Cholesterol': ['mg', 'mg미만', 'g'],
 'Histidine': ['mg'],
 'Threonine': ['mg']}

### mg_g_conversion
- g -> mg
- mg -> g

In [19]:
def mg_g_conversion(json_data):
    if 'nutrients' in list(json_data.keys()):
        nutri_list = list(json_data['nutrients'].keys())
        for nutri in nutri_list:

            ## g으로 바꿔줘야하는 애들. mg -> g
            if nutri in ['Protein','Fat','Carbohydrate','Dietary_Fiber']:

                if json_data['nutrients'][nutri][1] == 'g':
                    pass
                
                elif json_data['nutrients'][nutri][1] == 'mg':
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])/1000,3))
                    json_data['nutrients'][nutri][1] = 'g'

                else:
                    del json_data['nutrients'] # 단위 이상하면 걍 삭제해버려..
                    break

            ## mg으로 바꿔줘야하는 애들. g -> mg
            elif nutri in ['Calcium','Iron','Magnesium','Copper','Phosphorus','Potassium','Sodium','Zinc','Manganese',
                'Vitamin_B6','Vitamin_C','Leucine','Iso_Leucine','Histidine','Lysine','Methionine',
                'Threonine','Valine','Cholesterol',
                'Phenylalanine','Tyrosine']:

                if json_data['nutrients'][nutri][1] == 'mg':
                    pass
                
                elif json_data['nutrients'][nutri][1] == 'g':
                    json_data['nutrients'][nutri][0] = str(float(json_data['nutrients'][nutri][0])*1000)
                    json_data['nutrients'][nutri][1] = 'mg'

                elif json_data['nutrients'][nutri][1] == 'mgmg':
                    json_data['nutrients'][nutri][1] = 'mg'

                elif json_data['nutrients'][nutri][1] == 'ug': # Copper: mcg -> mg
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])/1000,3))
                    json_data['nutrients'][nutri][1] = 'mg'

                else:
                    del json_data['nutrients'] # 단위 이상하면 걍 삭제해버려..
                    break

# 5 ) Complex cases
- ex. Vitamins..

In [20]:
{i:j for i,j in nutri_unit_dict.items() if i  not in\
            ['Calcium','Iron','Magnesium','Phosphorus','Potassium','Sodium','Zinc','Copper','Manganese',
            'Vitamin_B6','Vitamin_C','Leucine','Iso_Leucine','Histidine','Lysine','Methionine',
            'Threonine','Valine','Cholesterol','Protein','Fat','Carbohydrate','Dietary_Fiber',
                'Phenylalanine','Tyrosine','Linoleic_Acid','Alpha_Linolenic_Acid']} 

{'Vitamin_A': ['mg',
  'µg',
  'ugRAE',
  'µgRAE',
  'mcgREA',
  'IU',
  'mgRAE',
  'IU3166IU',
  'mcg',
  'mcgRAE',
  'I'],
 'Vitamin_D': ['mg',
  'iu',
  'µg',
  'IU25mcg',
  '5',
  'g',
  'IU',
  'mcgIU',
  'mcg400IU',
  'mcg',
  '7',
  'ug',
  'I'],
 'Vitamin_E': ['mg', 'mgAT', 'mg4mg', 'a', 'IU', 'mcg', 'I'],
 'Vitamin_K': ['mg',
  'µg',
  'mcg2000mcg1500mcg11mcg181mcg43mcg',
  'μg',
  'mcg',
  'mcg90mcg80mcg'],
 'Folic_acid': ['mg',
  'ugDFE',
  'mcgDFE엽산',
  'µgDFE',
  'mcgº',
  'mcgDFE',
  'mcgDEF',
  'µg',
  'mcg엽산',
  'mcgDFE엽산400mcg',
  'mcgDFEL',
  'mgDFE',
  'mcg',
  'mcg식이엽산당량',
  'ug',
  'mcgDFE엽산240mcg'],
 'Vitamin_B12': ['mg', 'µg', 'mcgDFE', 'μg', 'mcg', 'ug'],
 'Selenium': ['mg', 'µg', 'mcg'],
 'Niacin': ['mg', 'mgnE', 'mgNE']}

In [21]:
def other_unit_conversion(json_data):
    if 'nutrients' in list(json_data.keys()):
        nutri_list = list(json_data['nutrients'].keys())
        for nutri in nutri_list:

            # 1. Vitamin_A (ug)
            if nutri == 'Vitamin_A': # ['IU', 'IUIU', 'gREA', 'mg', 'gRAE', 'mgRAE', 'g', 'I']
                if json_data['nutrients'][nutri][1] in ['ug', 'ugRAE', 'ug RAE']:
                    pass
                
                elif json_data['nutrients'][nutri][1] in ['IU', 'IUIU','I']: # IU -> ug
                    json_data['nutrients'][nutri][0] = str(float(json_data['nutrients'][nutri][0])*0.33)
                
                elif json_data['nutrients'][nutri][1] in ['gREA','gRAE','g']: # g -> ug
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*10**(-6),3))
                
                elif json_data['nutrients'][nutri][1] in ['gREA', 'mg', 'gRAE', 'mgRAE', 'g']: # mg -> ug
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*10**(-3),3))
                
                else:
                    del json_data['nutrients']
                    break
                
                try:
                    json_data['nutrients'][nutri][1] = 'ugRAE'
                
                except:
                    pass

            # 2. Vitamin_D (ug)
            elif nutri == 'Vitamin_D': #['', 'iu', 'IU', 'gIU', 'IUg', 'mg', 'g', 'I']
                if json_data['nutrients'][nutri][1] == 'ug':
                    pass
                
                elif json_data['nutrients'][nutri][1] in ['iu', 'IU','I','gIU', 'IUg']: # IU -> ug
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*0.025,3))

                elif json_data['nutrients'][nutri][1] == 'mg': # mg -> ug
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*10**(3),3))

                elif json_data['nutrients'][nutri][1] == 'g': # g -> ug
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*10**(6),3))
                
                else:
                    del json_data['nutrients']
                    break     
                
                try:
                    json_data['nutrients'][nutri][1] = 'ug'
                
                except:
                    pass

            # 3. Vitamin_E (mg α-TE)
            elif nutri == 'Vitamin_E': # ['a', 'IU', 'mgmg', 'mgAT', 'mg', 'g', 'I']
                if json_data['nutrients'][nutri][1] == 'mg α-TE':
                    pass
                
                elif json_data['nutrients'][nutri][1] in ['IU','I']: # IU -> mg 
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*0.67,3))
                
                elif json_data['nutrients'][nutri][1] == 'g': # g -> mg
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*10**3,3))
                
                elif json_data['nutrients'][nutri][1] == 'a':
                    del json_data['nutrients']
                    break
                
                try:
                    json_data['nutrients'][nutri][1] = "mg α-TE"
                
                except:
                    pass
                

            # 4. Vitamin_K (ug)
            elif nutri == 'Vitamin_K': # ['gggggg', 'mg', 'ggg', 'g']
                if json_data['nutrients'][nutri][1] == 'ug':
                    pass
                
                elif json_data['nutrients'][nutri][1] == 'mg': # mg to ug
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*10**3,3))
                
                elif json_data['nutrients'][nutri][1] in ['gggggg', 'ggg','g']: # g to ug
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*10**6,3))

                else:
                    del json_data['nutrients']
                    break
                
                try:
                    json_data['nutrients'][nutri][1] = 'ug'
                
                except:
                    pass

            # 5. Folic_acid (µg DFE)
            elif nutri == 'Folic_acid': # ['gDFEL', 'mg', 'gDFEg', 'gDEF', 'gDFE', 'mgDFE', 'g']
                if json_data['nutrients'][nutri][1] in ['ug DFE', 'ugDFE', 'ug']:
                    pass
                
                elif json_data['nutrients'][nutri][1] in ['gDFEL', 'gDFEg', 'gDEF', 'gDFE']:
                    del json_data['nutrients']
                    break
                
                elif json_data['nutrients'][nutri][1] == 'mg': # mg to ug DFE
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*2*10**3,3))
                
                elif json_data['nutrients'][nutri][1] == 'g':
                    del json_data['nutrients']
                    break
                
                else:
                    del json_data['nutrients']
                    break

                try:
                    json_data['nutrients'][nutri][1] = 'ugDFE'
                    json_data['nutrients']['Folate'] = json_data['nutrients'].pop(nutri)
                
                except:
                    pass
                

            # 6. Vitamin_B12 (µg)
            elif nutri == 'Vitamin_B12': # ['gDFE', 'mg', 'g']
                if json_data['nutrients'][nutri][1] == 'ug':
                    pass
                
                elif json_data['nutrients'][nutri][1] == 'mg': # mg to ug
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*10**3,3))
                
                elif json_data['nutrients'][nutri][1] == 'g': # g to ug
                    del json_data['nutrients']
                    break

                else:
                    del json_data['nutrients']
                    break
                
                try:
                    json_data['nutrients'][nutri][1] = 'ug'
                
                except:
                    pass

            # 7. Selenium (µg)
            elif nutri == 'Selenium': # ['mg','g']
                if json_data['nutrients'][nutri][1] == 'ug':
                    pass

                elif json_data['nutrients'][nutri][1] == 'mg': # mg to ug
                    json_data['nutrients'][nutri][0] = str(round(float(json_data['nutrients'][nutri][0])*10**3,3))
                elif json_data['nutrients'][nutri][1] == 'g': # g to ug
                    del json_data['nutrients']
                    break

                else:
                    del json_data['nutrients']
                    break
                
                try:
                    json_data['nutrients'][nutri][1] = 'ug'
                
                except:
                    pass

            # 8. Niacin
            elif nutri == 'Niacin': # ['mgnE', 'mgNE', 'mg']
                json_data['nutrients'][nutri][1] = 'mgNE'

# 6 ) Apply conversion

In [22]:
def apply_unit_conversion(prod_path, prod_list, prod_new_path):
    for prod in prod_list:

        with open(prod_path+prod, encoding = 'utf-8-sig') as f:
            prod_data = json.load(f)
        
        to_ug(prod_data)
        mg_g_conversion(prod_data)
        other_unit_conversion(prod_data)

        if 'nutrients' in prod_data.keys():
            if prod_path == amino_path:
                amino_selection = ['Leucine', 'Iso_Leucine', 'Histidine', 'Lysine',
                                    'Methionine', 'Phenylalanine', 'Tyrosine', 'Threonine', 'Valine']
                amino_nutri_list = list(prod_data['nutrients'].keys())

                if sum([amino_nutri_list.count(i) for i in amino_selection]) > 0:
                    with open(prod_new_path+'iherb_{}.json'.format(prod_data['prod_cd']), 'w',
                                        encoding = 'UTF-8-sig') as new_file:
                        new_file.write(json.dumps(prod_data, ensure_ascii=False))

            else: 
                with open(prod_new_path+'iherb_{}.json'.format(prod_data['prod_cd']), 'w',
                                        encoding = 'UTF-8-sig') as new_file:
                    new_file.write(json.dumps(prod_data, ensure_ascii=False)) 

Test one example to check functions work properly

In [23]:
with open(amino_path+amino_list[3], encoding = 'utf-8-sig') as f:
    prod_data = json.load(f)

to_ug(prod_data)
print(prod_data)

mg_g_conversion(prod_data)
print(prod_data)

other_unit_conversion(prod_data)
print(prod_data)

{'url': 'https://kr.iherb.com/pr/source-naturals-l-tryptophan-1-000-mg-90-tablets/63430', 'title': 'Source Naturals, L-트립토판, 1,000mg, 90정', 'prod_cd': 'SNS-02610', 'price': '39576', 'nutrients': {'Vitamin_B6': ['20', 'mg'], 'Iron': ['390', 'ug']}, 'serving': ['1', 'Tablet']}
{'url': 'https://kr.iherb.com/pr/source-naturals-l-tryptophan-1-000-mg-90-tablets/63430', 'title': 'Source Naturals, L-트립토판, 1,000mg, 90정', 'prod_cd': 'SNS-02610', 'price': '39576', 'nutrients': {'Vitamin_B6': ['20', 'mg'], 'Iron': ['0.39', 'mg']}, 'serving': ['1', 'Tablet']}
{'url': 'https://kr.iherb.com/pr/source-naturals-l-tryptophan-1-000-mg-90-tablets/63430', 'title': 'Source Naturals, L-트립토판, 1,000mg, 90정', 'prod_cd': 'SNS-02610', 'price': '39576', 'nutrients': {'Vitamin_B6': ['20', 'mg'], 'Iron': ['0.39', 'mg']}, 'serving': ['1', 'Tablet']}


Save processed data to new directories

In [24]:
vit_new_path = '../output_processed2/vitamins/'
apply_unit_conversion(vit_path, vit_list, vit_new_path)

In [25]:
min_new_path = '../output_processed2/minerals/'
apply_unit_conversion(min_path, min_list, min_new_path)

In [26]:
amino_new_path = '../output_processed2/amino-acids/'
apply_unit_conversion(amino_path, amino_list, amino_new_path)