# Purpose of Preprocessing
- Select only the nutrients to use
- Unify nutrient names
- Unify serving unit names

In [1]:
import os
import json
import re

# 0 ) Import data

In [8]:
# Input directory and file names for each product category.
# macOS '.DS_Store' entries are left out of every listing.
vit_path = '../output/vitamins/'
vit_list = [fname for fname in os.listdir(vit_path) if fname != '.DS_Store']

min_path = '../output/minerals/'
min_list = [fname for fname in os.listdir(min_path) if fname != '.DS_Store']

amino_path = '../output/amino-acids/'
amino_list = [fname for fname in os.listdir(amino_path) if fname != '.DS_Store']

In [None]:
# Number of product files per category: vitamins, minerals, amino acids.
print(*map(len, (vit_list, min_list, amino_list)))

In [None]:
# Total number of product files across the three categories.
print(sum(map(len, (vit_list, min_list, amino_list))))

# 1 ) Unify nutrient names

### 1.1) get_nutri_key
- Collect every nutrient name that appears in the data
- Check for scraping errors

In [13]:
def get_nutri_key(prod_path, prod_list):
    """Collect the distinct nutrient names used across a set of product files.

    Args:
        prod_path: Path prefix that is concatenated with each file name.
        prod_list: File names of the product JSON files to read.

    Returns:
        Sorted list of the unique keys found under each product's
        ``'nutrients'`` mapping.

    A product whose ``'nutrients'`` entry is missing or is not a mapping is
    counted as an error: its ``prod_cd`` (or ``None`` when absent) is printed,
    and the total error count is printed once after all files are read.
    """
    nutrient_names = set()
    error_cnt = 0

    for prod in prod_list:
        # Keep the file open only for as long as parsing needs it.
        with open(prod_path + prod, encoding='utf-8-sig') as f:
            prod_data = json.load(f)

        try:
            nutrient_names.update(prod_data['nutrients'].keys())
        except (KeyError, AttributeError, TypeError):
            # Missing key, or a non-mapping value such as an error marker string.
            error_cnt += 1
            prod_cd = prod_data.get('prod_cd') if isinstance(prod_data, dict) else None
            print(prod_cd)
    print(error_cnt)  # To check scrap error

    return sorted(nutrient_names)

In [None]:
# Nutrient names per category, collected in the order vitamins, minerals, amino acids.
vit_nutri_list, min_nutri_list, amino_nutri_list = [
    get_nutri_key(path, files)
    for path, files in (
        (vit_path, vit_list),
        (min_path, min_list),
        (amino_path, amino_list),
    )
]

- Create "total_nutri_list", which holds every distinct nutrient name

In [None]:
# List holding every nutrient name from the three categories, de-duplicated.
total_nutri_list = list(set(vit_nutri_list + min_nutri_list + amino_nutri_list))
print(len(total_nutri_list))

- Query nutrient name within total_nutri_list
- Create a list of analogous nutrient names for each nutrient in Excel

In [None]:
# Nutrient names written in English (case-insensitive substring match)
[name for name in total_nutri_list if 'vitamind' in name.lower()]
# ['VitaminD', 'VitaminD-3', 'VitaminD2', 'VitaminD3', 'VitaminD35', 'VitaminDOrganic', 'VitaminDs']

In [None]:
# Nutrient names written in Korean (substring match)
[name for name in total_nutri_list if '단백질' in name]
# ['단백질', '단백질최대', '분리대두단백질및제공성분', '쌀단백질농축물', '완두콩단백질', '통현미단백질', '현미단백질']

## 1.2) change_nutri_nm
- Delete the 'Calories' key
- Delete every other key that is not one of the nutrients to keep
- Unify nutrient names

In [21]:
# Canonical nutrient name -> every spelling that is folded into it.
_NUTRIENT_ALIASES = {
    'Protein': (
        'Protein', 'RiceProteinConcentrate', '단백질', '단백질최대', '분리대두단백질및제공성분',
        '쌀단백질농축물', '완두콩단백질', '통현미단백질', '현미단백질',
    ),
    'Fat': (
        'Fat', 'TotalFat', 'EssentialFattyAcid-RichWholeFoodEnergy',
        'EssentialFattyAcidComplex', '총지방',
    ),
    'Carbohydrate': (
        'Carbohydrate', 'TotalCarb', 'TotalCarbohydrate', 'TotalCarbohydrates',
        '총탄수화물', '탄수화물',
    ),
    'Dietary_Fiber': ('DietaryFiber', '식이섬유', '프리바이오틱식이섬유'),
    'Calcium': (
        'Calcium', 'CoralCalcium', '구연산칼슘', '미결정하이드록시아파타이트칼슘',
        '미세결정질칼슘하이드록시아파타이트', '산호칼슘', '유기농식물칼슘혼합물유기농해조류', '칼슘',
        '칼슘D-글루카레이트', '칼슘프룩토보레이트', '칼슘프룩토보레이트보론6mg공급', '화석화된산호칼슘',
    ),
    'Iron': ('Iron', '철', '철분'),
    'Magnesium': (
        'Magnesium', 'PureHighGradeIonicMagnesium', '마그네슘', '마그네슘L-트레오네이트',
        '마그네슘말레이트트라이하이드레이트', '마그네슘말산염트라이하이드레이트', '마그네슘원소',
        '순수고급이온성마그네슘', '원소마그네슘',
    ),
    'Phosphorus': ('Phosphorus', '인'),
    'Potassium': ('Potassium', '구연산칼륨', '칼륨'),
    'Sodium': ('Sodium', '나트륨'),
    'Zinc': ('Zinc', '산화아연', '아연', '아연-L-카르노신', '아연L-카르노신'),
    'Copper': ('Copper', '구리'),
    'Manganese': ('Manganese', '구연산망간', '망간'),
    'Selenium': ('Selenium', '셀레늄'),
    'Vitamin_A': (
        'VitaminA', 'VitaminA100', '비타민A', '전체식품영양소비타민A', '카로티노이드함유비타민A',
    ),
    'Vitamin_D': (
        'VitaminD', 'VitaminD-3', 'VitaminD2', 'VitaminD3', 'VitaminD35',
        'VitaminDOrganic', 'VitaminDs',
        '비타민D', '비타민D-3', '비타민D2', '비타민D25mcg', '비타민D3',
    ),
    'Niacin': ('Niacin', '나이아신', '비타민B-3나이아신'),
    'Folic_acid': (
        'Folic', 'FolicAcid', 'Folicacid', 'Folate', '엽산', '엽산C', '엽산염', '엽산해당량',
    ),
    'Vitamin_B12': (
        'VitaminB-12', 'VitaminB-12asCyanocobalamin', 'VitaminB12', '비타민B-12', '비타민B12',
    ),
    'Vitamin_B6': ('VitaminB-6', 'VitaminB6', '비타민B-6', '비타민B6'),
    'Vitamin_C': (
        'VitaminC', 'VitaminCSupportBase', 'VitaminCasPureWay-C', 'VitaminCfromPureway-C',
        'Living비타민C복합체유기농아세로라체리', 'PureWay-C에서추출한비타민C', '비타민C',
        '비타민C보조물', '퓨어웨이-C에서추출한비타민C',
    ),
    'Vitamin_E': (
        'VitaminE', '감마-비타민E복합체', '비타민E', '비타민E함유', '비타민E혼합토코페롤',
        '토코페롤및토코트리에놀함유비타민E',
    ),
    'Vitamin_K': (
        'VitaminK', 'VitaminK-2', 'VitaminK1', 'VitaminK2', 'VitaminKactivityfrom',
        '비타민K', '비타민K-1', '비타민K-2', '비타민K1', '비타민K12', '비타민K2', '비타민K2-7',
        '비타민K2MK-4', '비타민K2MK-7', '비타민KK-2', '비타민KK1', '비타민K비타민K2',
        '비타민K활성량비타민K종류', '비타민K활성성분',
    ),
    # '이소류신' is deliberately NOT listed here: it belongs to Iso_Leucine below.
    'Leucine': (
        'Leucine', 'InstantlySolubleBCAAsL-Leucine', 'L-Leucine',
        'ProprietaryBranchedChainAminoAcidComplexL-Leucine',
        'AjiPureMP9EAAMatrixL-류신', 'L-류신', '류신', '류신펩티드', '미분화된L-류신',
        '즉시용해되는BCAAL-류신',
    ),
    'Iso_Leucine': (
        'Isoleucine', 'L-Isoleucine', 'L-아이소류신', 'L-이소류신', '미분화된L-이소류신',
        '아이소류신', '이소류신',
    ),
    'Histidine': (
        'Histidine', 'L-HistidineHCL', 'L-히스티딘', 'L-히스티딘HCI', 'L-히스티딘HCl',
        'L-히스티딘에이치씨엘', '히스티딘',
    ),
    'Linoleic_Acid': ('ConjugatedLinoleicAcidPowder',),
    'Alpha_Linolenic_Acid': ('필수지방산복합체알파리놀렌산',),
    'Lysine': (
        'Lysine', 'Hydroxylysine', 'L-Lysine', 'L-LysineFree-Form', 'L-LysineHCL',
        'L-LysineHCl', 'L-lysine',
        'L-라이신', 'L-라이신HCI', 'L-라이신HCl', 'L-라이신에이치씨엘', 'L-라이신염산염', '라이신',
        '라이신염산염', '비건콜라겐강화혼합물L-라이신HCI',
        '알파뉴로복합체글라이신L-라이신HCIL-글루타민L-알지닌HCI알파-GPC', '필수아미노산L-라이신',
        'L-리신', 'L-리신HCl',
    ),
    'Methionine': ('Methionine', 'L-Methionine', 'L-메티오닌', 'dl-메티오닌', '메티오닌'),
    'Phenylalanine': (
        'Phenylalanine', 'DL-Phenylalanine', 'L-Phenylalanine', 'D-페닐알라닌',
        'DL-페닐알라닌', 'L-페닐알라닌', '페닐알라닌',
    ),
    'Tyrosine': ('Tyrosine', 'L-Tyrosine', 'L-티로신', 'N-아세틸L-티로신', '티로신'),
    'Threonine': ('Threonine', 'L-Threonine', 'L-트레오닌', '트레오닌'),
    'Valine': ('Valine', 'L-Valine', 'L-발린', '미분화된L-발린', '발린'),
    'Cholesterol': ('Cholesterol', '콜레스테롤'),
}

# Reverse index: raw spelling -> canonical name. setdefault keeps the first
# owner should a spelling ever be listed under two canonical names.
_NUTRIENT_LOOKUP = {}
for _canonical, _aliases in _NUTRIENT_ALIASES.items():
    for _alias in _aliases:
        _NUTRIENT_LOOKUP.setdefault(_alias, _canonical)


def change_nutri_nm(json):
    """Rename the keys of ``json['nutrients']`` to canonical names, in place.

    Args:
        json: Product record (a dict; the name shadows the ``json`` module
            inside this function only) whose ``'nutrients'`` value is a
            mapping from nutrient name to amount.

    Returns:
        None. ``json['nutrients']`` is mutated.

    Every key that is a known spelling is moved to its canonical name; every
    other key is deleted, including '칼로리' / 'Calories' and keys that are
    already in underscore form such as 'Vitamin_A'. When several spellings of
    one nutrient are present, the one visited last wins.

    '이소류신' is mapped to 'Iso_Leucine'. It used to be listed under Leucine as
    well, and because Leucine was checked first it was always filed there.
    """
    nutrients = json['nutrients']

    # Iterate over a snapshot because the mapping is modified inside the loop.
    for key in list(nutrients.keys()):
        canonical = _NUTRIENT_LOOKUP.get(key)
        if canonical is None:
            # Unused nutrient (or calories): drop it.
            del nutrients[key]
        else:
            nutrients[canonical] = nutrients.pop(key)

# 2 ) Unify unit and serving unit

- Count the products whose serving field was marked as an error during scraping

In [22]:
# Collect every product whose 'serving' field holds the 'serving unit error' marker.
# One loop covers all three categories; the file is decoded as 'utf-8-sig' like
# in get_nutri_key, and no loop variable shadows the builtin `min`.
serving_error_list = []
for src_path, src_list in (
    (vit_path, vit_list),
    (min_path, min_list),
    (amino_path, amino_list),
):
    for fname in src_list:
        with open(src_path + fname, encoding='utf-8-sig') as f:
            prod_data = json.load(f)
        if prod_data['serving'] == 'serving unit error':
            serving_error_list.append(prod_data)

In [None]:
# Number of products flagged with 'serving unit error'.
print(len(serving_error_list))

## 2.1 ) get_serving
- Get serving amount & unit

In [24]:
def get_serving(prod_path, prod_list):
    """Collect the serving amount and serving unit of each product file.

    Args:
        prod_path: Path prefix that is concatenated with each file name.
        prod_list: File names of the product JSON files to read.

    Returns:
        Tuple ``(serving_amount_list, serving_unit_list)``. The two lists
        always have the same length and are aligned index by index.

    A product is counted as an error and skipped when its ``'serving'`` entry
    is missing, is not a list with at least two items, or is a string such as
    the ``'serving unit error'`` marker (which would otherwise be indexed
    character by character). The error count is printed once at the end.
    """
    serving_amount_list = []
    serving_unit_list = []
    error_cnt = 0

    for prod in prod_list:
        with open(prod_path + prod, encoding='utf-8-sig') as f:
            prod_data = json.load(f)

        serving = prod_data.get('serving') if isinstance(prod_data, dict) else None
        if not isinstance(serving, (list, tuple)) or len(serving) < 2:
            error_cnt += 1
            continue

        # Append both values together so the lists can never drift apart.
        serving_amount_list.append(serving[0])
        serving_unit_list.append(serving[1])

    print(error_cnt)
    return serving_amount_list, serving_unit_list

In [25]:
# Serving amounts and units per category, in the order vitamins, minerals, amino acids.
(
    (vit_serving_amount_list, vit_serving_unit_list),
    (min_serving_amount_list, min_serving_unit_list),
    (amino_serving_amount_list, amino_serving_unit_list),
) = [
    get_serving(path, files)
    for path, files in (
        (vit_path, vit_list),
        (min_path, min_list),
        (amino_path, amino_list),
    )
]

0
0
0


- Create lists of total serving amount and unit

In [26]:
# Concatenate the per-category lists: vitamins first, then minerals, then amino acids.
total_serving_amount_list = [
    *vit_serving_amount_list,
    *min_serving_amount_list,
    *amino_serving_amount_list,
]
total_serving_unit_list = [
    *vit_serving_unit_list,
    *min_serving_unit_list,
    *amino_serving_unit_list,
]

- Create dictionaries to check the frequency of each serving amount and unit

In [None]:
from collections import Counter

# Counter tallies in a single pass instead of calling list.count() once per
# element. most_common() returns (value, count) pairs sorted by descending
# count, with ties kept in first-seen order, so despite the "_dict" names
# both results are lists of tuples, exactly as before.

# frequency of each serving amount
total_serving_amount_dict = Counter(total_serving_amount_list).most_common()

# frequency of each serving unit
total_serving_unit_dict = Counter(total_serving_unit_list).most_common()

In [27]:
# Display the (serving amount, count) pairs, most frequent first.
total_serving_amount_dict

[('1', 1938),
 ('2', 510),
 ('3', 205),
 ('4', 140),
 ('6', 33),
 ('5', 32),
 ('8', 12),
 ('30', 8),
 ('60', 4),
 ('10', 4),
 ('25', 3),
 ('9', 2),
 ('15', 2),
 ('75', 2),
 ('13', 2),
 ('12', 2),
 ('0', 1),
 ('50', 1),
 ('120', 1),
 ('35', 1),
 ('54', 1)]

In [28]:
# Display the (serving unit, count) pairs, most frequent first.
total_serving_unit_dict

[('정', 1684),
 ('캡슐', 104),
 ('Tablet', 97),
 ('티스푼', 86),
 ('Capsule', 83),
 ('스쿱', 72),
 ('개', 67),
 ('Softgel', 50),
 ('방울', 48),
 ('Capsules', 47),
 ('Tablets', 30),
 ('테이블스푼', 27),
 ('ml', 26),
 ('g', 25),
 ('capsule', 18),
 ('식물성', 17),
 ('tablet', 16),
 ('세', 15),
 ('회', 15),
 ('채식', 14),
 ('Lozenge', 13),
 ('Softgels', 12),
 ('VegCap', 12),
 ('Teaspoon', 12),
 ('팩', 11),
 ('봉지', 11),
 ('Drops', 10),
 ('Vegetarian', 10),
 ('scoop', 10),
 ('Scoop', 10),
 ('Caplet', 9),
 ('MicroLingual', 8),
 ('Chewable', 7),
 ('곰', 7),
 ('소프트젤', 7),
 ('Quick', 7),
 ('Level', 7),
 ('VegCaps', 7),
 ('teaspoons', 7),
 ('큰', 6),
 ('Veg', 6),
 ('Vegetable', 6),
 ('찻', 6),
 ('capsules', 5),
 ('Stick', 5),
 ('mL', 5),
 ('Tube', 5),
 ('tsp', 5),
 ('츄어블', 5),
 ('tbsp', 4),
 ('소프트', 4),
 ('마리', 4),
 ('Drop', 4),
 ('Veggie', 4),
 ('종류의', 4),
 ('tablets', 4),
 ('Sprays', 4),
 ('국자', 4),
 ('특종', 4),
 ('scoops', 4),
 ('stick', 3),
 ('정을', 3),
 ('드롭퍼', 3),
 ('vegetarian', 3),
 ('Packet', 3),
 ('drop', 3),
 ('정량

- Create a list of analogous units using this dictionary in Excel to unify serving units<br>
<br>
ex. ['VegCap','Vegetarian','VegCaps','VeganCaps','Veg Capsule','VCAP'] -> Vege Capsule

## 2.2 ) change_serving_unit_nm
- Unify serving units

In [29]:
def change_serving_unit_nm(json):
    """Replace ``json['serving'][1]`` with its unified unit name, in place.

    Args:
        json: Product record (a dict) whose ``'serving'`` value is a mutable
            sequence with the unit at index 1.

    Returns:
        None. A unit found in one of the groups below is overwritten with
        that group's unified name; 'Tube' and 'mg' are left as they are; for
        any other unit the whole ``'serving'`` entry is deleted.
    """
    # (unified name, spellings mapped to it), checked in this order.
    unit_groups = (
        ('Tablet', ('Tablet', '정', '개', 'Tablets', 'tablet', '회', 'Lozenge',
                    'MicroLingual', 'Chewable', '츄어블', '마리', 'tablets', '정을', '정량', '마름모꼴',
                    'Micro', 'Organitab', 'Pieces', '츄어블정', 'chewable', 'lozenge', '개의',
                    'tables', 'caplets')),
        ('Capsule', ('Capsule', '캡슐', 'Capsules', 'capsule', 'Caplet', 'capsules', 'Coated')),
        ('Softgel', ('Softgel', 'Softgels', '소프트젤', '소프트', 'softgel', 'Soft')),
        ('Scoop', ('Scoop', '스쿱', 'scoop', 'scoops', '국자', 'Scoops')),
        ('tbsp', ('tbsp', '테이블스푼', '큰', 'Tablespoons', 'Tbsp')),
        ('tsp', ('tsp', '티스푼', 'teaspoons', 'Teaspoon', 'teaspoon', '찻', '찻숫가락')),
        ('Drop', ('Drop', '방울', '드롭퍼', 'Drops', 'Dropper', 'drop', '드롭')),
        ('Packet', ('Packet', '봉지', '팩', 'packet')),
        ('g', ('g', 'Grams', '그램')),
        ('ml', ('ml', 'mL', '액상', 'Milliliter')),
        ('Gummies', ('Gummies', '젤리', 'Jellies', 'gummy')),
        ('Veg Capsule', ('Veg Capsule', '식물성', '채식', 'VegCap', 'Vegetarian', 'VegCaps', 'Veg',
                         'Vegetable', 'Veggie', 'vegeterian', 'VeganCaps', 'Veg Capsule', 'VCAP',
                         'vegetable')),
        ('Stick', ('Stick', 'stick')),
        ('Spray', ('Spray', 'Sprays')),
    )

    unit = json['serving'][1]

    for unified, spellings in unit_groups:
        if unit in spellings:
            json['serving'][1] = unified
            return

    # Units that are kept untouched.
    if unit in ('Tube', 'mg'):
        return

    # Unknown unit: discard the serving information altogether.
    del json['serving']
    

## 2.3 ) filter_serving_amount
- Remove the serving info from records whose serving amount is zero (probably a scraping error)

In [30]:
def filter_serving_amount(json):
    """Delete ``json['serving']`` when its amount (index 0) is the string '0'.

    Args:
        json: Product record (a dict) that has a ``'serving'`` entry.

    Returns:
        None. The record is modified in place; any other amount leaves it
        untouched.
    """
    amount = json['serving'][0]
    if amount != '0':
        return
    del json['serving']

## 3 ) apply_change
- Apply the functions below to each product and save the result to a new directory
    - change_nutri_nm
    - change_serving_unit_nm
    - filter_serving_amount

In [31]:
def apply_change_nutri_nm(prod_path, prod_list, prod_new_path):
    """Preprocess each product file and write the usable ones to a new location.

    Args:
        prod_path: Path prefix that is concatenated with each input file name.
        prod_list: File names of the product JSON files to process.
        prod_new_path: Path prefix for the output; each product is written to
            ``prod_new_path + 'iherb_<prod_cd>.json'``.

    Returns:
        None.

    Products carrying the 'nutrients_dict error' or 'serving unit error'
    marker are skipped. The rest go through change_nutri_nm,
    change_serving_unit_nm and filter_serving_amount, and are written only if
    they still have at least one nutrient and a 'serving' entry. When writing
    or serialising fails, the product's ``prod_cd`` is printed and processing
    continues with the next product.
    """
    # Make sure the output directory exists; otherwise every write would fail
    # and only show up as a printed prod_cd.
    out_dir = os.path.dirname(prod_new_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for prod in prod_list:
        # 'utf-8-sig' matches get_nutri_key and also reads back files written below.
        with open(prod_path + prod, 'r', encoding='utf-8-sig') as file:
            prod_data = json.load(file)

        if (prod_data['nutrients'] == 'nutrients_dict error'
                or prod_data['serving'] == "serving unit error"):
            continue

        change_nutri_nm(prod_data)
        change_serving_unit_nm(prod_data)
        # change_serving_unit_nm may have removed 'serving' for unknown units.
        if 'serving' in prod_data:
            filter_serving_amount(prod_data)

        if not prod_data['nutrients'] or 'serving' not in prod_data:
            continue

        out_file = prod_new_path + 'iherb_{}.json'.format(prod_data['prod_cd'])
        try:
            # Serialise first so a failure does not leave an empty file behind.
            payload = json.dumps(prod_data, ensure_ascii=False)
            with open(out_file, 'w', encoding='UTF-8-sig') as new_file:
                new_file.write(payload)
        except (OSError, TypeError, ValueError):
            print(prod_data['prod_cd'])

In [32]:
# Preprocess the vitamin products and write the results under vit_new_path.
vit_new_path = '../output_processed/vitamins/'
apply_change_nutri_nm(vit_path, vit_list, vit_new_path)

In [33]:
# Preprocess the mineral products and write the results under min_new_path.
min_new_path = '../output_processed/minerals/'
apply_change_nutri_nm(min_path, min_list, min_new_path)

In [34]:
# Preprocess the amino-acid products and write the results under amino_new_path.
amino_new_path = '../output_processed/amino-acids/'
apply_change_nutri_nm(amino_path, amino_list, amino_new_path)

- Difference in the number of products before and after preprocessing

In [35]:
# List the processed files, leaving out macOS '.DS_Store' entries exactly as
# the raw listings do, so the before/after counts compare like with like.
vit_processed_list = [fname for fname in os.listdir(vit_new_path) if fname != '.DS_Store']
min_processed_list = [fname for fname in os.listdir(min_new_path) if fname != '.DS_Store']
amino_processed_list = [fname for fname in os.listdir(amino_new_path) if fname != '.DS_Store']

# Before
print(len(vit_list), len(min_list), len(amino_list))
print(len(vit_list) + len(min_list) + len(amino_list))

# After
print(len(vit_processed_list), len(min_processed_list), len(amino_processed_list))
print(len(vit_processed_list) + len(min_processed_list) + len(amino_processed_list))

# Difference (number of products dropped by preprocessing)
print(
    len(vit_list) + len(min_list) + len(amino_list)
    - len(vit_processed_list) - len(min_processed_list) - len(amino_processed_list)
)

1680 789 435
2904
1526 678 197
2401
503
