In [159]:
import pandas as pd
import numpy as np

In [160]:
df1 = pd.read_csv("/home/ritushwar/Nutritional-Assistance/metadata/ingredients_metadata.csv")
df1.head(10)

Unnamed: 0,ingr,id,cal/g,fat(g),carb(g),protein(g)
0,cottage cheese,1,0.98,0.043,0.034,0.11
1,strawberries,2,0.33,0.003,0.08,0.007
2,garden salad,3,0.646,0.034,0.032,0.061
3,bacon,4,5.41,0.42,0.014,0.37
4,potatoes,5,0.77,0.001,0.17,0.02
5,caesar salad,6,0.44,0.021,0.043,0.032
6,cauliflower,7,0.25,0.003,0.05,0.018
7,scrambled eggs,8,1.48,0.11,0.016,0.1
8,wild rice,9,1.19,0.004,0.25,0.05
9,steak,10,2.71,0.19,0.0,0.25


In [161]:
def split_data(row):   # input must me single row
    macro_data = row[:6]  # first six belongs to macro data
    ingr_data = row[6:]  # after that belongs to ingr data
    ingr_data = [ingr_data[i:i+7] for i in range(0, len(ingr_data), 7)]   # include from 0 to 6
    return macro_data, ingr_data

In [162]:
def get_names(ingr_data):  # input must be row of single dish
    return [item[1] for item in ingr_data]  # item[1] refers to name

In [163]:
def get_masses(ingr_data):
    return [float(item[2]) for item in ingr_data]  #item[2] refers to mass

In [164]:
def get_most_common_ingr(rows, skip=None):
    mass_map = {}
    count_map = {}
    for row in rows:
        _, ingr_data = split_data(row)
        names = get_names(ingr_data)
        masses = get_masses(ingr_data)
        for i, name in enumerate(names):
            if name == '':
                continue
            if name not in mass_map:
                mass_map[name] = 0
            mass_map[name] += masses[i]
            if name not in count_map:
                count_map[name] = 0
            count_map[name] += 1
    for item in skip:
        if item in mass_map:
            del mass_map[item]
    return mass_map, count_map

In [165]:
def calculate_data_from_ingrs(ingrs_data):   # single shell
    total_calories = 0
    total_fat = 0
    total_carb = 0
    total_protein = 0
    for ingr in ingrs_data:
        total_calories += float(ingr[3])
        total_fat += float(ingr[4])
        total_carb += float(ingr[5])
        total_protein += float(ingr[6])
    return total_calories, total_fat, total_carb, total_protein

In [166]:
def process_row(row, class_map, top_n=None):
    macro_data, ingr_data = split_data(row)
    result = {
        'id': macro_data[0],
        'total_calories': float(macro_data[1]),
        'total_mass':float(macro_data[2]),
        'total_fat':float(macro_data[3]),
        'total_carb':float(macro_data[4]),
        'total_protein':float(macro_data[5]),
    }
    if result['total_calories'] == 0:
        total_calories, total_fat, total_carb, total_protein = calculate_data_from_ingrs(ingr_data)
        #print(total_calories, total_fat, total_carb, total_protein)
        result['total_calories'] = total_calories
        result['total_fat'] = total_fat
        result['total_carb'] = total_carb
        result['total_protein'] = total_protein

    names = get_names(ingr_data)
    masses = get_masses(ingr_data)
    masses_percent = [item / result['total_mass'] for item in masses]

    filtered_names = []
    filtered_masses_percent = []
    for i, name in enumerate(names):   # only items that represents at least 5% of the total mass of the meal
        if masses_percent[i] >= 0.05:
            filtered_names.append(name)
            filtered_masses_percent.append(masses_percent[i])
    asort = np.argsort(filtered_masses_percent[::-1])
    sorted_arr = np.array(filtered_names)[asort]
    filtered = [item for item in sorted_arr if item in class_map]
    if top_n is not None:
        filtered = filtered[:top_n]
    labels = []
    for item in filtered:
        if item != '':
            labels.append(item)
    result['label'] = labels if len(labels) > 0 else np.nan
    return result

In [167]:
import csv
rows = []
with open('/home/ritushwar/Nutritional-Assistance/metadata/dish_metadata_cafe1.csv') as f:
    csvreader = csv.reader(f)
    for row in csvreader:
        filtered_row = [value for value in row if value.strip()]
        rows.append(filtered_row)
with open('/home/ritushwar/Nutritional-Assistance/metadata/dish_metadata_cafe2.csv') as f:
    csvreader = csv.reader(f)
    for row in csvreader:
        filtered_row = [value for value in row if value.strip()]
        rows.append(filtered_row)

In [168]:
len(rows)

5006

In [None]:
"""row_index = 0
ccnt = []
step = 0
for row in rows:
    if row_index >= 4768:
        row_index %= 4768
    else:
        row_index +=1
    for ele in row:
        step += 1
        if ele.startswith('ingr'):
            if step == 6:
                print(row_index)
            ccnt.append(step)
            step = 0
    step = 0 """

"row_index = 0\nccnt = []\nstep = 0\nfor row in rows:\n    if row_index >= 4768:\n        row_index %= 4768\n    else:\n        row_index +=1\n    for ele in row:\n        step += 1\n        if ele.startswith('ingr'):\n            if step == 6:\n                print(row_index)\n            ccnt.append(step)\n            step = 0\n    step = 0 "

In [169]:
skip = ['olive oil', 'salt', 'pepper', 'vinegar', 'coffee', 'plate only', 'vegetable oil', 'deprecated']

In [170]:
n = 75
mass_map, count_map = get_most_common_ingr(rows, skip)
mass_df = pd.DataFrame(mass_map.items(), columns=['ingr','mass']).set_index('ingr')
count_df = pd.DataFrame(count_map.items(), columns=['ingr', 'count']).set_index('ingr')
ingredients_df = mass_df.join(count_df).reset_index()
ingredients_df.head(20)

Unnamed: 0,ingr,mass,count
0,soy sauce,268.705771,123
1,garlic,1244.628944,1012
2,white rice,13030.895709,323
3,parsley,489.541188,523
4,onions,5991.911975,749
5,brown rice,5376.194467,239
6,apple,19397.623488,239
7,mixed greens,12893.382258,390
8,sugar,412.62529,50
9,lemon juice,862.331321,613


In [171]:
if n is not None:
    ingredients_df = ingredients_df.nlargest(n, ['mass'])
ingredients_df['mass_weight'] = ingredients_df['mass'].max()/ingredients_df['mass']
ingredients_df['count_weight'] = ingredients_df['count'].max()/ingredients_df['count']
ingredients_df.head()

Unnamed: 0,ingr,mass,count,mass_weight,count_weight
36,egg whites,47794.333297,200,1.0,3.745
48,chicken,46796.594974,400,1.021321,1.8725
33,scrambled eggs,39772.309517,331,1.201699,2.26284
81,olives,37301.340391,297,1.281303,2.521886
39,cauliflower,29629.891235,340,1.613045,2.202941


In [172]:
len(ingredients_df)

75

In [173]:
ingredients_df.to_csv('./classes.csv')

In [174]:
unique_ingredients = set(ingredients_df['ingr'])
len(unique_ingredients)

75

In [184]:
top_n = 5
items = []
for row in rows:
    processed = process_row(row, unique_ingredients, top_n)
    items.append(processed)

In [186]:
df1 = pd.DataFrame(items)
df1.shape[0]

5006

In [187]:
df1 = df1.dropna()
df1.shape[0]

4770

In [188]:
df1 = df1.reset_index(drop=True)

In [189]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4770 entries, 0 to 4769
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              4770 non-null   object 
 1   total_calories  4770 non-null   float64
 2   total_mass      4770 non-null   float64
 3   total_fat       4770 non-null   float64
 4   total_carb      4770 non-null   float64
 5   total_protein   4770 non-null   float64
 6   label           4770 non-null   object 
dtypes: float64(5), object(2)
memory usage: 261.0+ KB


In [190]:
df1.head()

Unnamed: 0,id,total_calories,total_mass,total_fat,total_carb,total_protein,label
0,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,"[mixed greens, brown rice, pork]"
1,dish_1562688426,137.569992,88.0,8.256,5.19,10.297,"[chicken apple sausage, roasted potatoes]"
2,dish_1561662054,419.438782,292.0,23.838249,26.351543,25.910593,"[mixed greens, pork, apple, green beans]"
3,dish_1562008979,382.936646,290.0,22.224644,10.17357,35.345387,"[wheat berry, caesar salad, pork]"
4,dish_1560455030,20.59,103.0,0.148,4.625,0.956,"[cherry tomatoes, cucumbers]"


In [191]:
df1.to_csv('/home/ritushwar/Nutritional-Assistance/metadata/dish_meta_data.csv', index=False)

In [192]:
df = pd.read_csv("/home/ritushwar/Nutritional-Assistance/metadata/dish_meta_data.csv")
df.head(5)

Unnamed: 0,id,total_calories,total_mass,total_fat,total_carb,total_protein,label
0,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,"[np.str_('mixed greens'), np.str_('brown rice'..."
1,dish_1562688426,137.569992,88.0,8.256,5.19,10.297,"[np.str_('chicken apple sausage'), np.str_('ro..."
2,dish_1561662054,419.438782,292.0,23.838249,26.351543,25.910593,"[np.str_('mixed greens'), np.str_('pork'), np...."
3,dish_1562008979,382.936646,290.0,22.224644,10.17357,35.345387,"[np.str_('wheat berry'), np.str_('caesar salad..."
4,dish_1560455030,20.59,103.0,0.148,4.625,0.956,"[np.str_('cherry tomatoes'), np.str_('cucumber..."


In [193]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4770 entries, 0 to 4769
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              4770 non-null   object 
 1   total_calories  4770 non-null   float64
 2   total_mass      4770 non-null   float64
 3   total_fat       4770 non-null   float64
 4   total_carb      4770 non-null   float64
 5   total_protein   4770 non-null   float64
 6   label           4770 non-null   object 
dtypes: float64(5), object(2)
memory usage: 261.0+ KB


In [106]:
# making the list of directory in the data
import os
directory_list = []
res = {}
path = "/home/ritushwar/Nutritional-Assistance/data"
for dir in os.listdir(path):
    res = {"id": 0}
    res["id"] = dir
    directory_list.append(res)

In [107]:
directory_list

[{'id': 'dish_1574711589'},
 {'id': 'dish_1558640593'},
 {'id': 'dish_1550862993'},
 {'id': 'dish_1561577731'},
 {'id': 'dish_1565637504'},
 {'id': 'dish_1551394781'},
 {'id': 'dish_1559242066'},
 {'id': 'dish_1560356337'},
 {'id': 'dish_1558380152'},
 {'id': 'dish_1551224283'},
 {'id': 'dish_1563812508'},
 {'id': 'dish_1559061675'},
 {'id': 'dish_1551568285'},
 {'id': 'dish_1550769483'},
 {'id': 'dish_1561578800'},
 {'id': 'dish_1558026756'},
 {'id': 'dish_1559933834'},
 {'id': 'dish_1565382762'},
 {'id': 'dish_1551135590'},
 {'id': 'dish_1560367153'},
 {'id': 'dish_1565118999'},
 {'id': 'dish_1551563213'},
 {'id': 'dish_1562602627'},
 {'id': 'dish_1551378955'},
 {'id': 'dish_1551397591'},
 {'id': 'dish_1558113154'},
 {'id': 'dish_1561654346'},
 {'id': 'dish_1573073666'},
 {'id': 'dish_1566849955'},
 {'id': 'dish_1567542914'},
 {'id': 'dish_1558381206'},
 {'id': 'dish_1551236466'},
 {'id': 'dish_1568315988'},
 {'id': 'dish_1550708556'},
 {'id': 'dish_1551390528'},
 {'id': 'dish_155077

In [71]:
df2 = pd.DataFrame(directory_list)

In [72]:
df2.head(5)

Unnamed: 0,id
0,dish_1574711589
1,dish_1558640593
2,dish_1550862993
3,dish_1561577731
4,dish_1565637504


In [73]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4793 entries, 0 to 4792
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      4793 non-null   object
dtypes: object(1)
memory usage: 37.6+ KB


In [74]:
df2.to_csv("/home/ritushwar/Nutritional-Assistance/metadata/dish_id.csv")

In [194]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4770 entries, 0 to 4769
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              4770 non-null   object 
 1   total_calories  4770 non-null   float64
 2   total_mass      4770 non-null   float64
 3   total_fat       4770 non-null   float64
 4   total_carb      4770 non-null   float64
 5   total_protein   4770 non-null   float64
 6   label           4770 non-null   object 
dtypes: float64(5), object(2)
memory usage: 261.0+ KB


In [109]:
df1.head(10)

Unnamed: 0,id,total_calories,total_mass,total_fat,total_carb,total_protein,label
0,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,"[mixed greens, brown rice, pork]"
1,dish_1562688426,137.569992,88.0,8.256,5.19,10.297,"[chicken apple sausage, roasted potatoes]"
2,dish_1561662054,419.438782,292.0,23.838249,26.351543,25.910593,"[mixed greens, pork, apple, green beans]"
3,dish_1562008979,382.936646,290.0,22.224644,10.17357,35.345387,"[wheat berry, caesar salad, pork]"
4,dish_1560455030,20.59,103.0,0.148,4.625,0.956,"[cherry tomatoes, cucumbers]"
5,dish_1565640549,45.482903,139.0,1.568471,7.043886,2.641478,"[tomatoes, cherry tomatoes, asparagus, arugula..."
6,dish_1563207364,309.269989,271.0,13.774,30.657,15.01,"[grapes, scrambled eggs, egg whites, yam, swee..."
7,dish_1561575474,120.058434,183.0,4.966118,17.412746,2.990431,"[cauliflower, roasted potatoes, eggplant]"
8,dish_1550795690,68.119995,131.0,0.262,18.34,0.393,[apple]
9,dish_1563216717,246.007996,332.0,5.636517,16.284782,33.06871,"[chicken breast, broccoli, honeydew melons, sq..."


In [195]:
df1.iloc[0,0]   # row, column

'dish_1561662216'

In [196]:
df1.shape[0]

4770

In [197]:
import os
directory_list = []
path = "/home/ritushwar/Nutritional-Assistance/data"
for dir in os.listdir(path):
    directory_list.append(dir)

In [198]:
len(directory_list)

4571

In [199]:
iy, ino = 0, 0
empty_indx = []
for ro_no in range(df1.shape[0]):
    id = df1.iloc[ro_no,0]
    if id in directory_list:
        iy +=1
    else:
        ino+=1
        empty_indx.append(ro_no)

In [200]:
print(iy, ino)

4571 199


In [201]:
print(empty_indx)
print(len(empty_indx))

[28, 47, 66, 72, 79, 91, 104, 132, 135, 147, 202, 209, 211, 319, 328, 390, 458, 494, 528, 556, 568, 664, 667, 675, 737, 742, 752, 757, 811, 828, 829, 836, 880, 882, 907, 981, 989, 1091, 1133, 1203, 1209, 1229, 1231, 1292, 1313, 1394, 1413, 1427, 1441, 1514, 1553, 1555, 1587, 1588, 1614, 1619, 1648, 1697, 1708, 1727, 1730, 1768, 1801, 1874, 1887, 1960, 2039, 2043, 2110, 2159, 2168, 2195, 2219, 2257, 2267, 2284, 2352, 2382, 2403, 2443, 2548, 2554, 2558, 2565, 2640, 2650, 2719, 2747, 2771, 2813, 2886, 2987, 3054, 3055, 3075, 3078, 3096, 3123, 3189, 3192, 3215, 3257, 3267, 3277, 3313, 3317, 3319, 3334, 3377, 3383, 3434, 3483, 3514, 3564, 3594, 3608, 3642, 3657, 3671, 3688, 3698, 3723, 3759, 3768, 3796, 3834, 3877, 3880, 3934, 3952, 3965, 4024, 4059, 4061, 4097, 4130, 4142, 4143, 4208, 4238, 4271, 4285, 4286, 4287, 4294, 4303, 4310, 4363, 4386, 4426, 4438, 4514, 4518, 4542, 4557, 4576, 4579, 4581, 4584, 4588, 4590, 4600, 4603, 4605, 4607, 4609, 4613, 4620, 4627, 4637, 4639, 4644, 4650, 4663

In [202]:
for i in empty_indx:
    df1 = df1.drop(index=i)

In [203]:
df1.reset_index(drop=True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4571 entries, 0 to 4769
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              4571 non-null   object 
 1   total_calories  4571 non-null   float64
 2   total_mass      4571 non-null   float64
 3   total_fat       4571 non-null   float64
 4   total_carb      4571 non-null   float64
 5   total_protein   4571 non-null   float64
 6   label           4571 non-null   object 
dtypes: float64(5), object(2)
memory usage: 285.7+ KB


In [204]:
df1.to_csv("/home/ritushwar/Nutritional-Assistance/metadata/dish_meta_data.csv",index=False)

In [205]:
iy, ino = 0, 0
empty_indx = []
for ro_no in range(df1.shape[0]):
    id = df1.iloc[ro_no,0]
    if id in directory_list:
        iy +=1
    else:
        ino+=1
        empty_indx.append(ro_no)

In [206]:
print(iy, ino)

4571 0


In [207]:
df1.head(10)

Unnamed: 0,id,total_calories,total_mass,total_fat,total_carb,total_protein,label
0,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,"[mixed greens, brown rice, pork]"
1,dish_1562688426,137.569992,88.0,8.256,5.19,10.297,"[chicken apple sausage, roasted potatoes]"
2,dish_1561662054,419.438782,292.0,23.838249,26.351543,25.910593,"[mixed greens, pork, apple, green beans]"
3,dish_1562008979,382.936646,290.0,22.224644,10.17357,35.345387,"[wheat berry, caesar salad, pork]"
4,dish_1560455030,20.59,103.0,0.148,4.625,0.956,"[cherry tomatoes, cucumbers]"
5,dish_1565640549,45.482903,139.0,1.568471,7.043886,2.641478,"[tomatoes, cherry tomatoes, asparagus, arugula..."
6,dish_1563207364,309.269989,271.0,13.774,30.657,15.01,"[grapes, scrambled eggs, egg whites, yam, swee..."
7,dish_1561575474,120.058434,183.0,4.966118,17.412746,2.990431,"[cauliflower, roasted potatoes, eggplant]"
8,dish_1550795690,68.119995,131.0,0.262,18.34,0.393,[apple]
9,dish_1563216717,246.007996,332.0,5.636517,16.284782,33.06871,"[chicken breast, broccoli, honeydew melons, sq..."


## Deleting the unknown directory


In [208]:
# taking all the dish id from the df1  4571
dish_id = []
for i in range(df1.shape[0]):
    id = df1.iloc[i,0]
    dish_id.append(id)

In [209]:
print(len(dish_id))
print(len(directory_list))

4571
4571


In [210]:
df1.shape[0]

4571

In [211]:
import shutil
a, b =0,0
for dir in os.listdir(path):
    if dir in dish_id:
        a+=1
    else:
        #deleted this directory
        del_dir = os.path.join(path, dir)
        shutil.rmtree(del_dir)
        b+=1

In [212]:
print(a, b)

4571 0


In [213]:
len(os.listdir(path))

4571