In [151]:
import pandas as pd
import numpy as np

In [152]:
df1 = pd.read_csv("ingredients_metadata.csv")
df1.head(10)

Unnamed: 0,ingr,id,cal/g,fat(g),carb(g),protein(g)
0,cottage cheese,1,0.98,0.043,0.034,0.11
1,strawberries,2,0.33,0.003,0.08,0.007
2,garden salad,3,0.646,0.034,0.032,0.061
3,bacon,4,5.41,0.42,0.014,0.37
4,potatoes,5,0.77,0.001,0.17,0.02
5,caesar salad,6,0.44,0.021,0.043,0.032
6,cauliflower,7,0.25,0.003,0.05,0.018
7,scrambled eggs,8,1.48,0.11,0.016,0.1
8,wild rice,9,1.19,0.004,0.25,0.05
9,steak,10,2.71,0.19,0.0,0.25


In [153]:
def split_data(row):   # input must me single row
    macro_data = row[:6]  # first six belongs to macro data
    ingr_data = row[6:]  # after that belongs to ingr data
    ingr_data = [ingr_data[i:i+7] for i in range(0, len(ingr_data), 7)]   # include from 0 to 6
    return macro_data, ingr_data

In [154]:
def get_names(ingr_data):  # input must be row of single dish
    return [item[1] for item in ingr_data]  # item[1] refers to name

In [155]:
def get_masses(ingr_data):
    return [float(item[2]) for item in ingr_data]  #item[2] refers to mass

In [156]:
def get_most_common_ingr(rows, skip=None):
    mass_map = {}
    count_map = {}
    for row in rows:
        _, ingr_data = split_data(row)
        names = get_names(ingr_data)
        masses = get_masses(ingr_data)
        for i, name in enumerate(names):
            if name == '':
                continue
            if name not in mass_map:
                mass_map[name] = 0
            mass_map[name] += masses[i]
            if name not in count_map:
                count_map[name] = 0
            count_map[name] += 1
    for item in skip:
        if item in mass_map:
            del mass_map[item]
    return mass_map, count_map

In [157]:
def calculate_data_from_ingrs(ingrs_data):   # single shell
    total_calories = 0
    total_fat = 0
    total_carb = 0
    total_protein = 0
    for ingr in ingrs_data:
        total_calories += float(ingr[3])
        total_fat += float(ingr[4])
        total_carb += float(ingr[5])
        total_protein += float(ingr[6])
    return total_calories, total_fat, total_carb, total_protein

In [158]:
def process_row(row, class_map, top_n=None):
    macro_data, ingr_data = split_data(row)
    result = {
        'id': macro_data[0],
        'total_calories': float(macro_data[1]),
        'total_mass':float(macro_data[2]),
        'total_fat':float(macro_data[3]),
        'total_carb':float(macro_data[4]),
        'total_protein':float(macro_data[5]),
    }
    if result['total_calories'] == 0:
        total_calories, total_fat, total_carb, total_protein = calculate_data_from_ingrs(ingr_data)
        #print(total_calories, total_fat, total_carb, total_protein)
        result['total_calories'] = total_calories
        result['total_fat'] = total_fat
        result['total_carb'] = total_carb
        result['total_protein'] = total_protein

    names = get_names(ingr_data)
    masses = get_masses(ingr_data)
    masses_percent = [item / result['total_mass'] for item in masses]

    filtered_names = []
    filtered_masses_percent = []
    for i, name in enumerate(names):   # only items that represents at least 5% of the total mass of the meal
        if masses_percent[i] >= 0.05:
            filtered_names.append(name)
            filtered_masses_percent.append(masses_percent[i])
    asort = np.argsort(filtered_masses_percent[::-1])
    sorted_arr = np.array(filtered_names)[asort]
    filtered = [item for item in sorted_arr if item in class_map]
    if top_n is not None:
        filtered = filtered[:top_n]
    labels = []
    for item in filtered:
        if item != '':
            labels.append(item)
    result['label'] = labels if len(labels) > 0 else np.nan
    return result

In [159]:
import csv
rows = []
with open('dish_metadata_cafe1.csv') as f:
    csvreader = csv.reader(f)
    for row in csvreader:
        filtered_row = [value for value in row if value.strip()]
        rows.append(filtered_row)

In [160]:
len(rows)

4768

In [161]:
import csv
rows = []
with open('dish_metadata_cafe1.csv') as f:
    csvreader = csv.reader(f)
    for row in csvreader:
        filtered_row = [value for value in row if value.strip()]
        rows.append(filtered_row)
with open('dish_metadata_cafe2.csv') as f:
    csvreader = csv.reader(f)
    for row in csvreader:
        filtered_row = [value for value in row if value.strip()]
        rows.append(filtered_row)

In [162]:
"""row_index = 0
ccnt = []
step = 0
for row in rows:
    if row_index >= 4768:
        row_index %= 4768
    else:
        row_index +=1
    for ele in row:
        step += 1
        if ele.startswith('ingr'):
            if step == 6:
                print(row_index)
            ccnt.append(step)
            step = 0
    step = 0 """

"row_index = 0\nccnt = []\nstep = 0\nfor row in rows:\n    if row_index >= 4768:\n        row_index %= 4768\n    else:\n        row_index +=1\n    for ele in row:\n        step += 1\n        if ele.startswith('ingr'):\n            if step == 6:\n                print(row_index)\n            ccnt.append(step)\n            step = 0\n    step = 0 "

In [163]:
skip = ['olive oil', 'salt', 'pepper', 'vinegar', 'coffee', 'plate only', 'vegetable oil', 'deprecated']

In [164]:
n = 75
mass_map, count_map = get_most_common_ingr(rows, skip)
mass_df = pd.DataFrame(mass_map.items(), columns=['ingr','mass']).set_index('ingr')
count_df = pd.DataFrame(count_map.items(), columns=['ingr', 'count']).set_index('ingr')
ingredients_df = mass_df.join(count_df).reset_index()
ingredients_df.head(20)

Unnamed: 0,ingr,mass,count
0,soy sauce,268.705771,123
1,garlic,1244.628944,1012
2,white rice,13030.895709,323
3,parsley,489.541188,523
4,onions,5991.911975,749
5,brown rice,5376.194467,239
6,apple,19397.623488,239
7,mixed greens,12893.382258,390
8,sugar,412.62529,50
9,lemon juice,862.331321,613


In [165]:
if n is not None:
    ingredients_df = ingredients_df.nlargest(n, ['mass'])
ingredients_df['mass_weight'] = ingredients_df['mass'].max()/ingredients_df['mass']
ingredients_df['count_weight'] = ingredients_df['count'].max()/ingredients_df['count']
ingredients_df.head()

Unnamed: 0,ingr,mass,count,mass_weight,count_weight
36,egg whites,47794.333297,200,1.0,3.745
48,chicken,46796.594974,400,1.021321,1.8725
33,scrambled eggs,39772.309517,331,1.201699,2.26284
81,olives,37301.340391,297,1.281303,2.521886
39,cauliflower,29629.891235,340,1.613045,2.202941


In [166]:
len(ingredients_df)

75

In [167]:
ingredients_df.to_csv('./classes.csv')

In [168]:
unique_ingredients = set(ingredients_df['ingr'])
len(unique_ingredients)

75

In [169]:
top_n = 5
items = []
for row in rows:
    processed = process_row(row, unique_ingredients, top_n)
    items.append(processed)

In [170]:
df = pd.DataFrame(items)

In [171]:
#df = df.dropna()

In [172]:
df.head()

Unnamed: 0,id,total_calories,total_mass,total_fat,total_carb,total_protein,label
0,dish_1561662216,300.794281,193.0,12.387489,28.21829,18.63397,"[mixed greens, brown rice, pork]"
1,dish_1562688426,137.569992,88.0,8.256,5.19,10.297,"[chicken apple sausage, roasted potatoes]"
2,dish_1561662054,419.438782,292.0,23.838249,26.351543,25.910593,"[mixed greens, pork, apple, green beans]"
3,dish_1562008979,382.936646,290.0,22.224644,10.17357,35.345387,"[wheat berry, caesar salad, pork]"
4,dish_1560455030,20.59,103.0,0.148,4.625,0.956,"[cherry tomatoes, cucumbers]"


In [176]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5006 entries, 0 to 5005
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              5006 non-null   object 
 1   total_calories  5006 non-null   float64
 2   total_mass      5006 non-null   float64
 3   total_fat       5006 non-null   float64
 4   total_carb      5006 non-null   float64
 5   total_protein   5006 non-null   float64
 6   label           4770 non-null   object 
dtypes: float64(5), object(2)
memory usage: 273.9+ KB


In [174]:
df2 = pd.read_csv("ingredients_metadata.csv")
df2.head(5)

Unnamed: 0,ingr,id,cal/g,fat(g),carb(g),protein(g)
0,cottage cheese,1,0.98,0.043,0.034,0.11
1,strawberries,2,0.33,0.003,0.08,0.007
2,garden salad,3,0.646,0.034,0.032,0.061
3,bacon,4,5.41,0.42,0.014,0.37
4,potatoes,5,0.77,0.001,0.17,0.02


In [175]:
df.to_csv('./dish_meta_data.csv')