In [1]:
# Script to find counts of different categories of ingredients for all cuisines and regions
# Author: Mehrin Azan


import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import csv
import numpy as np
import ssl
import json

In [2]:
recipes = pd.read_csv(r'"https://raw.githubusercontent.com/SirMaik/Visualizing-Recipes/main/static/data/data_recipe_search.csv')
categories = pd.read_csv(r'"https://raw.githubusercontent.com/SirMaik/Visualizing-Recipes/main/static/data/ingredient-categories.csv')

In [3]:
# Reduce recipe data
keepRec = ['name', 'ingredients', 'n_ingredients', 'region', 'country']
recs = recipes[keepRec]

In [4]:
# Reduce cat data
keepCat = ['replaced', 'ingredient_category']
cats2 = categories[keepCat]

# Remove 'replaced' duplicates 
cats2.drop_duplicates(subset=['replaced'], inplace=True)

#set column country to index
cats2 = cats2.set_index('replaced')
#split by comma
cats= cats2.apply(lambda x: x.str.split(',').explode())
#delete square brackets
cats = cats.ingredient_category.str.strip('[]')
#delete  '
cats= cats.str.replace("'", "")
#delete taps
cats = cats.str.strip()

# Store as dataframe
cats = pd.DataFrame(cats)

cats=cats.reset_index()

cats.head(3)

Unnamed: 0,replaced,ingredient_category
0,lettuce,vegetable
1,french vanilla pudding and pie filling mix,dish
2,stove top stuffing mix,mix


In [5]:
# Get country names in list
allCountries = recs.copy()
allCountries = allCountries['country']
allCountries.drop_duplicates(inplace = True)
allCountries = allCountries.tolist()


# Subset with countries and regions
keepCR = ['region','country']
allCountriesAndRegions = recs[keepCR]
allCountriesAndRegions.drop_duplicates(inplace = True)
allCountriesAndRegions = allCountriesAndRegions.reset_index(drop=True)

allCountriesAndRegions.dropna(inplace=True)


In [6]:
keepCR = ['region','country']
allCountriesAndRegions = recs[keepCR]
allCountriesAndRegions.drop_duplicates(subset=['country'],inplace=True)
allCountriesAndRegions

Unnamed: 0,region,country
0,central-american,mexican
1,north-american,american
4,north-american,canadian
10,indian-subcontinent,indian
13,european,german
...,...,...
3160,asian,korean
13627,central-american,honduran
15894,african,namibian
27884,asian,laotian


In [7]:
# Get all categories in list
allCats = cats.copy()
allCats = allCats['ingredient_category']
allCats.drop_duplicates(inplace=True)
allCats = allCats.tolist()


In [8]:
# New dataframe containing output df: counts of category occurences
countsCats = pd.DataFrame(allCountriesAndRegions)

# Fill dataframe ith zeros
for x in allCats:
    countsCats[x] = 0




In [9]:
# Get only ingredients
keep= ['country', 'ingredients']
ingredients = recs[keep]
#set column country to index
ingredients = ingredients.set_index('country')
#split by comma
ing= ingredients.apply(lambda x: x.str.split(',').explode())
#delete square brackets
ing = ing.ingredients.str.strip('[]')
#delete  '
ing= ing.str.replace("'", "")
#delete taps
ing = ing.str.strip()

# Store as dataframe
ing = pd.DataFrame(ing)

#get counts of ingredients per country
grouped_list= ing.groupby("country")["ingredients"].value_counts()
#store as df
grouped= pd.DataFrame(grouped_list)
#rename columns
grouped.columns = ['ingredient_counts']
grouped=grouped.reset_index()




In [10]:
# Loop through each country and count ingredients for each category
# Fill dataframe

cRow = 0


while cRow < len(allCountriesAndRegions):

    # Sort into countries
    # New df with only current country ingredients
    currentCountry = countsCats['country'].values[cRow]
    # String of current country
    dfCurrentCountry = grouped[grouped['country']==currentCountry]

    cIngred = 0

    while cIngred < len(dfCurrentCountry):

        # String of current ingredient
        currentIng = dfCurrentCountry['ingredients'].values[cIngred]
        currentIngCounts = dfCurrentCountry['ingredient_counts'].values[cIngred]

        # Find indices of where ingredient exists in category dataframe
        for group in cats: 
            indices = list(np.where(cats["replaced"]==currentIng)[0])

        # Get category name from 'ingredient_category' column of category dataframe
        currentIngCat = cats['ingredient_category'].values[indices]
        currentIngCat = str(currentIngCat)
        currentIngCat = currentIngCat.strip('[]')
        currentIngCat = currentIngCat.replace("'", "")

        # Check for plural, remove 's' from end if plural
        # Skip if category for ingredient does not exist
        if currentIngCat == '':
            if currentIng[len(currentIng)-1] == 's':
                currentIng = currentIng[:-1]

                # String of current ingredient
                currentIng = dfCurrentCountry['ingredients'].values[cIngred]
                currentIngCounts = dfCurrentCountry['ingredient_counts'].values[cIngred]

                # Find indices of where ingredient exists in category dataframe
                for group2 in cats: 
                    indices = list(np.where(cats["replaced"]==currentIng)[0])

            
                if indices != []:
                    currentIngCat = cats['ingredient_category'].values[indices]
                    currentIngCat = str(currentIngCat)
                    currentIngCat = currentIngCat.strip('[]')
                    currentIngCat = currentIngCat.replace("'", "")
                    
                    # Place counts value in relevant category in output df, add onto previous value if exists
                    countsCats.iloc[cRow, countsCats.columns.get_loc(currentIngCat)] = currentIngCounts + countsCats.iloc[cRow, countsCats.columns.get_loc(currentIngCat)]
                    
                else: 
                    # Skip
                    cIngred = cIngred + 1
                    
            else:
                # Skip
                cIngred = cIngred + 1
        else:
            # Place counts value in relevant category in output df, add onto previous value if exists
            countsCats.iloc[cRow, countsCats.columns.get_loc(currentIngCat)] = currentIngCounts + countsCats.iloc[cRow, countsCats.columns.get_loc(currentIngCat)]
            cIngred = cIngred + 1
    cRow = cRow + 1





In [38]:
countsCats

Unnamed: 0,region,country,vegetable,dish,mix,dairy,sauce,herb,confectionery,spread,...,seasoning,vegetarian,seafood,vegan,tools,sweetener,seed,flavoring,fungi,flower
0,central-american,mexican,6997,607,138,5033,2874,2072,173,322,...,1516,231,258,59,108,1117,101,216,36,0
1,north-american,american,28272,3779,2093,32653,9269,11198,1312,2975,...,7853,3653,2234,440,638,17341,1086,1735,447,27
4,north-american,canadian,3037,268,141,3206,736,1351,119,336,...,759,469,219,26,39,2107,166,195,82,5
10,indian-subcontinent,indian,2601,53,14,1269,190,509,24,93,...,276,143,83,205,15,600,115,239,15,1
13,european,german,949,71,60,1082,65,295,50,61,...,201,196,9,4,12,678,89,54,17,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3160,asian,korean,287,12,2,8,185,3,1,1,...,28,13,10,13,5,192,1,9,8,0
13627,central-american,honduran,16,1,0,12,7,7,0,3,...,2,1,4,1,0,3,0,2,0,0
15894,african,namibian,4,0,0,2,2,1,0,1,...,1,0,0,1,0,1,0,0,1,0
27884,asian,laotian,16,1,0,0,10,6,0,1,...,3,1,3,2,0,12,0,0,1,0


In [39]:
countsSorted = countsCats.sort_values('region')
countsSorted = countsSorted.drop(columns = ['tools'])
counts = countsSorted.reset_index(drop=True, inplace=True)



In [None]:
# Export dataframe to .csv file
countsSorted.to_csv('category_counts.csv', index=False)