# Fetch Country

In [2]:
import numpy as np
import pandas as pd
from rapidfuzz import fuzz, process
import os
import re
import ast
import json
from typing import List, Dict, Tuple
from sklearn.preprocessing import MultiLabelBinarizer
import nltk
from nltk.corpus import stopwords
from pathlib import Path

import spacy

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mohammed_elamine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Download a small spacy english deterministic model `en_core_web_sm` and load it into `nlp` variable

In [85]:
! python -m spacy download en_core_web_sm

# Load English NLP model (make sure to run `python -m spacy download en_core_web_sm` first)
nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.8/12.8 MB[0m [31m5.7 MB/s[0m  [33m0:00:02[0m eta [36m0:00:01[0m
[?25h[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


Set some constant variables for file paths:

In [3]:
# Let's find the repository root directory
CURRENT_DIR = Path().resolve()
dir_path = CURRENT_DIR
dir_name = dir_path.name
while True:
    dir_path = dir_path.parent
    dir_name = dir_path.name
    if dir_name == "mangetamain":
        # Check if we are in a git repository
        if (dir_path / ".git").exists():
            break
REPO_ROOT = dir_path

# Then set the data directory and file paths
DATA_DIR = REPO_ROOT / "data"
RAW_DATA_PATH = DATA_DIR / "RAW_recipes.csv"
COUNTRIES_FILE_PATH = DATA_DIR / "countries.json"

## Import data

In [4]:
recipes_df = pd.read_csv(RAW_DATA_PATH)
recipes_df.head(3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13


In [274]:
recipes_df.loc[recipes_df['tags'] == "['']", 'tags'].head(3)

882     ['']
1257    ['']
1450    ['']
Name: tags, dtype: object

In [11]:
def extract_list_floats(str_l: str) -> List[float]:
    """Extracts a list of floats from a string representation of a list.

    Returns:
        list: A list of floats extracted from the input string.
    """
    if pd.isna(str_l) or str_l == '':
        return []
    try:
        list_floats = ast.literal_eval(str_l)
        if not isinstance(list_floats, list):
            return []
        # Convert to floats and remove non-convertible entries
        cleaned = []
        for x in list_floats:
            try:
                cleaned.append(float(x))
            except (ValueError, TypeError):
                continue
        return cleaned
    except (ValueError, SyntaxError):
        # If literal_eval fails, return empty list
        return []

In [12]:
extract_list_floats(recipes_df.loc[0, 'nutrition'])

[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]

In [16]:
col_no_nans = recipes_df['nutrition'].dropna()
print(col_no_nans.values[0])
recipes_df['nutrition'].values[0]

[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]


'[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]'

In [19]:
test_x = '[5.0, 10.0, 15.0, 20.0]'
print(type(ast.literal_eval(test_x)[0]))

test_y = "['banana', 'apple', 'orange']"
print(type(ast.literal_eval(test_y)[0]))


<class 'float'>
<class 'str'>


In [21]:
recipes_df['tags'].name

'tags'

In [22]:
test_x = ['banana', 'apple', 'orange']
tuple(test_x)

('banana', 'apple', 'orange')

## Clean the `tags` feature

In [275]:
def clean_tags_batch_old(tags: List[str], lengths: List[int]) -> List[str]:
    """Cleans a list of tags by applying the clean_tag function to each tag.

    Args:
        tags (List[str]): A list of strings containing tags to be cleaned.

    Returns:
        List[str]: A list of strings containing cleaned tags.
    """
    cleaned = []

    docs = nlp.pipe(tags, batch_size=1000, n_process=4)
    cleaned_tags = []

    for i, doc in enumerate(docs, start=1):
        if doc is None:
            cleaned_tags.append(None)
            # sub_cleaned.append(None)
            continue
        # tokens = []
        # for t in doc:
        #     if t.dep_ == "amod":  # skip modifying adjectives
        #         continue
        #     if t.pos_ in ["PUNCT", "SYM", "SPACE", "ADV", "ADP", "X"]:
        #         continue  # skip these tokens
        #     tokens.append(t.lemma_)
        tokens = [t.lemma_ for t in doc if not t.is_stop and t.pos_ in ["NOUN", "ADJ", "NUM", "PROPN"]]
        cleaned_tags.append("-".join(tokens) if tokens else None)

    idx = 0
    for length in lengths:
        cleaned.append(cleaned_tags[idx : idx + length])
        idx += length

    return cleaned

def clean_tags_batch(tags: List[str]) -> List[str]:
    """Cleans a list of tags by applying the clean_tag function to each tag.

    Args:
        tags (List[str]): A list of strings containing tags to be cleaned.

    Returns:
        List[str]: A list of strings containing cleaned tags.
    """
    cleaned = []
    docs = nlp.pipe(tags, batch_size=1000, n_process=4)

    for i, doc in enumerate(docs, start=1):
        if doc is None:
            cleaned.append(None)
            continue
        tokens = [t.lemma_ for t in doc if not t.is_stop and t.pos_ in ["NOUN", "ADJ", "NUM", "PROPN"]]
        cleaned.append("-".join(tokens) if tokens else None)

    return cleaned

def clean_tag_advanced(tag):
    """Cleans the input tag by applying various preprocessing steps.

    Args:
        tag (str): The input tag to be cleaned.

    Returns:
        str: The cleaned tag or None if the tag is invalid.
    """
    
    if pd.isna(tag) or tag == '':
        return None  # mark for deletion

    # If it contains a space ‚Üí treat as sentence
    if " " in tag:
        doc = nlp(tag)
        # Remove stopwords
        tokens = [token.lemma_ for token in doc \
                  if (not token.is_stop) and (token.pos_ in ["NOUN", "ADJ", "NUM"])]
        # Join with dashes
        return "-".join(tokens)
    
    tag = re.sub(r'-', ' ', tag)
    doc = nlp(tag)
    # If it contains a stopword only ‚Üí mark for deletion
    if len(doc) == 1 and doc[0].is_stop:
        return None  # mark for deletion
    tokens = []
    for token in doc:
        # If it is a modifying adjective skip it
        if token.dep_ == "amod": #and token.head.pos_ == "NOUN":
            continue
        if token.pos_ in ["PUNCT", "SYM", "SPACE", "ADV", "ADP", "X"]:
            continue  # skip these tokens
        tokens.append(token.lemma_) 
    return "-".join(tokens)

def clean_tag(tag):
    """Cleans the input tag by applying various preprocessing steps.

    Args:
        tag (str): The input tag to be cleaned.

    Returns:
        str: The cleaned tag or None if the tag is invalid.
    """
    
    if pd.isna(tag) or tag == '':
        return None  # mark for deletion

    # If it contains a space ‚Üí treat as sentence
    if " " in tag:
        tokens = tag.split()
        clean_tokens = []
        for token in tokens:
            token = token.strip()
            if token in stop_words:
                continue
            clean_tokens.append(token)
        # Join with dashes
        return "-".join(clean_tokens)

def preprocess_list_string(list_str: List[str], clean_method: callable = None) -> List[str]:
    """Preprocesses a string representation of a list by cleaning each string in the list.

    Args:
        list_str (List[str]): A list of strings to be cleaned.
        clean_method (callable, optional): A function to clean each string in the list. Defaults to clean_tag.

    Raises:
        ValueError: If the input string does not represent a list.

    Returns:
        list: A list of unique cleaned strings extracted from the input string.
    """
    if list_str is None:
        return []
    if not isinstance(list_str, list):
        raise ValueError("The input is not a list.")
    if len(list_str) == 0:
        return []
    if clean_method is None:
        return list(set(list_str))  # return unique items only
    cleaned = []
    for item in list_str:
        cleaned_item = clean_method(item)
        if cleaned_item is not None:
            cleaned.append(cleaned_item)
    return cleaned

def extract_list_string(s: str) -> List[str]:
    """Extracts a list of strings from a string representation of a list.

    Args:
        s (str): A string representation of a list.

    Returns:
        list: A list of strings extracted from the input string.
    """
    if pd.isna(s) or s == '':
        return []
    list_strings = ast.literal_eval(s)
    if len(list_strings) == 1 and list_strings[0] == '':
        return []
    return list_strings

In [276]:
recipes_df['tags'] = recipes_df['tags'].fillna('[]')
recipes_df['tags'] = recipes_df['tags'].str.lower().str.strip()
recipes_df['tags'] = recipes_df['tags'].apply(lambda s: re.sub(r'[^a-z0-9\s\'\[\]\,-]', '', s))
recipes_df['tags'] = recipes_df['tags'].apply(lambda s: s if not re.search(r"less_than|greater_than|sql", s) else None)
recipes_df['tags_list'] = recipes_df['tags'].apply(extract_list_string)

In [277]:
tags_lengths = recipes_df['tags_list'].apply(len).tolist()

## Extract `tags` classes

In [278]:
mlb = MultiLabelBinarizer()
mlb.fit(recipes_df['tags_list'])

0,1,2
,classes,
,sparse_output,False


In [279]:
mlb.classes_

array(['1-day-or-more', '15-minutes-or-less', '3-steps-or-less',
       '30-minutes-or-less', '4-hours-or-less', '5-ingredients-or-less',
       '60-minutes-or-less', 'a1-sauce', 'african', 'american',
       'amish-mennonite', 'angolan', 'appetizers', 'apples',
       'april-fools-day', 'argentine', 'artichoke', 'asian', 'asparagus',
       'australian', 'austrian', 'avocado', 'bacon', 'baja',
       'baked-beans', 'baking', 'bananas', 'bar-cookies', 'barbecue',
       'bass', 'bean-soup', 'beans', 'beans-side-dishes', 'bear', 'beef',
       'beef-barley-soup', 'beef-crock-pot', 'beef-kidney', 'beef-liver',
       'beef-organ-meats', 'beef-ribs', 'beef-sauces', 'beef-sausage',
       'beginner-cook', 'beijing', 'belgian', 'berries', 'beverages',
       'birthday', 'biscotti', 'bisques-cream-soups', 'black-bean-soup',
       'black-beans', 'blueberries', 'bok-choys', 'brazilian',
       'bread-machine', 'bread-pudding', 'breads', 'breakfast',
       'breakfast-casseroles', 'breakfast-e

In [280]:
unique_tags = mlb.classes_.tolist()
print(f"Number of unique tags: {len(unique_tags)}")

tag_to_id = {tag: idx for idx, tag in enumerate(unique_tags)}

Number of unique tags: 550


## Countries

In [315]:
countries_data = json.load(open(COUNTRIES_FILE_PATH, "r", encoding="utf-8"))
countries_data[0]

{'name': {'common': 'Aruba',
  'official': 'Aruba',
  'native': {'nld': {'official': 'Aruba', 'common': 'Aruba'},
   'pap': {'official': 'Aruba', 'common': 'Aruba'}}},
 'tld': ['.aw'],
 'cca2': 'AW',
 'ccn3': '533',
 'cca3': 'ABW',
 'cioc': 'ARU',
 'independent': False,
 'status': 'officially-assigned',
 'unMember': False,
 'currencies': {'AWG': {'name': 'Aruban florin', 'symbol': '∆í'}},
 'idd': {'root': '+2', 'suffixes': ['97']},
 'capital': ['Oranjestad'],
 'altSpellings': ['AW'],
 'region': 'Americas',
 'subregion': 'Caribbean',
 'languages': {'nld': 'Dutch', 'pap': 'Papiamento'},
 'translations': {'ara': {'official': 'ÿ£ÿ±Ÿàÿ®ÿß', 'common': 'ÿ£ÿ±Ÿàÿ®ÿß'},
  'bre': {'official': 'Aruba', 'common': 'Aruba'},
  'ces': {'official': 'Aruba', 'common': 'Aruba'},
  'deu': {'official': 'Aruba', 'common': 'Aruba'},
  'est': {'official': 'Aruba', 'common': 'Aruba'},
  'fin': {'official': 'Aruba', 'common': 'Aruba'},
  'fra': {'official': 'Aruba', 'common': 'Aruba'},
  'hrv': {'official': 'Ar

This dataset contains a list of world countries, as defined by [ISO Standard 3166-1](https://en.wikipedia.org/wiki/ISO_3166-1), in JSON.

We're more interested in the below information for each country:
* `name`
    - **common** - common name in english
    - **official** - official name in english
* `region`
* `suberegion`
* `demonyms` - name of residents, translated & genderized
    - **key**: three-letter ISO 639-3 language code
    - **value**: genderized demonym object
        * *key*: f (female) or m (male)
        * *value*: genderized demonym translation

In [224]:
countries_data_compact = []
for country_dict in countries_data:
    country_info = [
            country_dict.get("name", {}).get("common", ""),
            country_dict.get("name", {}).get("official", ""),
            country_dict.get("region", ""),
            country_dict.get("subregion", country_dict.get("region", "")),
            country_dict.get("demonyms", {}).get("eng", {}).get("f", ""),
    ]
    countries_data_compact.append(country_info)
for country_info in countries_data_compact[:3]:
    print(country_info)

['Aruba', 'Aruba', 'Americas', 'Caribbean', 'Aruban']
['Afghanistan', 'Islamic Republic of Afghanistan', 'Asia', 'Southern Asia', 'Afghan']
['Angola', 'Republic of Angola', 'Africa', 'Middle Africa', 'Angolan']


In [225]:
countries_df = pd.DataFrame(countries_data_compact, columns=[
    "common_name",
    "official_name",
    "region",
    "subregion",
    "demonym"
])
countries_df.reset_index(names=["country_id"], inplace=True)
countries_df.head(3)

Unnamed: 0,country_id,common_name,official_name,region,subregion,demonym
0,0,Aruba,Aruba,Americas,Caribbean,Aruban
1,1,Afghanistan,Islamic Republic of Afghanistan,Asia,Southern Asia,Afghan
2,2,Angola,Republic of Angola,Africa,Middle Africa,Angolan


Fill missing subregions with region values

In [226]:
countries_df.loc[countries_df['subregion'] == '', 'subregion'] = countries_df.loc[countries_df['subregion'] == '', 'region']

In [227]:
print(f"There are {len(countries_df)} countries in the DataFrame.")

unique_regions = countries_df['region'].unique()
print(f"\nThere are {len(unique_regions)} unique regions:")
print(unique_regions)

unique_subregions = countries_df['subregion'].unique()
print(f"\nThere are {len(unique_subregions)} unique subregions:")
print(unique_subregions)

region_subregion_mapping = countries_df[['region', 'subregion']].drop_duplicates().groupby('region')['subregion'].apply(list).reset_index().set_index('region')['subregion'].to_dict()
print("\nRegion to Subregion mapping:")
for region, subregions in region_subregion_mapping.items():
    print(f"{region}: {subregions}")

There are 250 countries in the DataFrame.

There are 6 unique regions:
['Americas' 'Asia' 'Africa' 'Europe' 'Oceania' 'Antarctic']

There are 25 unique subregions:
['Caribbean' 'Southern Asia' 'Middle Africa' 'Northern Europe'
 'Southeast Europe' 'Southern Europe' 'Western Asia' 'South America'
 'Polynesia' 'Antarctic' 'Australia and New Zealand' 'Central Europe'
 'Eastern Africa' 'Western Europe' 'Western Africa' 'Eastern Europe'
 'Central America' 'North America' 'South-Eastern Asia' 'Southern Africa'
 'Eastern Asia' 'Northern Africa' 'Melanesia' 'Micronesia' 'Central Asia']

Region to Subregion mapping:
Africa: ['Middle Africa', 'Eastern Africa', 'Western Africa', 'Southern Africa', 'Northern Africa']
Americas: ['Caribbean', 'South America', 'Central America', 'North America']
Antarctic: ['Antarctic']
Asia: ['Southern Asia', 'Western Asia', 'South-Eastern Asia', 'Eastern Asia', 'Central Asia']
Europe: ['Northern Europe', 'Southeast Europe', 'Southern Europe', 'Central Europe', 'West

The **Oceania** region includes Australia, New Zealand, and the islands of the Pacific Ocean, such as Fiji, Papua New Guinea, Samoa, and many others.

In [228]:
region_to_region_id = dict(zip(unique_regions, list(range(len(unique_regions)))))
print(f"Region ID mapping: {region_to_region_id}")
countries_df['region_id'] = countries_df['region'].map(region_to_region_id)
display(countries_df.head(3))

# Normalize the region names to lowercase
region_to_region_id = {reg.lower(): reg_id for reg, reg_id in region_to_region_id.items()}
# Create reverse mapping
region_id_to_region = {reg_id: reg for reg, reg_id in region_to_region_id.items()}
# Create country_id to region_id mapping
country_id_to_region_id = {row['country_id']: row['region_id'] for _, row in countries_df.iterrows()}

print(f"Region ID mapping (lowercase): {region_to_region_id}")
print(f"Region ID to Region mapping: {region_id_to_region}")

Region ID mapping: {'Americas': 0, 'Asia': 1, 'Africa': 2, 'Europe': 3, 'Oceania': 4, 'Antarctic': 5}


Unnamed: 0,country_id,common_name,official_name,region,subregion,demonym,region_id
0,0,Aruba,Aruba,Americas,Caribbean,Aruban,0
1,1,Afghanistan,Islamic Republic of Afghanistan,Asia,Southern Asia,Afghan,1
2,2,Angola,Republic of Angola,Africa,Middle Africa,Angolan,2


Region ID mapping (lowercase): {'americas': 0, 'asia': 1, 'africa': 2, 'europe': 3, 'oceania': 4, 'antarctic': 5}
Region ID to Region mapping: {0: 'americas', 1: 'asia', 2: 'africa', 3: 'europe', 4: 'oceania', 5: 'antarctic'}


In [229]:
countries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   country_id     250 non-null    int64 
 1   common_name    250 non-null    object
 2   official_name  250 non-null    object
 3   region         250 non-null    object
 4   subregion      250 non-null    object
 5   demonym        250 non-null    object
 6   region_id      250 non-null    int64 
dtypes: int64(2), object(5)
memory usage: 13.8+ KB


In [230]:
common_name_to_country_id = {
    row['common_name'].lower(): row['country_id'] for _, row in countries_df.iterrows()
}
official_name_to_country_id = {
    row['official_name'].lower(): row['country_id'] for _, row in countries_df.iterrows()
}
demonym_to_country_id = {
    row['demonym'].lower(): row['country_id'] for _, row in countries_df.iterrows()
}
print(f"id of France: {common_name_to_country_id['france']}")
print(f"id of French Republic: {official_name_to_country_id['french republic']}")
print(f"id of French: {demonym_to_country_id['french']}")

id of France: 76
id of French Republic: 76
id of French: 76


## Extract country from tags

In [None]:
def fetch_country(list_str: List[str], threshold: int = 80) -> int:
    """Gets the country ID corresponding to a given tag.

    Args:
        list_str (List[str]): The input list of strings.

    Returns:
        int: The country ID if found, otherwise -1.
    """
    if not list_str:
        return -1
    
    common_name_to_country_id_keys = list(common_name_to_country_id.keys())
    official_name_to_country_id_keys = list(official_name_to_country_id.keys())
    demonym_to_country_id_keys = list(demonym_to_country_id.keys())
    mappings = [
        (common_name_to_country_id, common_name_to_country_id_keys),
        (official_name_to_country_id, official_name_to_country_id_keys),
        (demonym_to_country_id, demonym_to_country_id_keys),
    ]

    # Prepare queries
    queries = [s.lower() for s in list_str]

    for mapping in mappings:
        ref_dict, ref_names = mapping
        # Compute similarity matrix (len(queries) x len(choices))
        scores = process.cdist(queries, ref_names, scorer=fuzz.ratio)

        # Find best match for all queries
        best_idx_per_query = np.argmax(scores, axis=1)
        best_scores_per_query = scores[np.arange(len(queries)), best_idx_per_query]
        best_score = np.max(best_scores_per_query)
        best_idx = best_idx_per_query[np.argmax(best_scores_per_query)]
        if best_score >= threshold:
            return ref_dict[ref_names[best_idx]]
    return -1

def fetch_region(list_str: List[str], threshold: int = 80) -> int:
    """Gets the region ID corresponding to a given tag.

    Args:
        list_str (List[str]): The input list of strings.

    Returns:
        int: The region ID if found, otherwise -1.
    """
    if not list_str:
        return -1
    
    region_to_region_id_keys = list(region_to_region_id.keys())

    # Prepare queries
    queries = [s.lower() for s in list_str]

    # Compute similarity matrix (len(queries) x len(choices))
    scores = process.cdist(queries, region_to_region_id_keys, scorer=fuzz.ratio)

    # Find best match for all queries
    best_idx_per_query = np.argmax(scores, axis=1)
    best_scores_per_query = scores[np.arange(len(queries)), best_idx_per_query]
    best_score = np.max(best_scores_per_query)
    best_idx = best_idx_per_query[np.argmax(best_scores_per_query)]
    if best_score >= threshold:
        return region_to_region_id[region_to_region_id_keys[best_idx]]
    
    return -1

def fetch_country_region(list_str: List[str]) -> Tuple[int, int]:
    """Gets the country ID and region ID corresponding to a given tag.

    Args:
        * `list_str (List[str])`: The input list of strings.

    Returns:
        * `Tuple[int, int]`: A tuple containing the country ID and region ID.

    Example:
    temp_df = recipes_df.copy()\n
    temp_df['country'], temp_df['region'] = zip(*temp_df['tags_list'].apply(fetch_country_region))\n
    temp_df[['country', 'region']].head(5)\n
    	country	region\n
    0	144	-1\n
    1	235	-1\n
    2	-1	-1\n
    3	-1	-1\n
    4	235	-1\n

    """
    if not list_str:
        return (-1, -1)
    temp_return = (-1, -1)
    for s in list_str:
        s_lower = s.lower()
        if s_lower in common_name_to_country_id:
            return (common_name_to_country_id[s_lower], temp_return[1])
        elif s_lower in official_name_to_country_id:
            return (official_name_to_country_id[s_lower], temp_return[1])
        elif s_lower in demonym_to_country_id:
            return (demonym_to_country_id[s_lower], temp_return[1])
        elif s_lower in region_to_region_id:
            # Found region, store it but keep looking for country
            temp_return = (-1, region_to_region_id[s_lower])
    return temp_return

In [288]:
recipes_df['country'] = recipes_df['tags_list'].map(fetch_country)
recipes_df[['tags_list', 'country']].head(5)

Unnamed: 0,tags_list,country
0,"[60-minutes-or-less, time-to-make, course, mai...",144
1,"[30-minutes-or-less, time-to-make, course, mai...",235
2,"[time-to-make, course, preparation, main-dish,...",43
3,"[60-minutes-or-less, time-to-make, course, mai...",-1
4,"[weeknight, time-to-make, course, main-ingredi...",235


In [301]:
print(f"There are {(recipes_df['country'] == -1).sum()} recipes with unknown country (-1).")
print(f"They represent {((recipes_df['country'] == -1).sum() / len(recipes_df)) * 100:.2f}% of the total recipes.")

There are 143793 recipes with unknown country (-1).
They represent 62.08% of the total recipes.


In [303]:
recipes_df['region'] = recipes_df['tags_list'].map(fetch_region)
print(f"The number of recipes with a known region are: {len(recipes_df.loc[recipes_df['region'] != -1])}")
print(f"They represent {(len(recipes_df.loc[recipes_df['region'] != -1]) / len(recipes_df)) * 100:.2f}% of the total recipes.")

print("\nThe number of recipes with a known region but unknown country: " + \
      str(len(recipes_df.loc[(recipes_df['country'] == -1) & (recipes_df['region'] != -1)])))
print(f"They represent {(len(recipes_df.loc[(recipes_df['country'] == -1) & (recipes_df['region'] != -1)]) / len(recipes_df)) * 100:.2f}% of the total recipes.")
recipes_df[['tags_list', 'country', 'region']].head(5)

The number of recipes with a known region are: 69304
They represent 29.92% of the total recipes.

The number of recipes with a known region but unknown country: 9548
They represent 4.12% of the total recipes.


Unnamed: 0,tags_list,country,region
0,"[60-minutes-or-less, time-to-make, course, mai...",144,-1
1,"[30-minutes-or-less, time-to-make, course, mai...",235,0
2,"[time-to-make, course, preparation, main-dish,...",43,-1
3,"[60-minutes-or-less, time-to-make, course, mai...",-1,-1
4,"[weeknight, time-to-make, course, main-ingredi...",235,0


Infer region from country

In [304]:
recipes_df.loc[recipes_df['region']==-1, 'region'] = \
    recipes_df.loc[recipes_df['region']==-1, 'country'].map(
        lambda r: country_id_to_region_id.get(r, -1)
        )
print(f"The number of recipes with a known region after inferring from country are: {len(recipes_df.loc[recipes_df['region'] != -1])}")
print(f"They represent {(len(recipes_df.loc[recipes_df['region'] != -1]) / len(recipes_df)) * 100:.2f}% of the total recipes.")

The number of recipes with a known region after inferring from country are: 97392
They represent 42.05% of the total recipes.


In [286]:
# recipes_df.to_csv('/Users/mohammed_elamine/Documents/00 - AA - Learning/Telecom Paris - MS Data/1st Period/Kit Big Data/mangetamain/data/recipes_with_countries.csv', index=False)

## Extract countries from `name` and `description`

In [305]:
print(f"There are {recipes_df['name'].isna().sum()} missing values in the 'name' column.")
print(f"There are {recipes_df['description'].isna().sum()} missing values in the 'description' column.")

There are 1 missing values in the 'name' column.
There are 4979 missing values in the 'description' column.


In [306]:
recipes_df[['name', 'description']] = recipes_df[['name', 'description']].fillna('')

In [307]:
recipes_df['name_list'] = recipes_df['name'].str.strip().str.lower().str.split(expand=False)
recipes_df['description_list'] = recipes_df['description'].str.strip().str.lower().str.split(expand=False)
recipes_df[['name', 'name_list', 'description', 'description_list']].head(3)

Unnamed: 0,name,name_list,description,description_list
0,arriba baked winter squash mexican style,"[arriba, baked, winter, squash, mexican, style]",autumn is my favorite time of year to cook! th...,"[autumn, is, my, favorite, time, of, year, to,..."
1,a bit different breakfast pizza,"[a, bit, different, breakfast, pizza]",this recipe calls for the crust to be prebaked...,"[this, recipe, calls, for, the, crust, to, be,..."
2,all in the kitchen chili,"[all, in, the, kitchen, chili]",this modified version of 'mom's' chili was a h...,"[this, modified, version, of, 'mom's', chili, ..."


In [308]:
# First try to get country/region from name_list for entries with country == -1 and region == -1
recipes_df.loc[recipes_df['country'] == -1, 'country'] = recipes_df.loc[recipes_df['country'] == -1, 'name_list'].map(fetch_country)
recipes_df.loc[recipes_df['region'] == -1, 'region'] = recipes_df.loc[recipes_df['region'] == -1, 'name_list'].map(fetch_region)
# Then try to get country/region from description_list for entries with country == -1 and region == -1
recipes_df.loc[recipes_df['country'] == -1, 'country'] = recipes_df.loc[recipes_df['country'] == -1, 'description_list'].map(fetch_country)
recipes_df.loc[recipes_df['region'] == -1, 'region'] = recipes_df.loc[recipes_df['region'] == -1, 'description_list'].map(fetch_region)
recipes_df[['country', 'region']].head(5)

Unnamed: 0,country,region
0,144,0
1,235,0
2,43,0
3,63,-1
4,235,0


In [309]:
recipes_df.loc[recipes_df['region']==-1, 'region'] = \
    recipes_df.loc[recipes_df['region']==-1, 'country'].map(
        lambda r: country_id_to_region_id.get(r, -1)
        )

In [310]:
print("### After extracting from name and description ###")
print(f"There are only {(recipes_df['country'] == -1).sum()} recipes with unknown country (-1) left.")
print(f"They now represent {((recipes_df['country'] == -1).sum() / len(recipes_df)) * 100:.2f}% of the total recipes.")

print(f"\nThere are now {(recipes_df['region'] == -1).sum()} recipes with unknown region (-1).")
print(f"They now represent {((recipes_df['region'] == -1).sum() / len(recipes_df)) * 100:.2f}% of the total recipes.")

### After extracting from name and description ###
There are only 95745 recipes with unknown country (-1) left.
They now represent 41.33% of the total recipes.

There are now 90309 recipes with unknown region (-1).
They now represent 38.99% of the total recipes.


## Country Stats

### Univariate

In [316]:
print("Total number of recipes:", len(recipes_df))
print("Total number of recipes with country information:", (recipes_df['country'] != -1).sum())
print("Total number of recipes without country information:", (recipes_df['country'] == -1).sum())

Total number of recipes: 231637


KeyError: 'country'

In [None]:
recipe_countries_df = recipes_df.copy()
recipe_countries_df = recipe_countries_df.loc[recipe_countries_df['country'] != -1]
# Map country IDs to country names
recipe_countries_df['country_name'] = recipe_countries_df['country'].map(
    lambda cid: countries_df.loc[countries_df['country_id'] == cid, 'common_name'].values[0]
)
# Map country IDs to regions
recipe_countries_df['region'] = recipe_countries_df['country'].map(
    lambda cid: countries_df.loc[countries_df['country_id'] == cid, 'region'].values[0]
)
# Map country IDs to subregions
recipe_countries_df['subregion'] = recipe_countries_df['country'].map(
    lambda cid: countries_df.loc[countries_df['country_id'] == cid, 'subregion'].values[0]
)
display(recipe_countries_df[['id', 'country', 'country_name', 'region', 'subregion']].head(5))

print("Number of unique countries with recipes:", recipe_countries_df['country'].nunique())
print("\nNumber of unique regions with recipes:", recipe_countries_df['region'].nunique())
display(recipe_countries_df['region'].value_counts())
print("\nNumber of unique subregions with recipes:", recipe_countries_df['subregion'].nunique())
display(recipe_countries_df['subregion'].value_counts())

Unnamed: 0,id,country,country_name,region,subregion
0,137739,144,Mexico,Americas,North America
1,31490,235,United States,Americas,North America
4,44061,235,United States,Americas,North America
5,5289,235,United States,Americas,North America
6,25274,40,Canada,Americas,North America


Number of unique countries with recipes: 59

Number of unique regions with recipes: 5


region
Americas    40321
Europe      16888
Asia         5400
Africa       3903
Oceania      2839
Name: count, dtype: int64


Number of unique subregions with recipes: 19


subregion
North America                39397
Southern Europe              10123
Western Europe                4068
Australia and New Zealand     2839
Eastern Africa                2749
Eastern Asia                  2617
South-Eastern Asia            2025
Northern Europe               1482
Northern Africa               1093
Central Europe                 907
South America                  668
Western Asia                   623
Eastern Europe                 308
Caribbean                      216
Southern Asia                  135
Central America                 40
Western Africa                  35
Middle Africa                   20
Southern Africa                  6
Name: count, dtype: int64

In [None]:
nrecipes_by_country = recipe_countries_df['country_name'].value_counts().reset_index()
nrecipes_by_country.columns = ['country_name', 'nrecipes']
display(nrecipes_by_country.head(10))

print(f"The country with the most recipes is \"{nrecipes_by_country.loc[0, 'country_name']}\" with {nrecipes_by_country.loc[0, 'nrecipes']} recipes.")
print(f"The country with the least recipes is \"{nrecipes_by_country.loc[len(nrecipes_by_country)-1, 'country_name']}\" with {nrecipes_by_country.loc[len(nrecipes_by_country)-1, 'nrecipes']} recipes.")

Unnamed: 0,country_name,nrecipes
0,United States,28302
1,Mexico,6688
2,Italy,6532
3,Canada,4407
4,Australia,2839
5,British Indian Ocean Territory,2664
6,Greece,2338
7,France,2206
8,China,1792
9,Germany,1176


The country with the most recipes is "United States" with 28302 recipes.
The country with the least recipes is "Namibia" with 6 recipes.


In [None]:
nrecipes_by_region = recipe_countries_df['region'].value_counts().reset_index()
nrecipes_by_region.columns = ['region', 'nrecipes']
display(nrecipes_by_region.head(10))

print(f"The region with the most recipes is \"{nrecipes_by_region.loc[0, 'region']}\" with {nrecipes_by_region.loc[0, 'nrecipes']} recipes.")
print(f"The region with the least recipes is \"{nrecipes_by_region.loc[len(nrecipes_by_region)-1, 'region']}\" with {nrecipes_by_region.loc[len(nrecipes_by_region)-1, 'nrecipes']} recipes.")

Unnamed: 0,region,nrecipes
0,Americas,40321
1,Europe,16888
2,Asia,5400
3,Africa,3903
4,Oceania,2839


The region with the most recipes is "Americas" with 40321 recipes.
The region with the least recipes is "Oceania" with 2839 recipes.


In [None]:
nrecipes_by_region = recipe_countries_df['subregion'].value_counts().reset_index()
nrecipes_by_region.columns = ['subregion', 'nrecipes']
display(nrecipes_by_region.head(10))

print(f"The subregion with the most recipes is \"{nrecipes_by_region.loc[0, 'subregion']}\" with {nrecipes_by_region.loc[0, 'nrecipes']} recipes.")
print(f"The subregion with the least recipes is \"{nrecipes_by_region.loc[len(nrecipes_by_region)-1, 'subregion']}\" with {nrecipes_by_region.loc[len(nrecipes_by_region)-1, 'nrecipes']} recipes.")

Unnamed: 0,subregion,nrecipes
0,North America,39397
1,Southern Europe,10123
2,Western Europe,4068
3,Australia and New Zealand,2839
4,Eastern Africa,2749
5,Eastern Asia,2617
6,South-Eastern Asia,2025
7,Northern Europe,1482
8,Northern Africa,1093
9,Central Europe,907


The subregion with the most recipes is "North America" with 39397 recipes.
The subregion with the least recipes is "Southern Africa" with 6 recipes.


### Bivariate

#### Country vs Contributor

In [23]:
contributors_countries_df = recipe_countries_df[['id', 'contributor_id', 'country', 'country_name', 'region']]
unique_contributors_with_country = contributors_countries_df['contributor_id'].unique()
print(f"There are {recipes_df['contributor_id'].nunique()} unique contributors in total.")
print(f"There are {len(unique_contributors_with_country)} unique contributors with at least one recipe associated with a country.")

There are 27926 unique contributors in total.
There are 12025 unique contributors with at least one recipe associated with a country.


In [24]:
ncountries_by_contributor = contributors_countries_df.groupby('contributor_id')['country'].nunique().reset_index()
ncountries_by_contributor.columns = ['contributor_id', 'ncountries']
ncountries_by_contributor.sort_values(
    by='ncountries', ascending=False, inplace=True, ignore_index=True)
display(ncountries_by_contributor.head(10))

print("The contributor with the most diverse country contributions is contributor ID",
      ncountries_by_contributor.loc[0, 'contributor_id'],
      "with",
      ncountries_by_contributor.loc[0, 'ncountries'],
      "different countries.")

print("\nThe contributor with the least diverse country contributions is contributor ID",
      ncountries_by_contributor.loc[len(ncountries_by_contributor)-1, 'contributor_id'],
      "with",
      ncountries_by_contributor.loc[len(ncountries_by_contributor)-1, 'ncountries'],
      "different countries.")

Unnamed: 0,contributor_id,ncountries
0,37449,47
1,37636,35
2,169969,33
3,386585,33
4,163112,33
5,58104,33
6,107135,32
7,171790,32
8,204024,32
9,169430,32


The contributor with the most diverse country contributions is contributor ID 37449 with 47 different countries.

The contributor with the least diverse country contributions is contributor ID 2002234769 with 1 different countries.


In [25]:
ncontributors_by_country = contributors_countries_df.groupby('country_name')['contributor_id'].nunique().reset_index()
ncontributors_by_country.columns = ['country_name', 'ncontributors']
ncontributors_by_country.sort_values(
    by='ncontributors', ascending=False, inplace=True, ignore_index=True)
display(ncontributors_by_country.head(10))

print(f"The country with the most unique contributors is {ncontributors_by_country.loc[0, 'country_name']} with {ncontributors_by_country.loc[0, 'ncontributors']} contributors.")
print(f"\nThe country with the least unique contributors is {ncontributors_by_country.loc[len(ncontributors_by_country)-1, 'country_name']} with {ncontributors_by_country.loc[len(ncontributors_by_country)-1, 'ncontributors']} contributors.")

Unnamed: 0,country_name,ncontributors
0,United States,6760
1,Mexico,2764
2,Italy,2507
3,British Indian Ocean Territory,904
4,Greece,870
5,China,839
6,Canada,802
7,France,707
8,Thailand,630
9,Germany,523


The country with the most unique contributors is United States with 6760 contributors.

The country with the least unique contributors is Angola with 5 contributors.


In [26]:
nregions_by_contributor = contributors_countries_df.groupby('contributor_id')['region'].nunique().reset_index()
nregions_by_contributor.columns = ['contributor_id', 'nregions']
nregions_by_contributor.sort_values(
    by='nregions', ascending=False, inplace=True, ignore_index=True)
display(nregions_by_contributor.head(10))

print("The contributor with the most diverse region contributions is contributor ID",
      nregions_by_contributor.loc[0, 'contributor_id'],
      "with",
      nregions_by_contributor.loc[0, 'nregions'],
      "different regions.")

print("\nThe contributor with the least diverse region contributions is contributor ID",
      nregions_by_contributor.loc[len(nregions_by_contributor)-1, 'contributor_id'],
      "with",
      nregions_by_contributor.loc[len(nregions_by_contributor)-1, 'nregions'],
      "different regions.")

Unnamed: 0,contributor_id,nregions
0,386585,5
1,1058097,5
2,599450,5
3,610488,5
4,227978,5
5,431813,5
6,226863,5
7,1226388,5
8,67656,5
9,87023,5


The contributor with the most diverse region contributions is contributor ID 386585 with 5 different regions.

The contributor with the least diverse region contributions is contributor ID 2002234769 with 1 different regions.


In [27]:
ncontributors_by_region = contributors_countries_df.groupby('region')['contributor_id'].nunique().reset_index()
ncontributors_by_region.columns = ['region', 'ncontributors']
ncontributors_by_region.sort_values(
    by='ncontributors', ascending=False, inplace=True, ignore_index=True)
display(ncontributors_by_region.head(10))

print(f"The region with the most unique contributors is {ncontributors_by_region.loc[0, 'region']} with {ncontributors_by_region.loc[0, 'ncontributors']} contributors.")
print(f"\nThe region with the least unique contributors is {ncontributors_by_region.loc[len(ncontributors_by_region)-1, 'region']} with {ncontributors_by_region.loc[len(ncontributors_by_region)-1, 'ncontributors']} contributors.")

Unnamed: 0,region,ncontributors
0,Americas,8810
1,Europe,3992
2,Asia,1858
3,Africa,1173
4,Oceania,368


The region with the most unique contributors is Americas with 8810 contributors.

The region with the least unique contributors is Oceania with 368 contributors.
