# Objective:  It is to help people decide their grocery list based on their nutritional requirements or health conditions.

# Target Audience: Population residing in United States of America. 

# Structure of project:

1. Nutrient Profiling:
Develop a scoring system to rank foods based on their nutritional content, focusing on essential nutrients and health considerations. Allow users to set personalized nutritional goals (e.g., low sugar, high fiber) and generate a ranked list of foods that align with their preferences.

2. Health Condition Filters:
Implement filters for common health conditions prevalent in the US population (e.g., diabetes, heart conditions). Offer tailored grocery lists and meal suggestions for individuals with specific health conditions.

3. Meal Planning:
Incorporate meal planning features, allowing users to create balanced meals for specific times of the day (breakfast, lunch, dinner). Provide recipe suggestions based on nutritional requirements.

4. User Profiles:
Allow users to create profiles with their age, gender, dietary preferences, and health conditions. Use machine learning algorithms to improve recommendations based on user feedback and behavior.

5. Mobile Application:
Develop a mobile application for convenient access to grocery lists and nutritional information while shopping.

6. Education and Information:
Provide educational content on the importance of various nutrients, recommended daily allowances, and how certain foods contribute to overall health.

7. Language Support:
Ensure language support for various regions in India to make the app accessible to a broader audience.

8. Feedback Mechanism:
Implement a feedback system for users to rate the effectiveness of recommendations and suggest improvements.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_columns", 100)

In [2]:
nutrition = pd.read_csv("output_file.csv")

nutrition.head()

Unnamed: 0,Food code,Main food description,WWEIA Category number,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),"Sugars, total\n(g)","Fiber, total dietary (g)",Total Fat (g),"Fatty acids, total saturated (g)","Fatty acids, total monounsaturated (g)","Fatty acids, total polyunsaturated (g)",Cholesterol (mg),Retinol (mcg),"Vitamin A, RAE (mcg_RAE)","Carotene, alpha (mcg)","Carotene, beta (mcg)","Cryptoxanthin, beta (mcg)",Lycopene (mcg),Lutein + zeaxanthin (mcg),Thiamin (mg),Riboflavin (mg),Niacin (mg),Vitamin B-6 (mg),Folic acid (mcg),"Folate, food (mcg)","Folate, DFE (mcg_DFE)","Folate, total (mcg)","Choline, total (mg)",Vitamin B-12 (mcg),"Vitamin B-12, added\n(mcg)",Vitamin C (mg),Vitamin D (D2 + D3) (mcg),Vitamin E (alpha-tocopherol) (mg),"Vitamin E, added\n(mg)",Vitamin K (phylloquinone) (mcg),Calcium (mg),Phosphorus (mg),Magnesium (mg),Iron\n(mg),Zinc\n(mg),Copper (mg),Selenium (mcg),Potassium (mg),Sodium (mg),Caffeine (mg),Theobromine (mg),Alcohol (g),4:0\n(g),6:0\n(g),8:0\n(g),10:0\n(g),12:0\n(g),14:0\n(g),16:0\n(g),18:0\n(g),16:1\n(g),18:1\n(g),20:1\n(g),22:1\n(g),18:2\n(g),18:3\n(g),18:4\n(g),20:4\n(g),20:5 n-3\n(g),22:5 n-3\n(g),22:6 n-3\n(g),Water\n(g)
0,11000000,"Milk, human",9602,Human milk,70,1.03,6.89,6.89,0.0,4.38,2.009,1.658,0.497,14,60,61,0,7,0,0,0,0.014,0.036,0.177,0.011,0,5,5,5,16.0,0.05,0.0,5.0,0.1,0.08,0.0,0.3,32,14,3,0.03,0.17,0.052,1.8,51,17,0,0,0.0,0.0,0.0,0.0,0.063,0.256,0.321,0.919,0.293,0.129,1.475,0.04,0.0,0.374,0.052,0.0,0.026,0.0,0.0,0.0,87.5
1,11100000,"Milk, NFS",1004,"Milk, reduced fat",52,3.33,4.83,4.88,0.0,2.14,1.249,0.458,0.07,9,57,58,0,4,0,0,0,0.057,0.137,0.11,0.061,0,1,1,1,17.9,0.56,0.0,0.1,1.1,0.03,0.0,0.2,125,103,12,0.0,0.43,0.001,1.9,156,39,0,0,0.0,0.046,0.036,0.023,0.056,0.065,0.204,0.576,0.208,0.032,0.465,0.002,0.0,0.074,0.008,0.0,0.003,0.0,0.001,0.0,88.92
2,11111000,"Milk, whole",1002,"Milk, whole",61,3.27,4.63,4.81,0.0,3.2,1.86,0.688,0.108,12,31,32,0,7,0,0,0,0.056,0.138,0.105,0.061,0,0,0,0,17.8,0.54,0.0,0.0,1.1,0.05,0.0,0.3,123,101,12,0.0,0.42,0.001,1.9,150,38,0,0,0.0,0.067,0.054,0.034,0.084,0.097,0.303,0.857,0.309,0.047,0.694,0.004,0.0,0.115,0.013,0.0,0.004,0.001,0.002,0.0,88.1
3,11112110,"Milk, reduced fat (2%)",1004,"Milk, reduced fat",50,3.36,4.9,4.89,0.0,1.9,1.11,0.4,0.058,8,83,83,0,3,0,0,0,0.059,0.137,0.112,0.061,0,2,2,2,18.2,0.55,0.0,0.2,1.1,0.03,0.0,0.2,126,103,12,0.0,0.43,0.001,1.8,159,39,0,0,0.0,0.041,0.032,0.021,0.049,0.058,0.181,0.512,0.184,0.029,0.41,0.002,0.0,0.061,0.007,0.0,0.003,0.0,0.001,0.0,89.1
4,11112210,"Milk, low fat (1%)",1006,"Milk, lowfat",43,3.38,5.18,4.96,0.0,0.95,0.568,0.21,0.032,5,58,58,0,1,0,0,0,0.057,0.14,0.113,0.06,0,2,2,2,17.4,0.61,0.0,0.0,1.1,0.02,0.0,0.1,126,103,12,0.0,0.43,0.001,2.1,159,39,0,0,0.0,0.022,0.015,0.011,0.023,0.026,0.093,0.265,0.096,0.014,0.214,0.001,0.0,0.033,0.004,0.0,0.001,0.0,0.0,0.0,89.7


In [3]:

print(nutrition.columns)


Index(['Food code', 'Main food description', 'WWEIA Category number',
       'WWEIA Category description', 'Energy (kcal)', 'Protein (g)',
       'Carbohydrate (g)', 'Sugars, total\n(g)', 'Fiber, total dietary (g)',
       'Total Fat (g)', 'Fatty acids, total saturated (g)',
       'Fatty acids, total monounsaturated (g)',
       'Fatty acids, total polyunsaturated (g)', 'Cholesterol (mg)',
       'Retinol (mcg)', 'Vitamin A, RAE (mcg_RAE)', 'Carotene, alpha (mcg)',
       'Carotene, beta (mcg)', 'Cryptoxanthin, beta (mcg)', 'Lycopene (mcg)',
       'Lutein + zeaxanthin (mcg)', 'Thiamin (mg)', 'Riboflavin (mg)',
       'Niacin (mg)', 'Vitamin B-6 (mg)', 'Folic acid (mcg)',
       'Folate, food (mcg)', 'Folate, DFE (mcg_DFE)', 'Folate, total (mcg)',
       'Choline, total (mg)', 'Vitamin B-12 (mcg)',
       'Vitamin B-12, added\n(mcg)', 'Vitamin C (mg)',
       'Vitamin D (D2 + D3) (mcg)', 'Vitamin E (alpha-tocopherol) (mg)',
       'Vitamin E, added\n(mg)', 'Vitamin K (phylloquinone) (

In [4]:
nutrition['WWEIA Category number'] = nutrition['WWEIA Category number'].astype(str)

In [5]:
# changing of column names
fatty_acids_mapping = {
    '4:0\n(g)': 'Butyric acid (g)',
    '6:0\n(g)': 'Caproic acid (g)',
    '8:0\n(g)': 'Caprylic acid (g)',
    '10:0\n(g)': 'Capric acid (g)',
    '12:0\n(g)': 'Lauric acid (g)',
    '14:0\n(g)': 'Myristic acid (g)',
    '16:0\n(g)': 'Palmitic acid (g)',
    '18:0\n(g)': 'Stearic acid (g)',
    '16:1\n(g)': 'Palmitoleic acid (g)',
    '18:1\n(g)': 'Oleic acid (g)',
    '20:1\n(g)': 'Gadoleic acid (g)',
    '22:1\n(g)': 'Erucic acid (g)',
    '18:2\n(g)': 'Linoleic acid (g)',
    '18:3\n(g)': 'Alpha-linolenic acid (g)',
    '18:4\n(g)': 'Stearidonic acid (g)',
    '20:4\n(g)': 'Arachidonic acid (g)',
    '20:5 n-3\n(g)': 'Eicosapentaenoic acid (EPA) (g)',
    '22:5 n-3\n(g)': 'Docosapentaenoic acid (DPA) (g)',
    '22:6 n-3\n(g)': 'Docosahexaenoic acid (DHA) (g)'
}

# Rename columns in-place
nutrition.columns = [fatty_acids_mapping.get(col, col) for col in nutrition.columns]


In [6]:
# converting all of them to same units. We are converting every unit to grams for all columns as it is standard unit. 

# Identify columns with 'mg' or 'mcg' in their names
mg_columns = [col for col in nutrition.columns if 'mg' in col]
mcg_columns = [col for col in nutrition.columns if 'mcg' in col]

# Convert columns to grams (1 mg = 0.001 g, 1 mcg = 0.000001 g)
nutrition[mg_columns] = nutrition[mg_columns] * 0.001
nutrition[mcg_columns] = nutrition[mcg_columns] * 0.000001


In [7]:
#checking the conversion of data
nutrition.head()

Unnamed: 0,Food code,Main food description,WWEIA Category number,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),"Sugars, total\n(g)","Fiber, total dietary (g)",Total Fat (g),"Fatty acids, total saturated (g)","Fatty acids, total monounsaturated (g)","Fatty acids, total polyunsaturated (g)",Cholesterol (mg),Retinol (mcg),"Vitamin A, RAE (mcg_RAE)","Carotene, alpha (mcg)","Carotene, beta (mcg)","Cryptoxanthin, beta (mcg)",Lycopene (mcg),Lutein + zeaxanthin (mcg),Thiamin (mg),Riboflavin (mg),Niacin (mg),Vitamin B-6 (mg),Folic acid (mcg),"Folate, food (mcg)","Folate, DFE (mcg_DFE)","Folate, total (mcg)","Choline, total (mg)",Vitamin B-12 (mcg),"Vitamin B-12, added\n(mcg)",Vitamin C (mg),Vitamin D (D2 + D3) (mcg),Vitamin E (alpha-tocopherol) (mg),"Vitamin E, added\n(mg)",Vitamin K (phylloquinone) (mcg),Calcium (mg),Phosphorus (mg),Magnesium (mg),Iron\n(mg),Zinc\n(mg),Copper (mg),Selenium (mcg),Potassium (mg),Sodium (mg),Caffeine (mg),Theobromine (mg),Alcohol (g),Butyric acid (g),Caproic acid (g),Caprylic acid (g),Capric acid (g),Lauric acid (g),Myristic acid (g),Palmitic acid (g),Stearic acid (g),Palmitoleic acid (g),Oleic acid (g),Gadoleic acid (g),Erucic acid (g),Linoleic acid (g),Alpha-linolenic acid (g),Stearidonic acid (g),Arachidonic acid (g),Eicosapentaenoic acid (EPA) (g),Docosapentaenoic acid (DPA) (g),Docosahexaenoic acid (DHA) (g),Water\n(g)
0,11000000,"Milk, human",9602,Human milk,70,1.03,6.89,6.89,0.0,4.38,2.009,1.658,0.497,0.014,6e-05,6.1e-05,0.0,7e-06,0.0,0.0,0.0,1.4e-05,3.6e-05,0.000177,1.1e-05,0.0,5e-06,5e-06,5e-06,0.016,5e-08,0.0,0.005,1e-07,8e-05,0.0,3e-07,0.032,0.014,0.003,3e-05,0.00017,5.2e-05,2e-06,0.051,0.017,0.0,0.0,0.0,0.0,0.0,0.0,0.063,0.256,0.321,0.919,0.293,0.129,1.475,0.04,0.0,0.374,0.052,0.0,0.026,0.0,0.0,0.0,87.5
1,11100000,"Milk, NFS",1004,"Milk, reduced fat",52,3.33,4.83,4.88,0.0,2.14,1.249,0.458,0.07,0.009,5.7e-05,5.8e-05,0.0,4e-06,0.0,0.0,0.0,5.7e-05,0.000137,0.00011,6.1e-05,0.0,1e-06,1e-06,1e-06,0.0179,5.6e-07,0.0,0.0001,1.1e-06,3e-05,0.0,2e-07,0.125,0.103,0.012,0.0,0.00043,1e-06,2e-06,0.156,0.039,0.0,0.0,0.0,0.046,0.036,0.023,0.056,0.065,0.204,0.576,0.208,0.032,0.465,0.002,0.0,0.074,0.008,0.0,0.003,0.0,0.001,0.0,88.92
2,11111000,"Milk, whole",1002,"Milk, whole",61,3.27,4.63,4.81,0.0,3.2,1.86,0.688,0.108,0.012,3.1e-05,3.2e-05,0.0,7e-06,0.0,0.0,0.0,5.6e-05,0.000138,0.000105,6.1e-05,0.0,0.0,0.0,0.0,0.0178,5.4e-07,0.0,0.0,1.1e-06,5e-05,0.0,3e-07,0.123,0.101,0.012,0.0,0.00042,1e-06,2e-06,0.15,0.038,0.0,0.0,0.0,0.067,0.054,0.034,0.084,0.097,0.303,0.857,0.309,0.047,0.694,0.004,0.0,0.115,0.013,0.0,0.004,0.001,0.002,0.0,88.1
3,11112110,"Milk, reduced fat (2%)",1004,"Milk, reduced fat",50,3.36,4.9,4.89,0.0,1.9,1.11,0.4,0.058,0.008,8.3e-05,8.3e-05,0.0,3e-06,0.0,0.0,0.0,5.9e-05,0.000137,0.000112,6.1e-05,0.0,2e-06,2e-06,2e-06,0.0182,5.5e-07,0.0,0.0002,1.1e-06,3e-05,0.0,2e-07,0.126,0.103,0.012,0.0,0.00043,1e-06,2e-06,0.159,0.039,0.0,0.0,0.0,0.041,0.032,0.021,0.049,0.058,0.181,0.512,0.184,0.029,0.41,0.002,0.0,0.061,0.007,0.0,0.003,0.0,0.001,0.0,89.1
4,11112210,"Milk, low fat (1%)",1006,"Milk, lowfat",43,3.38,5.18,4.96,0.0,0.95,0.568,0.21,0.032,0.005,5.8e-05,5.8e-05,0.0,1e-06,0.0,0.0,0.0,5.7e-05,0.00014,0.000113,6e-05,0.0,2e-06,2e-06,2e-06,0.0174,6.1e-07,0.0,0.0,1.1e-06,2e-05,0.0,1e-07,0.126,0.103,0.012,0.0,0.00043,1e-06,2e-06,0.159,0.039,0.0,0.0,0.0,0.022,0.015,0.011,0.023,0.026,0.093,0.265,0.096,0.014,0.214,0.001,0.0,0.033,0.004,0.0,0.001,0.0,0.0,0.0,89.7


In [8]:
# Define a function to remove units from column names
def remove_units(col):
    return col.split('(')[0].strip()

# Apply the remove_units function to all column names
nutrition.columns = [remove_units(col) for col in nutrition.columns]

In [9]:
nutrition.columns

Index(['Food code', 'Main food description', 'WWEIA Category number',
       'WWEIA Category description', 'Energy', 'Protein', 'Carbohydrate',
       'Sugars, total', 'Fiber, total dietary', 'Total Fat',
       'Fatty acids, total saturated', 'Fatty acids, total monounsaturated',
       'Fatty acids, total polyunsaturated', 'Cholesterol', 'Retinol',
       'Vitamin A, RAE', 'Carotene, alpha', 'Carotene, beta',
       'Cryptoxanthin, beta', 'Lycopene', 'Lutein + zeaxanthin', 'Thiamin',
       'Riboflavin', 'Niacin', 'Vitamin B-6', 'Folic acid', 'Folate, food',
       'Folate, DFE', 'Folate, total', 'Choline, total', 'Vitamin B-12',
       'Vitamin B-12, added', 'Vitamin C', 'Vitamin D', 'Vitamin E',
       'Vitamin E, added', 'Vitamin K', 'Calcium', 'Phosphorus', 'Magnesium',
       'Iron', 'Zinc', 'Copper', 'Selenium', 'Potassium', 'Sodium', 'Caffeine',
       'Theobromine', 'Alcohol', 'Butyric acid', 'Caproic acid',
       'Caprylic acid', 'Capric acid', 'Lauric acid', 'Myristic acid

In [10]:
# Check for NaN values in the entire DataFrame
nan_check_nutrition = nutrition.isna().sum()

# Display the columns with NaN values (if any)
columns_with_nan_nutrition = nan_check_nutrition[nan_check_nutrition > 0]
print("Columns with NaN values in the nutrition DataFrame:")
print(columns_with_nan_nutrition)


Columns with NaN values in the nutrition DataFrame:
Series([], dtype: int64)


In [11]:
# removing the food items whose food code or description is missing. That means the item doesn't exists.
nutrition_cleaned = nutrition.dropna(subset=["Food code", "Main food description"])


In [12]:
# Check for NaN values in the entire cleaned DataFrame
nan_check_nutrition_cleaned = nutrition_cleaned.isna().sum()

# Display the columns with NaN values (if any)
columns_with_nan_nutrition_cleaned = nan_check_nutrition_cleaned[nan_check_nutrition_cleaned > 0]
print("Columns with NaN values in the cleaned nutrition DataFrame:")
print(columns_with_nan_nutrition_cleaned)


Columns with NaN values in the cleaned nutrition DataFrame:
Series([], dtype: int64)


In [13]:
nutrition= nutrition_cleaned

In [14]:
nutrition.describe()

Unnamed: 0,Food code,Energy,Protein,Carbohydrate,"Sugars, total","Fiber, total dietary",Total Fat,"Fatty acids, total saturated","Fatty acids, total monounsaturated","Fatty acids, total polyunsaturated",Cholesterol,Retinol,"Vitamin A, RAE","Carotene, alpha","Carotene, beta","Cryptoxanthin, beta",Lycopene,Lutein + zeaxanthin,Thiamin,Riboflavin,Niacin,Vitamin B-6,Folic acid,"Folate, food","Folate, DFE","Folate, total","Choline, total",Vitamin B-12,"Vitamin B-12, added",Vitamin C,Vitamin D,Vitamin E,"Vitamin E, added",Vitamin K,Calcium,Phosphorus,Magnesium,Iron,Zinc,Copper,Selenium,Potassium,Sodium,Caffeine,Theobromine,Alcohol,Butyric acid,Caproic acid,Caprylic acid,Capric acid,Lauric acid,Myristic acid,Palmitic acid,Stearic acid,Palmitoleic acid,Oleic acid,Gadoleic acid,Erucic acid,Linoleic acid,Alpha-linolenic acid,Stearidonic acid,Arachidonic acid,Eicosapentaenoic acid,Docosapentaenoic acid,Docosahexaenoic acid,Water
count,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0,5624.0
mean,50642200.0,197.013514,8.105884,20.547002,6.800884,1.726369,9.226767,2.833779,3.264583,2.251587,0.03402,4e-05,6.3e-05,4.5e-05,0.000249,6e-06,0.000288,0.000221,0.000158,0.000169,0.002365,0.00017,1.4e-05,2.1e-05,4.5e-05,3.5e-05,0.032855,4.762998e-07,5.283784e-08,0.005465,4.317568e-07,0.00115,0.000118,1.591691e-05,0.072253,0.126781,0.02778,0.001537,0.001103,0.00013,1.2e-05,0.210743,0.340061,0.003204,0.004478,0.205903,0.039454,0.026928,0.032332,0.052506,0.137338,0.226153,1.545343,0.669736,0.130413,3.069101,0.037238,0.0045,1.990056,0.206391,0.001121,0.013849,0.007483,0.003662,0.011531,60.295525
std,24140170.0,143.244099,7.766278,21.097697,12.08156,2.483058,11.661257,4.206053,4.862003,4.190374,0.080441,0.00018,0.000196,0.000267,0.000928,4.2e-05,0.001261,0.001004,0.000414,0.000312,0.003453,0.000255,5.5e-05,4.3e-05,0.000109,7.5e-05,0.045768,1.749874e-06,4.072588e-07,0.015207,1.409252e-06,0.003321,0.001551,6.018175e-05,0.117282,0.124647,0.040393,0.002864,0.002214,0.000331,2.8e-05,0.211374,0.340592,0.090411,0.038474,1.970841,0.134257,0.087426,0.158122,0.171991,0.915238,0.593345,2.038723,1.056966,0.27596,4.673998,0.103739,0.077104,3.766267,0.903565,0.008889,0.032375,0.059013,0.018862,0.087052,26.576561
min,11000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27410240.0,87.0,2.17,5.45,0.7,0.2,2.01,0.458,0.474,0.31575,0.0,0.0,1e-06,0.0,0.0,0.0,0.0,0.0,4e-05,5.7e-05,0.000452,5.2e-05,0.0,5e-06,8e-06,7e-06,0.0095,0.0,0.0,0.0,0.0,0.00021,0.0,9e-07,0.014,0.047,0.012,0.00043,0.00032,4.4e-05,1e-06,0.111,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006,0.28275,0.085,0.008,0.43875,0.0,0.0,0.23775,0.028,0.0,0.0,0.0,0.0,0.0,46.085
50%,53710800.0,165.0,6.03,13.92,2.33,1.1,5.8,1.469,1.911,1.087,0.007,5e-06,1.7e-05,0.0,9e-06,0.0,0.0,1.8e-05,8.3e-05,0.000118,0.001413,0.000105,0.0,1.2e-05,2.2e-05,2e-05,0.0186,1.3e-07,0.0,0.0006,0.0,0.00054,0.0,3.8e-06,0.034,0.1,0.02,0.001,0.00067,7.6e-05,7e-06,0.177,0.31,0.0,0.0,0.0,0.0,0.0,0.001,0.004,0.005,0.0505,0.857,0.323,0.041,1.7645,0.014,0.0,0.943,0.094,0.0,0.002,0.0,0.0,0.0,67.56
75%,71405010.0,272.0,11.5125,26.04,6.99,2.2,12.94,3.76825,4.4285,2.543,0.04025,3.6e-05,5.9e-05,1e-06,7.9e-05,1e-06,0.0,9e-05,0.000195,0.000217,0.003307,0.000195,1.6e-05,2.5e-05,5.8e-05,4.5e-05,0.0392,4.6e-07,0.0,0.0049,3e-07,0.00119,0.0,1e-05,0.091,0.173,0.028,0.00183,0.00129,0.000123,1.9e-05,0.258,0.456,0.0,0.0,0.0,0.029,0.021,0.017,0.037,0.051,0.217,2.14025,0.834,0.153,4.03525,0.039,0.0,2.21775,0.2,0.0,0.01325,0.001,0.002,0.0,80.49
max,99998210.0,902.0,78.13,100.0,99.8,42.8,100.0,82.5,71.6,67.849,3.075,0.007667,0.007683,0.004655,0.014134,0.001922,0.045902,0.015643,0.023375,0.0175,0.1275,0.004271,0.002993,0.00234,0.005881,0.003786,0.8202,8.247e-05,8.3e-06,0.56,2.6e-05,0.1494,0.05192,0.00164,1.375,1.429,0.611,0.0641,0.09886,0.014472,0.001917,6.04,7.851,5.714,2.057,33.4,3.226,1.91,6.8,5.39,41.8,16.7,26.166,23.263,12.725,71.0,3.648,5.166,65.7,53.368,0.224,0.505,2.741,0.391,3.8,99.98


In [15]:
nutrition.columns

Index(['Food code', 'Main food description', 'WWEIA Category number',
       'WWEIA Category description', 'Energy', 'Protein', 'Carbohydrate',
       'Sugars, total', 'Fiber, total dietary', 'Total Fat',
       'Fatty acids, total saturated', 'Fatty acids, total monounsaturated',
       'Fatty acids, total polyunsaturated', 'Cholesterol', 'Retinol',
       'Vitamin A, RAE', 'Carotene, alpha', 'Carotene, beta',
       'Cryptoxanthin, beta', 'Lycopene', 'Lutein + zeaxanthin', 'Thiamin',
       'Riboflavin', 'Niacin', 'Vitamin B-6', 'Folic acid', 'Folate, food',
       'Folate, DFE', 'Folate, total', 'Choline, total', 'Vitamin B-12',
       'Vitamin B-12, added', 'Vitamin C', 'Vitamin D', 'Vitamin E',
       'Vitamin E, added', 'Vitamin K', 'Calcium', 'Phosphorus', 'Magnesium',
       'Iron', 'Zinc', 'Copper', 'Selenium', 'Potassium', 'Sodium', 'Caffeine',
       'Theobromine', 'Alcohol', 'Butyric acid', 'Caproic acid',
       'Caprylic acid', 'Capric acid', 'Lauric acid', 'Myristic acid

In [16]:
input_vars = ['Energy', 'Protein', 'Carbohydrate',
       'Sugars, total', 'Fiber, total dietary', 'Total Fat',
       'Fatty acids, total saturated', 'Fatty acids, total monounsaturated',
       'Fatty acids, total polyunsaturated', 'Cholesterol', 'Retinol',
       'Vitamin A, RAE', 'Carotene, alpha', 'Carotene, beta',
       'Cryptoxanthin, beta', 'Lycopene', 'Lutein + zeaxanthin', 'Thiamin',
       'Riboflavin', 'Niacin', 'Vitamin B-6', 'Folic acid', 'Folate, food',
       'Folate, DFE', 'Folate, total', 'Choline, total', 'Vitamin B-12',
       'Vitamin B-12, added', 'Vitamin C', 'Vitamin D', 'Vitamin E',
       'Vitamin E, added', 'Vitamin K', 'Calcium', 'Phosphorus', 'Magnesium',
       'Iron', 'Zinc', 'Copper', 'Selenium', 'Potassium', 'Sodium', 'Caffeine',
       'Theobromine', 'Alcohol', 'Butyric acid', 'Caproic acid',
       'Caprylic acid', 'Capric acid', 'Lauric acid', 'Myristic acid',
       'Palmitic acid', 'Stearic acid', 'Palmitoleic acid', 'Oleic acid',
       'Gadoleic acid', 'Erucic acid', 'Linoleic acid', 'Alpha-linolenic acid',
       'Stearidonic acid', 'Arachidonic acid', 'Eicosapentaenoic acid',
       'Docosapentaenoic acid', 'Docosahexaenoic acid', 'Water']

len(input_vars)

65

In [17]:
scaled_feats = ['Energy', 'Protein', 'Carbohydrate',
       'Sugars, total', 'Fiber, total dietary', 'Total Fat',
       'Fatty acids, total saturated', 'Fatty acids, total monounsaturated',
       'Fatty acids, total polyunsaturated', 'Cholesterol', 'Retinol',
       'Vitamin A, RAE', 'Carotene, alpha', 'Carotene, beta',
       'Cryptoxanthin, beta', 'Lycopene', 'Lutein + zeaxanthin', 'Thiamin',
       'Riboflavin', 'Niacin', 'Vitamin B-6', 'Folic acid', 'Folate, food',
       'Folate, DFE', 'Folate, total', 'Choline, total', 'Vitamin B-12',
       'Vitamin B-12, added', 'Vitamin C', 'Vitamin D', 'Vitamin E',
       'Vitamin E, added', 'Vitamin K', 'Calcium', 'Phosphorus', 'Magnesium',
       'Iron', 'Zinc', 'Copper', 'Selenium', 'Potassium', 'Sodium', 'Caffeine',
       'Theobromine', 'Alcohol', 'Butyric acid', 'Caproic acid',
       'Caprylic acid', 'Capric acid', 'Lauric acid', 'Myristic acid',
       'Palmitic acid', 'Stearic acid', 'Palmitoleic acid', 'Oleic acid',
       'Gadoleic acid', 'Erucic acid', 'Linoleic acid', 'Alpha-linolenic acid',
       'Stearidonic acid', 'Arachidonic acid', 'Eicosapentaenoic acid',
       'Docosapentaenoic acid', 'Docosahexaenoic acid', 'Water']

localdf = nutrition.loc[:, input_vars]

In [18]:
from sklearn.preprocessing import RobustScaler, StandardScaler

sc = StandardScaler()

rs = RobustScaler()

for i in scaled_feats:
    localdf.loc[:, i] = sc.fit_transform(pd.DataFrame(localdf.loc[:, i]))

In [19]:
nutrition.head(10)

Unnamed: 0,Food code,Main food description,WWEIA Category number,WWEIA Category description,Energy,Protein,Carbohydrate,"Sugars, total","Fiber, total dietary",Total Fat,"Fatty acids, total saturated","Fatty acids, total monounsaturated","Fatty acids, total polyunsaturated",Cholesterol,Retinol,"Vitamin A, RAE","Carotene, alpha","Carotene, beta","Cryptoxanthin, beta",Lycopene,Lutein + zeaxanthin,Thiamin,Riboflavin,Niacin,Vitamin B-6,Folic acid,"Folate, food","Folate, DFE","Folate, total","Choline, total",Vitamin B-12,"Vitamin B-12, added",Vitamin C,Vitamin D,Vitamin E,"Vitamin E, added",Vitamin K,Calcium,Phosphorus,Magnesium,Iron,Zinc,Copper,Selenium,Potassium,Sodium,Caffeine,Theobromine,Alcohol,Butyric acid,Caproic acid,Caprylic acid,Capric acid,Lauric acid,Myristic acid,Palmitic acid,Stearic acid,Palmitoleic acid,Oleic acid,Gadoleic acid,Erucic acid,Linoleic acid,Alpha-linolenic acid,Stearidonic acid,Arachidonic acid,Eicosapentaenoic acid,Docosapentaenoic acid,Docosahexaenoic acid,Water
0,11000000,"Milk, human",9602,Human milk,70,1.03,6.89,6.89,0.0,4.38,2.009,1.658,0.497,0.014,6e-05,6.1e-05,0.0,7e-06,0.0,0.0,0.0,1.4e-05,3.6e-05,0.000177,1.1e-05,0.0,5e-06,5e-06,5e-06,0.016,5e-08,0.0,0.005,1e-07,8e-05,0.0,3e-07,0.032,0.014,0.003,3e-05,0.00017,5.2e-05,2e-06,0.051,0.017,0.0,0.0,0.0,0.0,0.0,0.0,0.063,0.256,0.321,0.919,0.293,0.129,1.475,0.04,0.0,0.374,0.052,0.0,0.026,0.0,0.0,0.0,87.5
1,11100000,"Milk, NFS",1004,"Milk, reduced fat",52,3.33,4.83,4.88,0.0,2.14,1.249,0.458,0.07,0.009,5.7e-05,5.8e-05,0.0,4e-06,0.0,0.0,0.0,5.7e-05,0.000137,0.00011,6.1e-05,0.0,1e-06,1e-06,1e-06,0.0179,5.6e-07,0.0,0.0001,1.1e-06,3e-05,0.0,2e-07,0.125,0.103,0.012,0.0,0.00043,1e-06,2e-06,0.156,0.039,0.0,0.0,0.0,0.046,0.036,0.023,0.056,0.065,0.204,0.576,0.208,0.032,0.465,0.002,0.0,0.074,0.008,0.0,0.003,0.0,0.001,0.0,88.92
2,11111000,"Milk, whole",1002,"Milk, whole",61,3.27,4.63,4.81,0.0,3.2,1.86,0.688,0.108,0.012,3.1e-05,3.2e-05,0.0,7e-06,0.0,0.0,0.0,5.6e-05,0.000138,0.000105,6.1e-05,0.0,0.0,0.0,0.0,0.0178,5.4e-07,0.0,0.0,1.1e-06,5e-05,0.0,3e-07,0.123,0.101,0.012,0.0,0.00042,1e-06,2e-06,0.15,0.038,0.0,0.0,0.0,0.067,0.054,0.034,0.084,0.097,0.303,0.857,0.309,0.047,0.694,0.004,0.0,0.115,0.013,0.0,0.004,0.001,0.002,0.0,88.1
3,11112110,"Milk, reduced fat (2%)",1004,"Milk, reduced fat",50,3.36,4.9,4.89,0.0,1.9,1.11,0.4,0.058,0.008,8.3e-05,8.3e-05,0.0,3e-06,0.0,0.0,0.0,5.9e-05,0.000137,0.000112,6.1e-05,0.0,2e-06,2e-06,2e-06,0.0182,5.5e-07,0.0,0.0002,1.1e-06,3e-05,0.0,2e-07,0.126,0.103,0.012,0.0,0.00043,1e-06,2e-06,0.159,0.039,0.0,0.0,0.0,0.041,0.032,0.021,0.049,0.058,0.181,0.512,0.184,0.029,0.41,0.002,0.0,0.061,0.007,0.0,0.003,0.0,0.001,0.0,89.1
4,11112210,"Milk, low fat (1%)",1006,"Milk, lowfat",43,3.38,5.18,4.96,0.0,0.95,0.568,0.21,0.032,0.005,5.8e-05,5.8e-05,0.0,1e-06,0.0,0.0,0.0,5.7e-05,0.00014,0.000113,6e-05,0.0,2e-06,2e-06,2e-06,0.0174,6.1e-07,0.0,0.0,1.1e-06,2e-05,0.0,1e-07,0.126,0.103,0.012,0.0,0.00043,1e-06,2e-06,0.159,0.039,0.0,0.0,0.0,0.022,0.015,0.011,0.023,0.026,0.093,0.265,0.096,0.014,0.214,0.001,0.0,0.033,0.004,0.0,0.001,0.0,0.0,0.0,89.7
5,11113000,"Milk, fat free (skim)",1008,"Milk, nonfat",34,3.43,4.92,5.05,0.0,0.08,0.049,0.017,0.006,0.003,6.4e-05,6.4e-05,0.0,2e-06,0.0,0.0,0.0,5.6e-05,0.000131,0.000118,5.8e-05,0.0,2e-06,2e-06,2e-06,0.0182,5.8e-07,0.0,0.0,1.1e-06,0.0,0.0,0.0,0.132,0.107,0.012,0.0,0.00045,2e-06,2e-06,0.167,0.041,0.0,0.0,0.0,0.003,0.001,0.001,0.002,0.002,0.008,0.021,0.009,0.001,0.018,0.0,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.0,90.8
6,11114300,"Milk, lactose free, low fat (1%)",1006,"Milk, lowfat",43,3.38,5.18,4.96,0.0,0.95,0.568,0.21,0.032,0.005,5.8e-05,5.8e-05,0.0,1e-06,0.0,0.0,0.0,5.7e-05,0.00014,0.000113,6e-05,0.0,2e-06,2e-06,2e-06,0.0174,6.1e-07,0.0,0.0,1.1e-06,2e-05,0.0,1e-07,0.126,0.103,0.012,0.0,0.00043,1e-06,2e-06,0.159,0.039,0.0,0.0,0.0,0.022,0.015,0.011,0.023,0.026,0.093,0.265,0.096,0.014,0.214,0.001,0.0,0.033,0.004,0.0,0.001,0.0,0.0,0.0,89.7
7,11114320,"Milk, lactose free, fat free (skim)",1008,"Milk, nonfat",34,3.43,4.92,5.05,0.0,0.08,0.049,0.017,0.006,0.003,6.4e-05,6.4e-05,0.0,2e-06,0.0,0.0,0.0,5.6e-05,0.000131,0.000118,5.8e-05,0.0,2e-06,2e-06,2e-06,0.0182,5.8e-07,0.0,0.0,1.1e-06,0.0,0.0,0.0,0.132,0.107,0.012,0.0,0.00045,2e-06,2e-06,0.167,0.041,0.0,0.0,0.0,0.003,0.001,0.001,0.002,0.002,0.008,0.021,0.009,0.001,0.018,0.0,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.0,90.8
8,11114330,"Milk, lactose free, reduced fat (2%)",1004,"Milk, reduced fat",50,3.36,4.9,4.89,0.0,1.9,1.11,0.4,0.058,0.008,8.3e-05,8.3e-05,0.0,3e-06,0.0,0.0,0.0,5.9e-05,0.000137,0.000112,6.1e-05,0.0,2e-06,2e-06,2e-06,0.0182,5.5e-07,0.0,0.0002,1.1e-06,3e-05,0.0,2e-07,0.126,0.103,0.012,0.0,0.00043,1e-06,2e-06,0.159,0.039,0.0,0.0,0.0,0.041,0.032,0.021,0.049,0.058,0.181,0.512,0.184,0.029,0.41,0.002,0.0,0.061,0.007,0.0,0.003,0.0,0.001,0.0,89.1
9,11114350,"Milk, lactose free, whole",1002,"Milk, whole",61,3.27,4.63,4.81,0.0,3.2,1.86,0.688,0.108,0.012,3.1e-05,3.2e-05,0.0,7e-06,0.0,0.0,0.0,5.6e-05,0.000138,0.000105,6.1e-05,0.0,0.0,0.0,0.0,0.0178,5.4e-07,0.0,0.0,1.1e-06,5e-05,0.0,3e-07,0.123,0.101,0.012,0.0,0.00042,1e-06,2e-06,0.15,0.038,0.0,0.0,0.0,0.067,0.054,0.034,0.084,0.097,0.303,0.857,0.309,0.047,0.694,0.004,0.0,0.115,0.013,0.0,0.004,0.001,0.002,0.0,88.1


# Nutrient Profiling

1. Developing a scoring system to rank foods based on their nutritional content involves assigning weights to various nutrients. List essential nutrients that are important for health. This can include macronutrients (carbohydrates, proteins, fats) and micronutrients (vitamins, minerals).

2. Macronutrients are the nutrients we need in larger quantities that provide us with energy: in other words, fat, protein and carbohydrate. Micronutrients are mostly vitamins and minerals, and are equally important but consumed in very small amounts. (Reference: https://www.bhf.org.uk/informationsupport/heart-matters-magazine/nutrition/ask-the-expert/macronutrients#:~:text=Macronutrients%20are%20the%20nutrients%20we,consumed%20in%20very%20small%20amounts.)

# Categories: 
    
1. Saturated Fatty Acids:(MIN WEIGHT) Butyric acid
Caproic acid
Caprylic acid
Capric acid
Lauric acid
Myristic acid
Palmitic acid
Stearic acid

2. Monounsaturated Fatty Acids: (MODERATE WEIGHT)
Palmitoleic acid
Oleic acid
Gadoleic acid
Erucic acid

3. Polyunsaturated Fatty Acids: (MODERATE WEIGHT)
Linoleic acid
Alpha-linolenic acid
Stearidonic acid
Arachidonic acid
Eicosapentaenoic acid (EPA)
Docosapentaenoic acid
Docosahexaenoic acid (DHA)

4. Other: (NO WEIGHT)
Caffeine
Theobromine
Alcohol

5. Vitamins:(MODERATE WEIGHT)
Retinol
Vitamin A, RAE
Carotene, alpha
Carotene, beta
Cryptoxanthin, beta
Lycopene
Lutein + zeaxanthin
Thiamin
Riboflavin
Niacin
Vitamin B-6
Folic acid
Folate, food
Folate, DFE
Folate, total
Choline, total
Vitamin B-12
Vitamin B-12, added
Vitamin C
Vitamin D
Vitamin E
Vitamin E, added
Vitamin K

6. Minerals:(MODERATE WEIGHT)
Calcium
Phosphorus
Magnesium
Iron
Zinc
Copper
Selenium
Potassium
Sodium

7. Water (MAXIMUM WEIGHT)

8. Carbohydrates (MAXIMUM WEIGHT)

9. Protein (MAXIMUM WEIGHT)

10. Fiber, total dietary (MAXIMUM WEIGHT)

In [20]:
def calculate_nutritional_score(row, weights):
    # Nutrient categories
    saturated_fatty_acids = ['Butyric acid', 'Caproic acid', 'Caprylic acid', 'Capric acid',
                             'Lauric acid', 'Myristic acid', 'Palmitic acid', 'Stearic acid']

    monounsaturated_fatty_acids = ['Palmitoleic acid', 'Oleic acid', 'Gadoleic acid', 'Erucic acid']

    polyunsaturated_fatty_acids = ['Linoleic acid', 'Alpha-linolenic acid', 'Stearidonic acid',
                                   'Arachidonic acid', 'Eicosapentaenoic acid', 'Docosapentaenoic acid',
                                   'Docosahexaenoic acid']

    vitamins = ['Retinol', 'Vitamin A, RAE', 'Carotene, alpha', 'Carotene, beta', 'Cryptoxanthin, beta',
                'Lycopene', 'Lutein + zeaxanthin', 'Thiamin', 'Riboflavin', 'Niacin', 'Vitamin B-6',
                'Folic acid', 'Folate, food', 'Folate, DFE', 'Folate, total', 'Choline, total',
                'Vitamin B-12', 'Vitamin B-12, added', 'Vitamin C', 'Vitamin D', 'Vitamin E',
                'Vitamin E, added', 'Vitamin K']

    minerals = ['Calcium', 'Phosphorus', 'Magnesium', 'Iron', 'Zinc', 'Copper', 'Selenium',
                'Potassium', 'Sodium']

    # Initialize total score
    total_score = 0

    # Calculate score for each nutrient category
    for nutrient, weight in weights.items():
        if nutrient == 'Saturated Fatty Acids':
            total_score += sum(row[fatty_acid] for fatty_acid in saturated_fatty_acids) * weight
        elif nutrient == 'Monounsaturated Fatty Acids':
            total_score += sum(row[fatty_acid] for fatty_acid in monounsaturated_fatty_acids) * weight
        elif nutrient == 'Polyunsaturated Fatty Acids':
            total_score += sum(row[fatty_acid] for fatty_acid in polyunsaturated_fatty_acids) * weight
        elif nutrient == 'Vitamins':
            total_score += sum(row[vitamin] for vitamin in vitamins) * weight
        elif nutrient == 'Minerals':
            total_score += sum(row[mineral] for mineral in minerals) * weight
        else:
            total_score += row[nutrient] * weight

    return total_score / 100

def query_and_calculate_score(df, food_code_column, food_code, weights):
    selected_food = df[df[food_code_column] == food_code]

    if not selected_food.empty:
        selected_food = selected_food.iloc[0]
        nutrient_data = {  # Extract nutrient data from the selected row
            'Water': selected_food['Water'],
            'Carbohydrate': selected_food['Carbohydrate'],
            'Protein': selected_food['Protein'],
            'Fiber, total dietary': selected_food['Fiber, total dietary'],
            'Monounsaturated Fatty Acids': sum(selected_food[fatty_acid] for fatty_acid in monounsaturated_fatty_acids),
            'Polyunsaturated Fatty Acids': sum(selected_food[fatty_acid] for fatty_acid in polyunsaturated_fatty_acids),
            'Vitamins': sum(selected_food[vitamin] for vitamin in vitamins),
            'Minerals': sum(selected_food[mineral] for mineral in minerals),
            'Saturated Fatty Acids': sum(selected_food[fatty_acid] for fatty_acid in saturated_fatty_acids)
        }

        # Calculate the nutritional score
        score = calculate_nutritional_score(nutrient_data, weights)

        # Print the score
        print(f'Nutritional Score for {selected_food["Main food description"]}: {score}')
    else:
        print(f'Food with code {food_code} not found in the dataset.')

# Specify weights for each nutrient category
nutrient_weights = {
    'Water': 15,
    'Carbohydrate': 10,
    'Protein': 10,
    'Fiber, total dietary': 10,
    'Monounsaturated Fatty Acids': 10,
    'Polyunsaturated Fatty Acids': 10,
    'Vitamins': 5,
    'Minerals': 5,
    'Saturated Fatty Acids': 0
}

# Ask the user to input the food code
food_code_input = input("Enter Food Code: ")

# Calculate and print the nutritional score
query_and_calculate_score(nutrition, 'Food code', food_code_input, nutrient_weights)


Enter Food Code: 11112110
Food with code 11112110 not found in the dataset.


In [23]:
nutrition.columns


Index(['Food code', 'Main food description', 'WWEIA Category number',
       'WWEIA Category description', 'Energy', 'Protein', 'Carbohydrate',
       'Sugars, total', 'Fiber, total dietary', 'Total Fat',
       'Fatty acids, total saturated', 'Fatty acids, total monounsaturated',
       'Fatty acids, total polyunsaturated', 'Cholesterol', 'Retinol',
       'Vitamin A, RAE', 'Carotene, alpha', 'Carotene, beta',
       'Cryptoxanthin, beta', 'Lycopene', 'Lutein + zeaxanthin', 'Thiamin',
       'Riboflavin', 'Niacin', 'Vitamin B-6', 'Folic acid', 'Folate, food',
       'Folate, DFE', 'Folate, total', 'Choline, total', 'Vitamin B-12',
       'Vitamin B-12, added', 'Vitamin C', 'Vitamin D', 'Vitamin E',
       'Vitamin E, added', 'Vitamin K', 'Calcium', 'Phosphorus', 'Magnesium',
       'Iron', 'Zinc', 'Copper', 'Selenium', 'Potassium', 'Sodium', 'Caffeine',
       'Theobromine', 'Alcohol', 'Butyric acid', 'Caproic acid',
       'Caprylic acid', 'Capric acid', 'Lauric acid', 'Myristic acid

In [27]:
# def calculate_nutritional_score(row, weights, saturated_fatty_acids, monounsaturated_fatty_acids, polyunsaturated_fatty_acids, vitamins, minerals):
#     saturated_fatty_acids = ['Butyric acid', 'Caproic acid', 'Caprylic acid', 'Capric acid',
#                              'Lauric acid', 'Myristic acid', 'Palmitic acid', 'Stearic acid']

#     monounsaturated_fatty_acids = ['Palmitoleic acid', 'Oleic acid', 'Gadoleic acid', 'Erucic acid']

#     polyunsaturated_fatty_acids = ['Linoleic acid', 'Alpha-linolenic acid', 'Stearidonic acid',
#                                    'Arachidonic acid', 'Eicosapentaenoic acid', 'Docosapentaenoic acid',
#                                    'Docosahexaenoic acid']

#     vitamins = ['Retinol', 'Vitamin A, RAE', 'Carotene, alpha', 'Carotene, beta', 'Cryptoxanthin, beta',
#                 'Lycopene', 'Lutein + zeaxanthin', 'Thiamin', 'Riboflavin', 'Niacin', 'Vitamin B-6',
#                 'Folic acid', 'Folate, food', 'Folate, DFE', 'Folate, total', 'Choline, total',
#                 'Vitamin B-12', 'Vitamin B-12, added', 'Vitamin C', 'Vitamin D', 'Vitamin E',
#                 'Vitamin E, added', 'Vitamin K']

#     minerals = ['Calcium', 'Phosphorus', 'Magnesium', 'Iron', 'Zinc', 'Copper', 'Selenium',
#                 'Potassium', 'Sodium']

def query_and_calculate_score(df, food_code_column, food_code, weights, saturated_fatty_acids, monounsaturated_fatty_acids, polyunsaturated_fatty_acids, vitamins, minerals):
    selected_food = df[df[food_code_column] == food_code]

    if not selected_food.empty:
        selected_food = selected_food.iloc[0]
        nutrient_data = {
            'Water': selected_food['Water'],
            'Carbohydrate': selected_food['Carbohydrate'],
            'Protein': selected_food['Protein'],
            'Fiber, total dietary': selected_food['Fiber, total dietary'],
            'Monounsaturated Fatty Acids': sum(selected_food[fatty_acid] for fatty_acid in monounsaturated_fatty_acids),
            'Polyunsaturated Fatty Acids': sum(selected_food[fatty_acid] for fatty_acid in polyunsaturated_fatty_acids),
            'Vitamins': sum(selected_food[vitamin] for vitamin in vitamins),
            'Minerals': sum(selected_food[mineral] for mineral in minerals),
            'Saturated Fatty Acids': sum(selected_food[fatty_acid] for fatty_acid in saturated_fatty_acids)
        }

        # Calculate the nutritional score
        score = calculate_nutritional_score(nutrient_data, weights, saturated_fatty_acids, monounsaturated_fatty_acids, polyunsaturated_fatty_acids, vitamins, minerals)

        # Print the score
        print(f'Nutritional Score for {selected_food["Main food description"]}: {score}')
    else:
        print(f'Food with code {food_code} not found in the dataset.')

# Specify weights for each nutrient category
nutrient_weights = {
    'Water': 15,
    'Carbohydrate': 10,
    'Protein': 10,
    'Fiber, total dietary': 10,
    'Monounsaturated Fatty Acids': 10,
    'Polyunsaturated Fatty Acids': 10,
    'Vitamins': 5,
    'Minerals': 5,
    'Saturated Fatty Acids': 0
}

# Specify nutrient categories
saturated_fatty_acids = ['Butyric acid', 'Caproic acid', 'Caprylic acid', 'Capric acid',
                         'Lauric acid', 'Myristic acid', 'Palmitic acid', 'Stearic acid']

monounsaturated_fatty_acids = ['Palmitoleic acid', 'Oleic acid', 'Gadoleic acid', 'Erucic acid']

polyunsaturated_fatty_acids = ['Linoleic acid', 'Alpha-linolenic acid', 'Stearidonic acid',
                               'Arachidonic acid', 'Eicosapentaenoic acid', 'Docosapentaenoic acid',
                               'Docosahexaenoic acid']

vitamins = ['Retinol', 'Vitamin A, RAE', 'Carotene, alpha', 'Carotene, beta', 'Cryptoxanthin, beta',
            'Lycopene', 'Lutein + zeaxanthin', 'Thiamin', 'Riboflavin', 'Niacin', 'Vitamin B-6',
            'Folic acid', 'Folate, food', 'Folate, DFE', 'Folate, total', 'Choline, total',
            'Vitamin B-12', 'Vitamin B-12, added', 'Vitamin C', 'Vitamin D', 'Vitamin E',
            'Vitamin E, added', 'Vitamin K']

minerals = ['Calcium', 'Phosphorus', 'Magnesium', 'Iron', 'Zinc', 'Copper', 'Selenium',
            'Potassium', 'Sodium']

# Ask the user to input the food code
food_code_input = input("Enter Food Code: ")

# Calculate and print the nutritional score
query_and_calculate_score(
    nutrition, 'Food code', food_code_input, nutrient_weights,
    saturated_fatty_acids, monounsaturated_fatty_acids, polyunsaturated_fatty_acids, vitamins, minerals
)

Enter Food Code: 11112110
Food with code 11112110 not found in the dataset.


In [22]:
# corr analysis before standardization of data

corr_matrix = nutrition.corr()

# Setting the threshold for correlation values to be displayed
threshold = 0.5

# Creating a mask to hide correlations below the threshold
mask = abs(corr_matrix) >= threshold

# Create a DataFrame containing only the highly correlated pairs
highly_correlated_pairs = corr_matrix[mask].stack().reset_index()
highly_correlated_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

# Filter out self-correlations and duplicates
highly_correlated_pairs = highly_correlated_pairs[highly_correlated_pairs['Variable 1'] != highly_correlated_pairs['Variable 2']]
highly_correlated_pairs = highly_correlated_pairs.drop_duplicates(subset='Correlation')

# Sort by correlation strength
highly_correlated_pairs = highly_correlated_pairs.sort_values(by='Correlation', ascending=False)

# Display the heatmap
plt.figure(figsize=(13, 11))
sns.heatmap(corr_matrix[mask], annot=True, cmap='YlGnBu')

plt.show()


ValueError: could not convert string to float: 'Milk, human'

Interpretation: 
The correlation values provide insights into the relationships between different nutritional variables for the 5624 food items. Here are some key interpretations:

Energy and Macronutrients:

Energy has a positive correlation with Carbohydrates (0.59), Total Fat (0.81), and various specific fatty acids (e.g., Palmitic acid, Oleic acid).
There is a strong negative correlation between Energy and Water (-0.94), indicating that foods with higher energy content tend to have lower water content.
Protein and Other Nutrients:

Protein has positive correlations with Niacin (0.51), Choline (0.52), and Phosphorus (0.71).
Carbohydrates show positive correlations with Sugars (0.65) and negative correlations with Water (-0.80).
Fats and Fatty Acids:

Total Fat has positive correlations with Fatty acids (saturated, monounsaturated, polyunsaturated) and specific fatty acids like Palmitic acid, Oleic acid, etc.
Fatty acids (saturated) have positive correlations with each other.
Vitamins:

Retinol (Vitamin A) shows strong positive correlations with Vitamin B-12 (0.70) and Copper (0.58).
There are positive correlations among different B-vitamins (Thiamin, Riboflavin, Niacin, Folic acid, B-6).
Minerals:

Calcium has a positive correlation with Phosphorus (0.56), and Phosphorus has positive correlations with Magnesium (0.59).
Others:

There are strong positive correlations between different fatty acids (e.g., Butyric acid, Caproic acid, Lauric acid, Palmitic acid).
There is a positive correlation between Butyric acid and Myristic acid (0.78), and a strong negative correlation between Butyric acid and Water (-0.51).
These correlations provide insights into how the presence of one nutrient may be associated with the presence of another in the food items.

In [None]:
from IPython.display import FileLink

# Save the highly_correlated_pairs DataFrame to a CSV file
highly_correlated_pairs.to_csv('highly_correlated_pairs_rawdata.csv', index=False)

# Create a download link
FileLink('highly_correlated_pairs_rawdata.csv')


In [None]:
# Mapping variables to nutrition categories
nutrition_categories = {
    'Energy': 'Energy',
    'Protein': 'Protein',
    'Carbohydrate': 'Carbohydrate',
    'Sugars, total': 'Carbohydrate',
    'Fiber, total dietary': 'Carbohydrate',
    'Total Fat': 'Total Fat',
    'Fatty acids, total saturated': 'Total Fat',
    'Fatty acids, total monounsaturated': 'Total Fat',
    'Fatty acids, total polyunsaturated': 'Total Fat',
    'Cholesterol': 'Cholesterol',
    'Retinol': 'Vitamins',
    'Vitamin A, RAE': 'Vitamins',
    'Carotene, alpha': 'Vitamins',
    'Carotene, beta': 'Vitamins',
    'Cryptoxanthin, beta': 'Vitamins',
    'Lycopene': 'Vitamins',
    'Lutein + zeaxanthin': 'Vitamins',
    'Thiamin': 'Vitamins',
    'Riboflavin': 'Vitamins',
    'Niacin': 'Vitamins',
    'Vitamin B-6': 'Vitamins',
    'Folic acid': 'Vitamins',
    'Folate, food': 'Vitamins',
    'Folate, DFE': 'Vitamins',
    'Folate, total': 'Vitamins',
    'Choline, total': 'Vitamins',
    'Vitamin B-12': 'Vitamins',
    'Vitamin B-12, added': 'Vitamins',
    'Vitamin C': 'Vitamins',
    'Vitamin D': 'Vitamins',
    'Vitamin E': 'Vitamins',
    'Vitamin E, added': 'Vitamins',
    'Vitamin K': 'Vitamins',
    'Calcium': 'Minerals',
    'Phosphorus': 'Minerals',
    'Magnesium': 'Minerals',
    'Iron': 'Minerals',
    'Zinc': 'Minerals',
    'Copper': 'Minerals',
    'Selenium': 'Minerals',
    'Potassium': 'Minerals',
    'Sodium': 'Minerals',
    'Caffeine': 'Other compounds',
    'Theobromine': 'Other compounds',
    'Alcohol': 'Other compounds',
    'Butyric acid': 'Fatty acids',
    'Caproic acid': 'Fatty acids',
    'Caprylic acid': 'Fatty acids',
    'Capric acid': 'Fatty acids',
    'Lauric acid': 'Fatty acids',
    'Myristic acid': 'Fatty acids',
    'Palmitic acid': 'Fatty acids',
    'Stearic acid': 'Fatty acids',
    'Palmitoleic acid': 'Fatty acids',
    'Oleic acid': 'Fatty acids',
    'Gadoleic acid': 'Fatty acids',
    'Erucic acid': 'Fatty acids',
    'Linoleic acid': 'Fatty acids',
    'Alpha-linolenic acid': 'Fatty acids',
    'Stearidonic acid': 'Fatty acids',
    'Arachidonic acid': 'Fatty acids',
    'Eicosapentaenoic acid': 'Fatty acids',
    'Docosapentaenoic acid': 'Fatty acids',
    'Docosahexaenoic acid': 'Fatty acids',
    'Water': 'Other compounds'
}

# Classify the highly correlated pairs
classified_pairs = highly_correlated_pairs.copy()
classified_pairs['Category 1'] = classified_pairs['Variable 1'].map(nutrition_categories)
classified_pairs['Category 2'] = classified_pairs['Variable 2'].map(nutrition_categories)

In [None]:
classified_pairs
# Save the file
classified_pairs.to_csv('classified_pairs_rawdata.csv', index=False)

Interpretation to be added later: 

In [None]:
input_vars = ['Energy', 'Protein', 'Carbohydrate',
       'Sugars, total', 'Fiber, total dietary', 'Total Fat',
       'Fatty acids, total saturated', 'Fatty acids, total monounsaturated',
       'Fatty acids, total polyunsaturated', 'Cholesterol', 'Retinol',
       'Vitamin A, RAE', 'Carotene, alpha', 'Carotene, beta',
       'Cryptoxanthin, beta', 'Lycopene', 'Lutein + zeaxanthin', 'Thiamin',
       'Riboflavin', 'Niacin', 'Vitamin B-6', 'Folic acid', 'Folate, food',
       'Folate, DFE', 'Folate, total', 'Choline, total', 'Vitamin B-12',
       'Vitamin B-12, added', 'Vitamin C', 'Vitamin D', 'Vitamin E',
       'Vitamin E, added', 'Vitamin K', 'Calcium', 'Phosphorus', 'Magnesium',
       'Iron', 'Zinc', 'Copper', 'Selenium', 'Potassium', 'Sodium', 'Caffeine',
       'Theobromine', 'Alcohol', 'Butyric acid', 'Caproic acid',
       'Caprylic acid', 'Capric acid', 'Lauric acid', 'Myristic acid',
       'Palmitic acid', 'Stearic acid', 'Palmitoleic acid', 'Oleic acid',
       'Gadoleic acid', 'Erucic acid', 'Linoleic acid', 'Alpha-linolenic acid',
       'Stearidonic acid', 'Arachidonic acid', 'Eicosapentaenoic acid',
       'Docosapentaenoic acid', 'Docosahexaenoic acid', 'Water']

len(input_vars)

In [None]:
scaled_feats = ['Energy', 'Protein', 'Carbohydrate',
       'Sugars, total', 'Fiber, total dietary', 'Total Fat',
       'Fatty acids, total saturated', 'Fatty acids, total monounsaturated',
       'Fatty acids, total polyunsaturated', 'Cholesterol', 'Retinol',
       'Vitamin A, RAE', 'Carotene, alpha', 'Carotene, beta',
       'Cryptoxanthin, beta', 'Lycopene', 'Lutein + zeaxanthin', 'Thiamin',
       'Riboflavin', 'Niacin', 'Vitamin B-6', 'Folic acid', 'Folate, food',
       'Folate, DFE', 'Folate, total', 'Choline, total', 'Vitamin B-12',
       'Vitamin B-12, added', 'Vitamin C', 'Vitamin D', 'Vitamin E',
       'Vitamin E, added', 'Vitamin K', 'Calcium', 'Phosphorus', 'Magnesium',
       'Iron', 'Zinc', 'Copper', 'Selenium', 'Potassium', 'Sodium', 'Caffeine',
       'Theobromine', 'Alcohol', 'Butyric acid', 'Caproic acid',
       'Caprylic acid', 'Capric acid', 'Lauric acid', 'Myristic acid',
       'Palmitic acid', 'Stearic acid', 'Palmitoleic acid', 'Oleic acid',
       'Gadoleic acid', 'Erucic acid', 'Linoleic acid', 'Alpha-linolenic acid',
       'Stearidonic acid', 'Arachidonic acid', 'Eicosapentaenoic acid',
       'Docosapentaenoic acid', 'Docosahexaenoic acid', 'Water']

localdf = nutrition.loc[:, input_vars]

In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler

sc = StandardScaler()

rs = RobustScaler()

for i in scaled_feats:
    localdf.loc[:, i] = sc.fit_transform(pd.DataFrame(localdf.loc[:, i]))

In [None]:
# corr analysis after standardization of data

corr_matrix = localdf.corr()

# Setting the threshold for correlation values to be displayed
threshold = 0.5

# Creating a mask to hide correlations below the threshold
mask = abs(corr_matrix) >= threshold

# Create a DataFrame containing only the highly correlated pairs
highly_correlated_pairs_scaled = corr_matrix[mask].stack().reset_index()
highly_correlated_pairs_scaled.columns = ['Variable 1', 'Variable 2', 'Correlation']

# Filter out self-correlations and duplicates
highly_correlated_pairs_scaled = highly_correlated_pairs_scaled[highly_correlated_pairs_scaled['Variable 1'] != highly_correlated_pairs_scaled['Variable 2']]
highly_correlated_pairs_scaled = highly_correlated_pairs_scaled.drop_duplicates(subset='Correlation')

# Sort by correlation strength
highly_correlated_pairs_scaled = highly_correlated_pairs_scaled.sort_values(by='Correlation', ascending=False)

# Display the heatmap
plt.figure(figsize=(13, 11))
sns.heatmap(corr_matrix[mask], annot=True, cmap='YlGnBu')

plt.show()

In [None]:
from IPython.display import FileLink

# Save the highly_correlated_pairs DataFrame to a CSV file
highly_correlated_pairs_scaled.to_csv('highly_correlated_pairs_scaleddata.csv', index=False)

# Create a download link
FileLink('highly_correlated_pairs_scaleddata.csv')


In [None]:
# Mapping variables to nutrition categories
nutrition_categories = {
    'Energy': 'Energy',
    'Protein': 'Protein',
    'Carbohydrate': 'Carbohydrate',
    'Sugars, total': 'Carbohydrate',
    'Fiber, total dietary': 'Carbohydrate',
    'Total Fat': 'Total Fat',
    'Fatty acids, total saturated': 'Total Fat',
    'Fatty acids, total monounsaturated': 'Total Fat',
    'Fatty acids, total polyunsaturated': 'Total Fat',
    'Cholesterol': 'Cholesterol',
    'Retinol': 'Vitamins',
    'Vitamin A, RAE': 'Vitamins',
    'Carotene, alpha': 'Vitamins',
    'Carotene, beta': 'Vitamins',
    'Cryptoxanthin, beta': 'Vitamins',
    'Lycopene': 'Vitamins',
    'Lutein + zeaxanthin': 'Vitamins',
    'Thiamin': 'Vitamins',
    'Riboflavin': 'Vitamins',
    'Niacin': 'Vitamins',
    'Vitamin B-6': 'Vitamins',
    'Folic acid': 'Vitamins',
    'Folate, food': 'Vitamins',
    'Folate, DFE': 'Vitamins',
    'Folate, total': 'Vitamins',
    'Choline, total': 'Vitamins',
    'Vitamin B-12': 'Vitamins',
    'Vitamin B-12, added': 'Vitamins',
    'Vitamin C': 'Vitamins',
    'Vitamin D': 'Vitamins',
    'Vitamin E': 'Vitamins',
    'Vitamin E, added': 'Vitamins',
    'Vitamin K': 'Vitamins',
    'Calcium': 'Minerals',
    'Phosphorus': 'Minerals',
    'Magnesium': 'Minerals',
    'Iron': 'Minerals',
    'Zinc': 'Minerals',
    'Copper': 'Minerals',
    'Selenium': 'Minerals',
    'Potassium': 'Minerals',
    'Sodium': 'Minerals',
    'Caffeine': 'Other compounds',
    'Theobromine': 'Other compounds',
    'Alcohol': 'Other compounds',
    'Butyric acid': 'Fatty acids',
    'Caproic acid': 'Fatty acids',
    'Caprylic acid': 'Fatty acids',
    'Capric acid': 'Fatty acids',
    'Lauric acid': 'Fatty acids',
    'Myristic acid': 'Fatty acids',
    'Palmitic acid': 'Fatty acids',
    'Stearic acid': 'Fatty acids',
    'Palmitoleic acid': 'Fatty acids',
    'Oleic acid': 'Fatty acids',
    'Gadoleic acid': 'Fatty acids',
    'Erucic acid': 'Fatty acids',
    'Linoleic acid': 'Fatty acids',
    'Alpha-linolenic acid': 'Fatty acids',
    'Stearidonic acid': 'Fatty acids',
    'Arachidonic acid': 'Fatty acids',
    'Eicosapentaenoic acid': 'Fatty acids',
    'Docosapentaenoic acid': 'Fatty acids',
    'Docosahexaenoic acid': 'Fatty acids',
    'Water': 'Other compounds'
}

# Classify the highly correlated pairs
classified_pairs_scaled = highly_correlated_pairs_scaled.copy()
classified_pairs_scaled['Category 1'] = classified_pairs_scaled['Variable 1'].map(nutrition_categories)
classified_pairs_scaled['Category 2'] = classified_pairs_scaled['Variable 2'].map(nutrition_categories)

Interpretation to be added later:

In [None]:
classified_pairs
# Save the file
classified_pairs_scaled.to_csv('classified_pairs_scaleddata.csv', index=False)

In [None]:
# Data ready for clustering

localdf = nutrition.loc[:, input_vars]

from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

In [None]:
# Visualizer

kmeans = KMeans()
visualizer = KElbowVisualizer(estimator = kmeans, random_state = 123)

visualizer.fit(localdf)
visualizer.show()

# Use k = 5 to build the clusters...

In [None]:
kmeans = KMeans(n_clusters = 5, 
                random_state = 123)

# Fit the K Means and Generate the Labels and the Centroid

kmeans.fit(localdf)

localdf["cluster_labels"] = kmeans.labels_
nutrition["cluster_labels"] = kmeans.labels_


# Lets use the Cluster Centres to deduce inferences from the data

In [None]:
# Lets take cluster 0 and work on the important features.
nutrition.loc[nutrition.cluster_labels==0, ].describe()

Cluster 0:

Energy (mean): 149.06 - Low to moderate energy content.
Protein (mean): 9.44 - Moderate protein content.
Carbohydrate (mean): 12.81 - Moderate carbohydrate content.
Total Fat (mean): 6.56 - Moderate fat content.
Fiber, total dietary (mean): 1.26 - Moderate fiber content.
Sugars, total (mean): 2.87 - Low sugar content.
Cholesterol (mean): 0.05 - Very low cholesterol content.
Vitamin A, RAE (mean): 0.00 - Low vitamin A content.
Vitamin C (mean): 0.00 - Low vitamin C content.
Calcium (mean): 69.52 - Moderate calcium content.
Iron (mean): 0.00 - Low iron content.
Potassium (mean): 0.00 - Low to moderate potassium content.
Sodium (mean): 82.18 - High sodium content.
Water (mean): 0.00 - Low water content (possibly missing data).
Interpretation:

Foods in Cluster 0 have a nutritional profile characterized by moderate energy, protein, carbohydrate, and fat content. The fiber content is moderate, indicating a balanced combination of macronutrients. Additionally, foods in this cluster have low sugar and very low cholesterol content. The vitamin A and C content is low, while calcium and iron content are also low. The sodium content is relatively high, suggesting a savory or processed nature of the foods. The water content appears to be low, but it's marked as 0.00, which may indicate missing data or need for further investigation.

In summary, Cluster 0 represents foods with a balanced nutritional profile, particularly emphasizing moderate levels of energy, protein, carbohydrate, and fat. It includes items with low sugar and cholesterol, but high sodium content. 

In [None]:
# Lets take cluster 1 and work on the important features.
nutrition.loc[nutrition.cluster_labels==1, ].describe()

Cluster 1:

Energy (mean): 407.51 - High energy content.
Protein (mean): 9.57 - Moderate protein content.
Carbohydrate (mean): 56.87 - High carbohydrate content.
Total Fat (mean): 16.18 - High fat content.
Fiber, total dietary (mean): 3.28 - Moderate fiber content.
Sugars, total (mean): 22.23 - High sugar content.
Cholesterol (mean): 0.02 - Very low cholesterol content.
Vitamin A, RAE (mean): 0.00 - Low vitamin A content.
Vitamin C (mean): 0.00 - Low vitamin C content.
Calcium (mean): 14.88 - Low calcium content.
Iron (mean): 1.00 - Low to moderate iron content.
Potassium (mean): 24.50 - Low potassium content.
Sodium (mean): 1.00 - Very low sodium content.
Water (mean): Data not provided.
Interpretation:

Foods in Cluster 1 have a nutritional profile characterized by high energy, moderate protein, high carbohydrate, and high fat content. The fiber content is moderate, indicating a balanced combination of macronutrients. Additionally, foods in this cluster have high sugar content and very low cholesterol, vitamin A, and vitamin C content. The calcium and potassium content is low, and the sodium content is very low, suggesting a potential focus on processed or unprocessed foods. The water content data is not provided.

In summary, Cluster 1 represents foods with a substantial energy content and a balanced combination of macronutrients, especially high in carbohydrates and fats. 

In [None]:
# Lets take cluster 2 and work on the important features.
nutrition.loc[nutrition.cluster_labels==2, ].describe()

Cluster 2:

Energy (mean): 615.48 - High energy content.
Protein (mean): 11.05 - Moderate protein content.
Carbohydrate (mean): 27.30 - Moderate carbohydrate content.
Total Fat (mean): 53.31 - High fat content.
Fiber, total dietary (mean): 4.82 - Moderate fiber content.
Sugars, total (mean): 7.10 - Low sugar content.
Cholesterol (mean): 0.02 - Very low cholesterol content.
Vitamin A, RAE (mean): 0.00 - Low vitamin A content.
Vitamin C (mean): 0.00 - Low vitamin C content.
Calcium (mean): 5.69 - Very low calcium content.
Iron (mean): 2.00 - Low iron content.
Potassium (mean): Data not provided.
Sodium (mean): Data not provided.
Water (mean): Data not provided.
Interpretation:

Foods in Cluster 2 have a nutritional profile characterized by high energy, moderate protein, and high fat content. The carbohydrate content is moderate, and the fiber content is also moderate, indicating a balanced combination of macronutrients. Additionally, foods in this cluster have low sugar, very low cholesterol, and low to moderate levels of vitamins A and C. The calcium content is very low, and iron content is low. The potassium and sodium content data is not provided.

In summary, Cluster 2 represents foods with a substantial energy content, a balanced combination of macronutrients, and low sugar, cholesterol, vitamins A and C.

In [None]:
# Lets take cluster 3 and work on the important features.
nutrition.loc[nutrition.cluster_labels==3, ].describe()

Cluster 3:

Energy (mean): 54.31 - Low energy content.
Protein (mean): 2.40 - Low protein content.
Carbohydrate (mean): 8.13 - Low carbohydrate content.
Total Fat (mean): 1.40 - Low fat content.
Fiber, total dietary (mean): 1.08 - Low fiber content.
Sugars, total (mean): 4.37 - Low sugar content.
Cholesterol (mean): 0.00 - Very low cholesterol content.
Vitamin A, RAE (mean): 0.00 - Very low vitamin A content.
Vitamin C (mean): 0.00 - Very low vitamin C content.
Calcium (mean): 86.91 - High calcium content.
Iron (mean): 3.00 - Low iron content.
Potassium (mean): Data not provided.
Sodium (mean): Data not provided.
Water (mean): Data not provided.
Interpretation:

Foods in Cluster 3 have a nutritional profile characterized by low energy, low protein, low carbohydrate, low fat, and low fiber content. The sugar content is also low, and both cholesterol and vitamins A and C are very low. However, these foods have high calcium content. The interpretation should be considered with attention to any missing or anomalous data.

In summary, Cluster 3 represents foods with low energy and macronutrient content, very low sugar, cholesterol, vitamins A and C, but high calcium content. 

In [None]:
# Lets take cluster 4 and work on the important features.
nutrition.loc[nutrition.cluster_labels==4, ].describe()

Cluster 4:

Energy (mean): 260.55 - Moderate energy content.
Protein (mean): 12.13 - Moderate protein content.
Carbohydrate (mean): 24.85 - Moderate carbohydrate content.
Total Fat (mean): 12.40 - Moderate fat content.
Fiber, total dietary (mean): 1.84 - Low fiber content.
Sugars, total (mean): 6.33 - Moderate sugar content.
Cholesterol (mean): 0.05 - Low cholesterol content.
Vitamin A, RAE (mean): 0.00 - Very low vitamin A content.
Vitamin C (mean): 0.00 - Very low vitamin C content.
Calcium (mean): 48.27 - Low calcium content.
Iron (mean): 4.00 - Low iron content.
Potassium (mean): Data not provided.
Sodium (mean): Data not provided.
Water (mean): Data not provided.
Interpretation:

Foods in Cluster 4 have a nutritional profile characterized by moderate energy, protein, carbohydrate, and fat content. The sugar content is moderate, and both cholesterol and vitamins A and C are very low. However, these foods have low calcium content. The interpretation should be considered with attention to any missing or anomalous data.

In summary, Cluster 4 represents foods with a balanced macronutrient profile, moderate energy content, moderate sugar content, and low cholesterol and vitamins A and C. The potassium and sodium content data are not provided.

# Comparison of clusters

Cluster 0:

Profile: High in energy, moderate in protein, high in carbohydrate, low in fiber.
Interpretation: Foods in this cluster might include energy-dense items with a significant carbohydrate content. Depending on the specific types of carbohydrates and the presence of dietary fiber, these foods could be part of a balanced diet if consumed in moderation.
Cluster 1:

Profile: High in energy, moderate in protein, high in fat, low in fiber.
Interpretation: This cluster suggests foods with a higher fat content. Depending on the types of fats (saturated, unsaturated) and the overall dietary context, these foods could be part of a balanced diet if consumed in moderation.
Cluster 2:

Profile: High in energy, high in protein, moderate in fat, low in fiber.
Interpretation: Foods in this cluster may have a balanced macronutrient profile, with an emphasis on protein. The lack of fiber might be a consideration for overall dietary health.
Cluster 3:

Profile: Moderate energy, moderate protein, moderate carbohydrate, moderate fat, low in fiber.
Interpretation: This cluster represents foods with a balanced macronutrient profile. Depending on specific nutrient needs, these foods could be part of a healthy diet.
Cluster 4:

Profile: Moderate energy, moderate protein, moderate carbohydrate, moderate fat, low in fiber.
Interpretation: Foods in this cluster also have a balanced macronutrient profile. They may be considered moderate in terms of energy, protein, carbohydrate, and fat. The low levels of vitamins A and C, calcium, and iron suggest a potential need for supplementation or diversification in the diet.
Overall Considerations:

Diversity: It's essential to have a diverse diet that includes a variety of foods from different clusters to ensure a broad range of nutrients.
Moderation: No single cluster is inherently "good" or "bad." The key is moderation and balance. Consuming a variety of nutrient-dense foods in appropriate portions is crucial for overall health.
Individual Needs: Dietary recommendations vary based on individual health conditions, lifestyle, and specific nutritional needs. It's advisable to consult with a healthcare professional or a registered dietitian for personalized advice.

In [None]:
# Define a function to print main food descriptions for each cluster
def print_main_food_descriptions(cluster_data):
    for cluster_label in sorted(cluster_data['cluster_labels'].unique()):
        cluster_subset = cluster_data[cluster_data['cluster_labels'] == cluster_label]
        main_food_descriptions = cluster_subset['Main food description'].head(5).tolist()
        
        print(f"\nCluster {cluster_label} - Main Food Descriptions:")
        for i, food_description in enumerate(main_food_descriptions, start=1):
            print(f"{i}. {food_description}")


print_main_food_descriptions(nutrition)


In [None]:
# Count of items in each cluster
cluster_counts = nutrition['cluster_labels'].value_counts().sort_index()

# Print the counts
print("Cluster Counts:")
print(cluster_counts)


# Alignment of few food items with their cluster descriptions

Alignment Analysis:

Cluster 0: The selected items, especially the dairy products like milk and yogurt, align well with the characteristics of Cluster 0, which is high in energy, moderate in protein, high in carbohydrate, and low in fiber.

Cluster 1: Items like dry milk, whey, and chocolate beverage powder align with the high-energy and moderate-protein characteristics of Cluster 1.

Cluster 2: Coffee creamer, both regular and sugar-free, and reduced-sodium bacon align with the high-energy and moderate-protein characteristics of Cluster 2.

Cluster 3: The milk items, including human milk, align well with the moderate-energy and moderate-protein characteristics of Cluster 3.

Cluster 4: The items in this cluster, such as condensed milk, frozen yogurt, cocoa powder, and various cream products, align with the moderate-energy and moderate-protein characteristics of Cluster 4.

Overall, the alignment seems reasonable

# Which Cluster for diabetic population?

Diabetics often benefit from a diet that helps regulate blood sugar levels, including foods that are low in added sugars, rich in fiber, and moderate in carbohydrates. Let's consider the characteristics of the clusters:

Cluster 0: This cluster appears to contain dairy products like milk and yogurt, which are generally good sources of protein and may provide essential nutrients. However, it's important to consider the fat and carbohydrate content, especially if the products have added sugars.

Cluster 1: This cluster includes dry milk, whey, and chocolate beverage powder. Dry milk and whey can be sources of protein, but chocolate beverage powder may contain added sugars. Diabetics should be cautious with added sugars.

Cluster 2: This cluster includes coffee creamer and reduced-sodium bacon. Coffee creamer, especially flavored and sugar-free varieties, might be a better option for diabetics. Bacon should be consumed in moderation due to its high salt content.

Cluster 3: This cluster contains various types of milk. While milk is a good source of nutrients, individuals with diabetes should be mindful of the carbohydrate content. Choosing lower-fat options may be preferable.

Cluster 4: This cluster contains condensed milk, frozen yogurt, cocoa powder, and various cream products. These items may have high sugar and fat content, so they should be consumed in moderation by individuals with diabetes.

Recommendation:

Based on the general characteristics, Cluster 2 (coffee creamer, both regular and sugar-free) may be a better option for diabetics, as it's likely to be lower in added sugars.
Additionally, Cluster 3 (various types of milk) can be a good source of nutrients. Choosing lower-fat options may be beneficial for those concerned about calorie and carbohydrate intake.

In [None]:
# I don't agree with above cluster! we need to create new features

# Diabetic Food Platter

When designing a diabetic food platter, it's important to focus on nutrients that can help manage blood sugar levels. Here are some key features and considerations:

Carbohydrates:
Pay attention to the total carbohydrate content.
Choose complex carbohydrates with a lower glycemic index (GI), as they have a slower impact on blood sugar levels. Examples include whole grains, legumes, and vegetables.

Fiber:
Increase fiber intake, as it can help stabilize blood sugar levels.
Choose foods high in dietary fiber, such as fruits, vegetables, whole grains, and legumes.

Protein:
Include lean sources of protein to help maintain muscle mass and provide a steady release of energy.
Good protein sources include poultry, fish, tofu, legumes, and low-fat dairy.

Healthy Fats:
Opt for foods rich in monounsaturated and polyunsaturated fats, which can have positive effects on insulin sensitivity.
Sources of healthy fats include avocados, nuts, seeds, and olive oil.

Avoid or Limit:
Foods high in added sugars.
Highly processed and refined carbohydrates.
Saturated and trans fats.

Micronutrients:
Ensure an adequate intake of essential vitamins and minerals, including vitamin D, magnesium, and chromium, which play roles in insulin sensitivity.

Hydration:
Stay well-hydrated with water or other sugar-free beverages.
Portion Control:

Pay attention to portion sizes to manage calorie intake.
It's important to note that individual dietary needs can vary, and consulting with a healthcare professional or a registered dietitian can provide personalized guidance based on specific health conditions and goals.

Given your dataset, you can specifically look at columns such as 'Carbohydrate,' 'Fiber, total dietary,' 'Protein,' and 'Total Fat' when selecting foods for a diabetic-friendly platter. Additionally, considering the overall nutrient profile and avoiding foods high in added sugars and unhealthy fats is crucial.

In [None]:
# Glycemic Index will give a direct direction to blood glucose spiking items and we definitely do not want them. 
# We can lose those food items from 5624 food items and do clustering again. 

# Glycemic Index (GI) and Glycemic Load (GL)

The Glycemic Index (GI) is a measure of how quickly a carbohydrate-containing food raises blood glucose (blood sugar) levels after consumption. It is a scale that ranks carbohydrate-containing foods based on their effect on blood sugar levels compared to a reference food, usually pure glucose or white bread.

Here's a brief overview of how the Glycemic Index works:

High GI Foods:

Foods with a high GI are rapidly digested and absorbed, causing a quick and significant increase in blood glucose levels.
Examples include white bread, sugary cereals, and potatoes.
Medium GI Foods:

Foods with a medium GI are digested and absorbed at a moderate rate, leading to a moderate increase in blood glucose levels.
Examples include whole wheat products and some fruits.
Low GI Foods:

Foods with a low GI are digested and absorbed slowly, resulting in a slower and more gradual increase in blood glucose levels.
Examples include most fruits and vegetables, legumes, and whole grains.
The Glycemic Index is expressed as a numerical value, with glucose assigned a value of 100. Foods are then ranked relative to glucose. A food with a high GI is considered to cause a more rapid spike in blood sugar, while a food with a low GI is thought to have a slower, more sustained effect.

It's important to note that the Glycemic Index provides a general idea of how different foods affect blood sugar, but it may not account for variations in portion sizes or how foods are combined in a meal. The concept of Glycemic Load (GL), which takes into account both the GI of a food and the amount of carbohydrates in a serving, provides a more comprehensive picture of a food's impact on blood sugar.

In [None]:
# How to calculate GI

Calculating the Glycemic Index (GI) requires specific testing on human subjects to measure blood glucose responses to different foods. The process involves feeding participants a portion of a test food containing a standard amount of carbohydrates and then measuring their blood glucose levels over a specific period.

The GI is determined by comparing the blood glucose response to the test food with that of a reference food (usually glucose or white bread). The area under the blood glucose response curve for the test food is expressed as a percentage of the area under the curve for the reference food.

Since GI determination involves human subjects and specific testing conditions, it is not something that can be directly calculated from the nutritional content of a food item. It requires controlled experiments.

However, there is another related concept called the Glycemic Load (GL), which takes into account both the quality and quantity of carbohydrates in a food item. The Glycemic Load is calculated using the following formula:

GL=(GI×Carbohydrate content (g))/100

While GI is a property of the food itself, GL gives a more practical understanding of how a specific amount of a food affects blood sugar. Keep in mind that even GL is typically determined through testing, and published values for various foods are available in databases.

If you're interested in managing blood sugar levels, focusing on factors like carbohydrate quality, fiber content, and overall nutritional balance in your food choices can be beneficial. 

While you may not have the specific data needed for a comprehensive calculation of the Glycemic Index (GI), you can still make some general considerations for managing blood sugar levels based on the available information (Energy and Total Sugar/Carbohydrate).

Total Carbohydrate Content: Pay attention to the total carbohydrate content in foods. While you may not have detailed information on the types of carbohydrates (simple or complex), knowing the overall carbohydrate load is essential.

Fiber Content: If available, consider the fiber content. Fiber slows down the digestion and absorption of carbohydrates, which can help prevent rapid spikes in blood sugar levels. Foods higher in fiber are generally considered better for blood sugar control.

Protein and Fat: Including moderate amounts of protein and healthy fats in meals can also help manage blood sugar levels by slowing down the digestion and absorption of carbohydrates.

Portion Control: Even if a food has a higher carbohydrate content, managing portion sizes can help control the overall carbohydrate intake at a given meal.

Glycemic Load (Approximation): While you may not have the actual GI values, you can make a rough estimation of the Glycemic Load using the formula mentioned earlier: 

GL≈((Total Sugar/Carbohydrate content (g))/100 * Energy content))

This is a simplified approach and doesn't substitute for actual GI values obtained through testing. However, it provides a way to consider the impact of both the quality and quantity of carbohydrates on blood sugar.

# Feature 1: Glycemic Load 

In [None]:
# adding column Glycemic Load to df nutrition
nutrition['Glycemic Load'] = (nutrition['Carbohydrate'] / 100) * nutrition['Energy']


In [None]:
nutrition

In [None]:
gl_description = nutrition['Glycemic Load'].describe()

# Print the result
print(gl_description)


In [None]:
# Function to classify GL levels
def classify_gl(gl_value):
    if gl_value < low_gl_threshold:
        return 'Low'
    elif low_gl_threshold <= gl_value <= medium_gl_threshold:
        return 'Medium'
    else:
        return 'High'

# Apply the function to create a new column for interpretation
nutrition['GL_Interpretation'] = nutrition['Glycemic Load'].apply(classify_gl)

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Glycemic Load', 'GL_Interpretation']].head())


# FEATURE 2: MUFA and PUFA categories

In [None]:
MUFA_description = nutrition['Fatty acids, total monounsaturated'].describe()

# Print the result
print(MUFA_description)


In [None]:
# Adjusted Thresholds
low_mufa_threshold = 0.474
medium_mufa_threshold = 4.4285
high_mufa_threshold = 10  

# Create a new column for MUFA category
nutrition['MUFA_Category'] = pd.cut(nutrition['Fatty acids, total monounsaturated'],
                                    bins=[-1, low_mufa_threshold, medium_mufa_threshold, float('inf')],
                                    labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Fatty acids, total monounsaturated', 'MUFA_Category']].head())


In [None]:
# now the same for PUFA

In [None]:
PUFA_description = nutrition['Fatty acids, total polyunsaturated'].describe()

# Print the result
print(PUFA_description)


In [None]:
# Adjusted Thresholds
low_pufa_threshold = 0.31575
medium_pufa_threshold = 2.543
high_pufa_threshold = 5 

# Create a new column for PUFA category
nutrition['PUFA_Category'] = pd.cut(nutrition['Fatty acids, total polyunsaturated'],
                                    bins=[-1, low_pufa_threshold, medium_pufa_threshold, float('inf')],
                                    labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Fatty acids, total polyunsaturated', 'PUFA_Category']].head())


In [None]:
nutrition

# FEATURE 3: PROTEIN (Albumin & Muscle Mass)

In [None]:
Protein_description = nutrition['Protein'].describe()

# Print the result
print(Protein_description)


In [None]:
# Adjusted Thresholds
low_protein_threshold = 2.17
medium_protein_threshold = 6.03
high_protein_threshold = 12  # You can adjust this threshold based on your criteria

# Create a new column for Protein category
nutrition['Protein_Category'] = pd.cut(nutrition['Protein'],
                                       bins=[-1, low_protein_threshold, medium_protein_threshold, float('inf')],
                                       labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Protein', 'Protein_Category']].head())


# FEATURE 4: Vitamin D (improves INS sensitivity)

In [None]:
VitaminD_description = nutrition['Vitamin D'].describe()

# Print the result
print(VitaminD_description)

In [None]:
# Adjusted Thresholds
low_vitaminD_threshold = 0
medium_vitaminD_threshold = 0.0000003
high_vitaminD_threshold = 0.0000027  # You can adjust this threshold based on your criteria

# Create a new column for Vitamin D category
nutrition['VitaminD_Category'] = pd.cut(nutrition['Vitamin D'],
                                        bins=[-1, low_vitaminD_threshold, medium_vitaminD_threshold, float('inf')],
                                        labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Vitamin D', 'VitaminD_Category']].head())


# FEATURE 5: Magnesium (improves INS sensitivity)

In [None]:
Mg_description = nutrition['Magnesium'].describe()

# Print the result
print(Mg_description)

In [None]:
# Adjusted Thresholds
low_magnesium_threshold = 0.012
medium_magnesium_threshold = 0.020
high_magnesium_threshold = 0.028

# Create a new column for Magnesium category
nutrition['Magnesium_Category'] = pd.cut(nutrition['Magnesium'],
                                         bins=[-1, low_magnesium_threshold, medium_magnesium_threshold, float('inf')],
                                         labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Magnesium', 'Magnesium_Category']].head())


# FEATURE 6: Zinc (Glycemic control and healthy lipid parameters)

In [None]:
Zn_description = nutrition['Zinc'].describe()

# Print the result
print(Zn_description)

In [None]:
# Adjusted Thresholds
low_zinc_threshold = 0.000320
medium_zinc_threshold = 0.000670
high_zinc_threshold = 0.001290

# Create a new column for Zinc category
nutrition['Zinc_Category'] = pd.cut(nutrition['Zinc'],
                                    bins=[-1, low_zinc_threshold, medium_zinc_threshold, float('inf')],
                                    labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Zinc', 'Zinc_Category']].head())


# FEATURE 7: Folate, total (decreases FBG/ improves IR)

In [None]:
Folate_description = nutrition['Folate, total'].describe()

# Print the result
print(Folate_description)

In [None]:
# Adjusted Thresholds
low_folate_threshold = 0.000007
medium_folate_threshold = 0.000020
high_folate_threshold = 0.000045

# Create a new column for Folate category
nutrition['Folate_Category'] = pd.cut(nutrition['Folate, total'],
                                      bins=[-1, low_folate_threshold, medium_folate_threshold, float('inf')],
                                      labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Folate, total', 'Folate_Category']].head())


# FEATURE 8: Energy (calorie intake)

In [None]:
Energy_description = nutrition['Energy'].describe()

# Print the result
print(Energy_description)

In [None]:
# Adjusted Thresholds
low_energy_threshold = 87
medium_energy_threshold = 165
high_energy_threshold = 272

# Create a new column for Energy category
nutrition['Energy_Category'] = pd.cut(nutrition['Energy'],
                                      bins=[-1, low_energy_threshold, medium_energy_threshold, float('inf')],
                                      labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Energy', 'Energy_Category']].head())


# FEATURE 9: Fiber, total dietary (improves blood glucose control)

In [None]:
Fiber_description = nutrition['Fiber, total dietary'].describe()

# Print the result
print(Fiber_description)

In [None]:
# Adjusted Thresholds
low_fiber_threshold = 0.2
medium_fiber_threshold = 1.1
high_fiber_threshold = 2.2

# Create a new column for Fiber category
nutrition['Fiber_Category'] = pd.cut(nutrition['Fiber, total dietary'],
                                     bins=[-1, low_fiber_threshold, medium_fiber_threshold, float('inf')],
                                     labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Fiber, total dietary', 'Fiber_Category']].head())


# FEATURE 10: DHA (Docosahexaenoic acid) (helps with insulin sensitivity)

In [None]:
DHA_description = nutrition['Docosahexaenoic acid'].describe()

# Print the result
print(DHA_description)

In [None]:
# Adjusted Thresholds
low_dha_threshold = 0.0001
medium_dha_threshold = 0.001
high_dha_threshold = 0.01

# Create a new column for DHA category
nutrition['DHA_Category'] = pd.cut(nutrition['Docosahexaenoic acid'],
                                   bins=[-1, low_dha_threshold, medium_dha_threshold, float('inf')],
                                   labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Docosahexaenoic acid', 'DHA_Category']].head())


# FEATURE 11: Fatty acids, total saturated 

In [None]:
sat_fats_description = nutrition['Fatty acids, total saturated'].describe()

# Print the result
print(sat_fats_description)

In [None]:
# Adjusted Thresholds
low_sat_fats_threshold = 1.0
medium_sat_fats_threshold = 2.0
high_sat_fats_threshold = 3.0

# Create a new column for saturated fats category
nutrition['Saturated_Fats_Category'] = pd.cut(nutrition['Fatty acids, total saturated'],
                                              bins=[-1, low_sat_fats_threshold, medium_sat_fats_threshold, float('inf')],
                                              labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Fatty acids, total saturated', 'Saturated_Fats_Category']].head())


# FEATURE 12: Potassium (deficient in T2DM patients)

In [None]:
potassium_description = nutrition['Potassium'].describe()

# Print the result
print(potassium_description)

In [None]:
# Define the bins and labels for categorization
bins = [0, 0.111, 0.177, float('inf')]
labels = ['Low', 'Medium', 'High']

# Create a new column 'Potassium_Category' based on the categorization
nutrition['Potassium_Category'] = pd.cut(nutrition['Potassium'], bins=bins, labels=labels)

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Potassium', 'Potassium_Category']].head())


# FEATURE 13: Vitamin B6 (deficient in T2DM patients)

In [None]:
Vitamin_B6_description = nutrition['Vitamin B-6'].describe()

# Print the result
print(Vitamin_B6_description)

In [None]:
# Define the bins and labels for categorization
bins = [0, 0.000052, 0.000105, float('inf')]
labels = ['Low', 'Medium', 'High']

# Create a new column 'Vitamin_B6_Category' based on the categorization
nutrition['Vitamin_B6_Category'] = pd.cut(nutrition['Vitamin B-6'], bins=bins, labels=labels)

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Vitamin B-6', 'Vitamin_B6_Category']].head())


# FEATURE 14: Calcium (deficient in T2DM patients)

In [None]:
Calcium_description = nutrition['Calcium'].describe()

# Print the result
print(Calcium_description)

In [None]:
# Define the bins and labels for categorization
bins = [0, 0.014, 0.034, float('inf')]
labels = ['Low', 'Medium', 'High']

# Create a new column 'Calcium_Category' based on the categorization
nutrition['Calcium_Category'] = pd.cut(nutrition['Calcium'], bins=bins, labels=labels)

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Calcium', 'Calcium_Category']].head())


# FEATURE 15: Retinol/Vitamin A (deficient in T2DM patients) (controls Oxidative Stress)

In [None]:
Retinol_description = nutrition['Retinol'].describe()

# Print the result
print(Retinol_description)

In [None]:
# Define the bins and labels for categorization
bins = [0, 0.000005, 0.000036, float('inf')]
labels = ['Low', 'Medium', 'High']

# Create a new column 'Retinol_Category' based on the categorization
nutrition['Retinol_Category'] = pd.cut(nutrition['Retinol'], bins=bins, labels=labels)

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Retinol', 'Retinol_Category']].head())


# FEATURE 16: Vitamin C (deficient in T2DM patients) (controls Oxidative Stress)

In [None]:
VitaminC_description = nutrition['Vitamin C'].describe()

# Print the result
print(VitaminC_description)

In [None]:
# Define the bins and labels for categorization
bins = [0, 0.0006, 0.0049, float('inf')]
labels = ['Low', 'Medium', 'High']

# Create a new column 'VitaminC_Category' based on the categorization
nutrition['VitaminC_Category'] = pd.cut(nutrition['Vitamin C'], bins=bins, labels=labels)

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Vitamin C', 'VitaminC_Category']].head())


# FEATURE 16: Vitamin E (deficient in T2DM patients) (controls Oxidative Stress)

In [None]:
VitaminE_description = nutrition['Vitamin E'].describe()

# Print the result
print(VitaminE_description)

In [None]:
# Define the bins and labels for categorization
bins = [0, 0.00054, 0.00119, float('inf')]
labels = ['Low', 'Medium', 'High']

# Create a new column 'VitaminE_Category' based on the categorization
nutrition['VitaminE_Category'] = pd.cut(nutrition['Vitamin E'], bins=bins, labels=labels)

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Vitamin E', 'VitaminE_Category']].head())


# FEATURE 17: Carotene, alpha (controls Oxidative Stress) Antioxidant

In [None]:
Carotene_alpha_description = nutrition['Carotene, alpha'].describe()

# Print the result
print(Carotene_alpha_description)

In [None]:
# Define the bins and labels for categorization
bins = [0, 0.0000001, 0.000001, float('inf')]
labels = ['Low', 'Medium', 'High']

# Create a new column 'Carotene_alpha_Category' based on the categorization
nutrition['Carotene_alpha_Category'] = pd.cut(nutrition['Carotene, alpha'], bins=bins, labels=labels)

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Carotene, alpha', 'Carotene_alpha_Category']].head())


# FEATURE 18: Carotene, beta (controls Oxidative Stress) Antioxidant

In [None]:
Carotene_beta_description = nutrition['Carotene, beta'].describe()

# Print the result
print(Carotene_beta_description)

In [None]:
# Define the bins and labels for categorization
bins = [0, 0.00001, 0.0001, float('inf')]
labels = ['Low', 'Medium', 'High']

# Create a new column 'Carotene_beta_Category' based on the categorization
nutrition['Carotene_beta_Category'] = pd.cut(nutrition['Carotene, beta'], bins=bins, labels=labels)

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Carotene, beta', 'Carotene_beta_Category']].head())


# FEATURE 19: Folic acid (controls fasting blood glucose) 

In [None]:
Folic_description = nutrition['Folic acid'].describe()

# Print the result
print(Folic_description)

In [None]:
# Define the bins and labels for categorization
bins = [0, 0.000001, 0.00001, float('inf')]
labels = ['Low', 'Medium', 'High']

# Create a new column 'Folic_Category' based on the categorization
nutrition['Folic_Category'] = pd.cut(nutrition['Folic acid'], bins=bins, labels=labels)

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Folic acid', 'Folic_Category']].head())

# FEATURE 20: Cholesterol 

In [None]:
Cholesterol_description = nutrition['Cholesterol'].describe()

# Print the result
print(Cholesterol_description)

In [None]:
# Create a new column for Cholesterol classification
nutrition['Cholesterol_Category'] = pd.cut(nutrition['Cholesterol'],
                                           bins=[-1, 0.007, 0.04025, float('inf')],
                                           labels=['Low', 'Moderate', 'High'])

# Display the first few rows of the DataFrame to verify the changes
print(nutrition[['Cholesterol', 'Cholesterol_Category']].head())


In [None]:
nutrition.info()

In [None]:
nutrition

In [None]:
nutrition.columns
