# Python Requirements
**Python Version: 3.9**

**Python Libraries**
| **Package**                  | **Version**  |
|------------------------------|--------------|
| matplotlib                   | 3.9.2        |
| numpy                        | 1.24.3       |
| pandas                       | 2.2.3        |
| pip                          | 24.3.1       |
| seaborn                      | 0.13.2       |
| scipy                        | 1.10.1       |
| nltk                         | 3.7          |
| wordcloud                    | 1.8.2        |
| scikit-learn                 | 1.2.2        |
| regex                        | 2023.9.0     |


# Import Libraries & Load the Nutrition Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from wordcloud import WordCloud

# Download required NLTK data
nltk.download('punkt')

# Load the dataset. Forward slashes are used so the relative path resolves on
# Windows, Linux and macOS alike; the previous backslash-separated path was
# only valid on Windows.
df = pd.read_csv('../data/combine-dataset.csv')
df

# Data Cleaning 

In [None]:
# Nutrient columns that are filtered, in the same order as the limits below
nutritional_columns = [
    'Calories', 'Fat(g)', 'SaturatedFat(g)', 'Cholesterol(mg)', 'Sodium(mg)',
    'Carbohydrate(g)', 'Fiber(g)', 'Sugar(g)', 'Protein(g)'
]

# Maximum allowed value for each nutrient
max_Calories = 2650
max_daily_fat = 100
max_daily_Saturatedfat = 13
max_daily_Cholesterol = 300
max_daily_Sodium = 2300
max_daily_Carbohydrate = 325
max_daily_Fiber = 40
max_daily_Sugar = 40
max_daily_Protein = 200

# Limits listed position-for-position against `nutritional_columns`
max_list = [
    max_Calories,
    max_daily_fat,
    max_daily_Saturatedfat,
    max_daily_Cholesterol,
    max_daily_Sodium,
    max_daily_Carbohydrate,
    max_daily_Fiber,
    max_daily_Sugar,
    max_daily_Protein
]

# Keep only the rows that are strictly below the limit in every nutrient
# column that is present; columns missing from the dataset are skipped.
# Rows with a missing value in a filtered column fail the comparison and
# are therefore dropped as well.
df = df.copy()
for column, maximum in zip(nutritional_columns, max_list):
    if column not in df.columns:
        continue
    below_limit = df[column] < maximum
    df = df.loc[below_limit]

In [None]:
# Keep a random 50% sample of the rows; the fixed seed makes it reproducible
sample_fraction = 0.5
df = df.sample(frac=sample_fraction, random_state=42)

In [None]:
# Basic info of the dataset (df.info() prints its report directly)
df.info()

# Missing values per column. Printed explicitly: a notebook cell only
# auto-displays the value of its LAST expression, so a bare
# `df.isnull().sum()` in the middle of the cell was silently discarded.
print(df.isnull().sum())

# Basic statistics of the numeric columns (last expression -> displayed)
df.describe()

## Visualize Missing Values

### Check for Missing Values

In [None]:
# Number of missing entries per column, then only the columns that have any
df_missing_values = df.isna().sum()
has_missing = df_missing_values > 0
df_missing_columns = df_missing_values[has_missing]

# Report how many columns are affected, followed by their missing counts
print("Terdapat Data Yang Hilang Sebanyak ", df_missing_columns.count(), "Kolom, Dengan Nama Kolom : ")
print(df_missing_columns)

# Heatmap of the null mask: one row per record, one column per field
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.isnull(), cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

## Distribution of Key Nutrients (Histogram)

In [None]:
# Key nutrient columns for distribution analysis (also reused by later cells)
selected_columns = ['Calories', 'Protein(g)', 'Fat(g)', 'SaturatedFat(g)',
                    'Cholesterol(mg)', 'Sodium(mg)', 'Carbohydrate(g)', 'Fiber(g)', 'Sugar(g)']

# Grid layout: 4 plots per row. Ceiling division gives exactly the number of
# rows needed; the previous `len // 4 + 1` produced an extra empty row
# whenever the number of columns was a multiple of 4.
plots_per_row = 4
rows = (len(selected_columns) + plots_per_row - 1) // plots_per_row

plt.figure(figsize=(18, 4 * rows))

# One histogram with a KDE overlay per nutrient; missing values are dropped
for i, col in enumerate(selected_columns, 1):
    plt.subplot(rows, plots_per_row, i)
    sns.histplot(df[col].dropna(), kde=True)
    plt.title(f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Count")

# Adjust the layout once, after all subplots exist, instead of on every iteration
plt.tight_layout()
plt.show()

## Boxplot to Detect Outliers

In [None]:
# Boxplots to detect outliers, one per nutrient in `selected_columns`.
# Uses the same 4-per-row grid (`rows`) computed for the histogram cell.
plt.figure(figsize=(18, 4 * rows))

for i, col in enumerate(selected_columns, 1):
    plt.subplot(rows, 4, i)
    sns.boxplot(x=df[col].dropna())
    plt.title(f"Boxplot of {col}")
    plt.xlabel(col)

# Adjust the layout once, after all subplots exist, instead of on every iteration
plt.tight_layout()
plt.show()


## Correlation Heatmap

In [None]:
# Pairwise correlation between the selected nutrient columns
correlation_matrix = df[selected_columns].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap of Nutritional Values")
plt.show()

## Comparison of Macronutrients

In [None]:
# Columns compared side by side in a single boxplot
macronutrients = ['Carbohydrate(g)', 'Protein(g)', 'Fat(g)']

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df[macronutrients], ax=ax)
ax.set_title("Comparison of Macronutrient Content")
ax.set_ylabel("Grams per Serving")
plt.show()

## Top High-Protein Foods

In [None]:
# Sort by protein content and keep the 10 highest-protein foods
top_protein = df[['Name', 'Protein(g)']].sort_values(by='Protein(g)', ascending=False).head(10)

plt.figure(figsize=(10, 6))
# seaborn 0.13 deprecates passing `palette` without `hue`; assigning the
# y variable to `hue` and disabling the legend keeps the same per-bar colours.
sns.barplot(x='Protein(g)', y='Name', hue='Name', data=top_protein,
            palette='viridis', legend=False)
plt.title("Top 10 Foods with Highest Protein Content")
plt.xlabel("Protein(g)")
plt.ylabel("Food Name")
plt.show()

## Top Low-Calorie Foods

In [None]:
# Sort by calories and keep the 10 lowest-calorie foods
low_calories = df[['Name', 'Calories']].sort_values(by='Calories', ascending=True).head(10)

plt.figure(figsize=(10, 6))
# seaborn 0.13 deprecates passing `palette` without `hue`; assigning the
# y variable to `hue` and disabling the legend keeps the same per-bar colours.
sns.barplot(x='Calories', y='Name', hue='Name', data=low_calories,
            palette='Blues', legend=False)
plt.title("Top 10 Foods with Lowest Calories")
plt.xlabel("Calories")
plt.ylabel("Food Name")
plt.show()

## Visualize distribution of calories

In [None]:
# Histogram of the 'Calories' column (30 bins) with a KDE curve on top
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Calories'], kde=True, bins=30, color='skyblue', ax=ax)
ax.set_title('Distribution of Calories')
ax.set_xlabel('Calories')
ax.set_ylabel('Frequency')
plt.show()

## Top 10 foods with the highest calories

In [None]:
# Rank foods by calories (highest first) and keep the first 10 rows
calorie_ranking = df[['Name', 'Calories']].sort_values(by='Calories', ascending=False)
top_calories = calorie_ranking.head(10)

print("\nTop 10 Foods with the Highest Calories:")
print(top_calories)

## Top 10 foods with the highest protein content

In [None]:
# Rank foods by protein content (highest first) and keep the first 10 rows
protein_ranking = df[['Name', 'Protein(g)']].sort_values(by='Protein(g)', ascending=False)
top_protein = protein_ranking.head(10)

print("\nTop 10 Foods with the Highest Protein Content:")
print(top_protein)

## Visualizing nutrient content

In [None]:
# Nutrient columns drawn as a grid of histograms (20 bins each)
nutrients = [
    'Fat(g)',
    'SaturatedFat(g)',
    'Cholesterol(mg)',
    'Sodium(mg)',
    'Carbohydrate(g)',
    'Protein(g)',
]
nutrient_frame = df[nutrients]
nutrient_frame.hist(bins=20, figsize=(15, 10))
plt.suptitle('Nutrient Content Distribution', fontsize=20)
plt.show()

## Pairplot to analyze relationships between various nutrients

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for calories, the three macronutrients and fiber
pairplot_columns = ['Calories', 'Protein(g)', 'Fat(g)', 'Carbohydrate(g)', 'Fiber(g)']
pairplot_data = df[pairplot_columns]
sns.pairplot(pairplot_data)
plt.show()

## Scatter plot of calories vs protein content

In [None]:
# Scatter plot of calories (x) against protein content (y), one point per row
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Calories'], df['Protein(g)'], alpha=0.6, color='purple')
ax.set_title('Calories vs. Protein Content')
ax.set_xlabel('Calories')
ax.set_ylabel('Protein(g)')
ax.grid(True)
plt.show()

## Stacked Bar Chart: Proportion of Nutrients in Recipes

In [None]:
# Select key columns for macronutrient analysis
macronutrient_cols = ['Name', 'Carbohydrate(g)', 'Protein(g)', 'Fat(g)']

# Keep only the relevant columns and drop rows with any missing value
df_macronutrients = df[macronutrient_cols].dropna()

# Express each macronutrient as a percentage of the three-way total.
# Rows whose total is 0 yield NaN percentages (0 / 0).
df_macronutrients['total'] = df_macronutrients[['Carbohydrate(g)', 'Protein(g)', 'Fat(g)']].sum(axis=1)
df_macronutrients['carbohydrate (%)'] = (df_macronutrients['Carbohydrate(g)'] / df_macronutrients['total']) * 100
df_macronutrients['protein (%)'] = (df_macronutrients['Protein(g)'] / df_macronutrients['total']) * 100
df_macronutrients['fat (%)'] = (df_macronutrients['Fat(g)'] / df_macronutrients['total']) * 100

# Sort by total macronutrient content and take the top 100 for visualization
# (the old comment and variable name said "top 10" while the code took 100).
TOP_N_FOODS = 100
top_foods = df_macronutrients.sort_values(by='total', ascending=False).head(TOP_N_FOODS)
top_10 = top_foods  # legacy name kept so cells that still reference it keep working

# Use positional x coordinates rather than the food names: with names on a
# categorical axis, recipes sharing the same name are drawn at the same x
# position and their stacked bars overwrite each other.
positions = np.arange(len(top_foods))
carb_pct = top_foods['carbohydrate (%)'].to_numpy()
protein_pct = top_foods['protein (%)'].to_numpy()
fat_pct = top_foods['fat (%)'].to_numpy()

# Stacked bars: carbohydrates at the bottom, then protein, then fat
plt.figure(figsize=(12, 8))
plt.bar(positions, carb_pct, label='Carbohydrates', color='skyblue')
plt.bar(positions, protein_pct, bottom=carb_pct, label='Protein', color='lightgreen')
plt.bar(positions, fat_pct, bottom=carb_pct + protein_pct, label='Fat', color='salmon')

plt.title(f"Proportion of Macronutrients in Top {TOP_N_FOODS} Foods")
plt.xlabel("Food Items")
plt.ylabel("Proportion (%)")
plt.xticks(positions, top_foods['Name'].astype(str).tolist(), rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()

## Average Nutrient Composition Across All Recipes

In [None]:
# Overall share of each macronutrient across the whole dataset.
# NOTE: despite the `avg_` prefix these variables hold column SUMS; the
# resulting percentages are each sum divided by the combined sum.
avg_carbs, avg_protein, avg_fat = (
    df_macronutrients[column_name].sum()
    for column_name in ('Carbohydrate(g)', 'Protein(g)', 'Fat(g)')
)
total = avg_carbs + avg_protein + avg_fat

# Percentage contributed by each macronutrient
macronutrient_distribution = {
    label: (amount / total) * 100
    for label, amount in zip(
        ('Carbohydrates', 'Protein', 'Fat'),
        (avg_carbs, avg_protein, avg_fat),
    )
}

# Pie chart of the three shares
slice_labels = list(macronutrient_distribution.keys())
slice_sizes = list(macronutrient_distribution.values())
plt.figure(figsize=(8, 8))
plt.pie(slice_sizes, labels=slice_labels,
        autopct='%1.1f%%', colors=['skyblue', 'lightgreen', 'salmon'])
plt.title("Average Nutrient Composition Across All Recipes")
plt.show()

## WordCloud for food names

In [None]:
# Make sure every value in the 'Name' column is a string (missing -> '')
df['Name'] = df['Name'].fillna('').astype(str)

# Concatenate all food names into one space-separated string
all_names = df['Name'].tolist()
text = " ".join(all_names)

# Build the word cloud from the combined text
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

# Render the word cloud as an image without axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('WordCloud of Food Names')
plt.show()

In [None]:
# 1. Count how often each food name occurs (most frequent first)
food_counts = df['Name'].value_counts()

# 2. Keep the 100 most frequent food names
top_food_names = food_counts.head(100)

# 3. Join the names into one text; each name appears once, so its
#    frequency in the dataset does not weight the word cloud
unique_top_names = list(top_food_names.index)
text = " ".join(unique_top_names)

# 4. Generate the word cloud
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

# 5. Render the word cloud, then print the name/count table
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('WordCloud of Unique Trending Food Names')
plt.show()
print(top_food_names.head(100))

In [None]:
# import pandas as pd
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt

# # Contoh daftar kata yang dianggap makanan atau bahan makanan (bisa ditambah atau diimpor dari file)
# food_keywords = [
#     # Buah-buahan
#     "Apple", "Banana", "Orange", "Mango", "Strawberry", "Pineapple", "Watermelon", "Papaya", 
#     "Grape", "Blueberry", "Blackberry", "Raspberry", "Cherry", "Kiwi", "Lemon", "Lime", 
#     "Peach", "Pear", "Plum", "Pomegranate", "Avocado", "Coconut", "Guava", "Fig", "Dates", 
#     "Lychee", "Dragonfruit", "Persimmon", "Mulberry", "Cranberry", "Cantaloupe", "Honeydew", 
#     "Starfruit", "Durian", "Jackfruit", "Passionfruit", "Rambutan", "Longan", "Soursop", 
#     "Gooseberry", "Mangosteen", "Sugar Apple", "Salak", "Elderberry", "Quince", "Tamarind", 
#     "Sapodilla", "Bael", "Breadfruit", "Ackee", "Jujube", "Medlar", "Currant", "Acerola", 
#     "Loquat", "Boysenberry",

#     # Sayuran
#     "Carrot", "Potato", "Tomato", "Onion", "Garlic", "Spinach", "Lettuce", "Kale", "Broccoli", 
#     "Cauliflower", "Cucumber", "Zucchini", "Eggplant", "Bell Pepper", "Chili Pepper", "Beet", 
#     "Radish", "Turnip", "Sweet Potato", "Pumpkin", "Squash", "Celery", "Asparagus", "Green Bean", 
#     "Pea", "Okra", "Cabbage", "Bok Choy", "Brussels Sprouts", "Artichoke", "Leek", "Fennel", 
#     "Shallot", "Ginger", "Turmeric", "Collard Greens", "Mustard Greens", "Swiss Chard", 
#     "Watercress", "Arugula", "Taro", "Yam", "Bamboo Shoot", "Corn", "Parsley", "Cilantro", 
#     "Basil", "Thyme", "Rosemary", "Oregano", "Mint", "Chives", "Dill",

#     # Hewan Ternak
#     "Chicken", "Duck", "Turkey", "Goose", "Quail", "Rabbit", "Cow", "Pig", "Sheep", "Goat", 
#     "Buffalo", "Camel", "Yak", "Horse", "Deer", "Elk", "Moose", "Reindeer", "Kangaroo", 
#     "Ostrich", "Emu", "Pheasant", "Partridge", "Dove", "Pigeon", "Crab", "Lobster", "Shrimp", 
#     "Fish", "Tuna", "Salmon", "Cod", "Tilapia", "Catfish", "Sardine", "Anchovy", "Mackerel", 
#     "Trout", "Bass", "Eel", "Squid", "Octopus", "Clam", "Oyster", "Mussel", "Scallop"
# ]


# # Filter nama makanan yang relevan
# def filter_food_names(names, keywords):
#     filtered = []
#     for name in names:
#         words = name.lower().split()  # Split nama menjadi kata-kata kecil
#         if any(word in keywords for word in words):
#             filtered.append(name)
#     return filtered

# # 1. Hitung frekuensi nama makanan
# food_counts = df['Name'].value_counts()

# # 2. Ambil top N nama makanan yang paling sering muncul (misalnya, top 100)
# top_food_names = food_counts.head(100)

# # Pastikan kolom 'Name' bersih
# top_food_names['Name'] = df['Name'].fillna('').astype(str)

# # Ambil nama makanan unik
# unique_food_names = df['Name'].unique()

# # Filter hanya nama yang berisi kata dari food_keywords
# filtered_names = filter_food_names(unique_food_names, food_keywords)

# # Gabungkan nama makanan yang sudah difilter untuk WordCloud
# text = " ".join(filtered_names)

# # Buat WordCloud
# wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# # Visualisasikan WordCloud
# plt.figure(figsize=(10, 6))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.title('WordCloud of Filtered Food Names')
# plt.show()


In [28]:
# Food / ingredient keywords to look for in the recipe names. The literal
# repeats a few entries (e.g. "Parsley", "Lobster", "Pheasant"), so it is
# de-duplicated while keeping first-seen order; this avoids scanning the
# 'Name' column more than once for the same keyword.
food_keywords = list(dict.fromkeys([
    # Fruits
    "Apple", "Banana", "Orange", "Mango", "Strawberry", "Pineapple", "Watermelon", "Papaya",
    "Grape", "Blueberry", "Blackberry", "Raspberry", "Cherry", "Kiwi", "Lemon", "Lime",
    "Peach", "Pear", "Plum", "Pomegranate", "Avocado", "Coconut", "Guava", "Fig", "Dates",
    "Lychee", "Dragonfruit", "Persimmon", "Mulberry", "Cranberry", "Cantaloupe", "Honeydew",
    "Starfruit", "Durian", "Jackfruit", "Passionfruit", "Rambutan", "Longan", "Soursop",
    "Gooseberry", "Mangosteen", "Sugar Apple", "Salak", "Elderberry", "Quince", "Tamarind",
    "Sapodilla", "Bael", "Breadfruit", "Ackee", "Jujube", "Medlar", "Currant", "Acerola",
    "Loquat", "Boysenberry", "Ugli Fruit",

    # Vegetables and herbs
    "Carrot", "Potato", "Tomato", "Onion", "Garlic", "Spinach", "Lettuce", "Kale", "Broccoli",
    "Cauliflower", "Cucumber", "Zucchini", "Eggplant", "Bell Pepper", "Chili Pepper", "Beet",
    "Radish", "Turnip", "Sweet Potato", "Pumpkin", "Squash", "Celery", "Asparagus", "Green Bean",
    "Pea", "Okra", "Cabbage", "Bok Choy", "Brussels Sprouts", "Artichoke", "Leek", "Fennel",
    "Shallot", "Ginger", "Turmeric", "Collard Greens", "Mustard Greens", "Swiss Chard",
    "Watercress", "Arugula", "Taro", "Yam", "Bamboo Shoot", "Corn", "Parsley", "Cilantro",
    "Basil", "Thyme", "Rosemary", "Oregano", "Mint", "Chives", "Dill", "Parsley", "Cilantro",
    "Basil", "Thyme", "Rosemary", "Sage", "Oregano", "Tarragon", "Marjoram", "Chives", "Curry Leaf",
    "Bay Leaf",

    # Livestock, game and seafood
    "Chicken", "Duck", "Turkey", "Goose", "Quail", "Rabbit", "Cow", "Pig", "Sheep", "Goat",
    "Buffalo", "Camel", "Yak", "Horse", "Deer", "Elk", "Moose", "Reindeer", "Kangaroo",
    "Ostrich", "Emu", "Pheasant", "Partridge", "Dove", "Pigeon", "Crab", "Lobster", "Shrimp",
    "Fish", "Tuna", "Salmon", "Cod", "Tilapia", "Catfish", "Sardine", "Anchovy", "Mackerel",
    "Trout", "Bass", "Eel", "Squid", "Octopus", "Clam", "Oyster", "Mussel", "Scallop", "Sea Urchin",
    "Cuttlefish", "Lobster", "Hare", "Guinea Pig", "Llama", "Alpaca", "Bison", "Crocodile", "Alligator",
    "Wild Boar", "Pheasant", "Grouse", "Peafowl", "Capon", "Silkie Chicken", "Snake", "Turtle",
    "Pangolin", "Armadillo"
]))

# Number of rows whose 'Name' contains each keyword.
# NOTE: this is a case-insensitive SUBSTRING match, so e.g. "Apple" also
# counts names containing "Pineapple".
match_counts = {}
for keyword in food_keywords:
    # regex=False: treat the keyword as literal text, not as a regular expression
    result = df['Name'].str.contains(keyword, na=False, case=False, regex=False)
    # Summing the boolean mask counts the matches without building a filtered copy
    match_counts[keyword] = int(result.sum())

# Show only the keywords that matched at least one row
for keyword, count in match_counts.items():
    if count > 0:
        print(f"{keyword}: {count} entries")

# Rows whose 'Name' contains at least one keyword. A single escaped alternation
# pattern is matched case-insensitively so this filter agrees with the counts
# above (the previous `k in x` test was case-sensitive and therefore missed
# rows that the counts had included).
keyword_pattern = "|".join(re.escape(keyword) for keyword in food_keywords)
matched_names_df = df[df['Name'].str.contains(keyword_pattern, case=False, na=False)]
print("\nFiltered DataFrame with Matches:")
print(matched_names_df)

Apple: 5464 entries
Banana: 2909 entries
Orange: 2551 entries
Mango: 823 entries
Strawberry: 1716 entries
Pineapple: 1276 entries
Watermelon: 255 entries
Papaya: 104 entries
Grape: 441 entries
Blueberry: 1221 entries
Blackberry: 186 entries
Raspberry: 973 entries
Cherry: 1310 entries
Kiwi: 133 entries
Lemon: 4102 entries
Lime: 1200 entries
Peach: 1107 entries
Pear: 794 entries
Plum: 242 entries
Pomegranate: 206 entries
Avocado: 1056 entries
Coconut: 1245 entries
Guava: 36 entries
Fig: 273 entries
Dates: 69 entries
Lychee: 30 entries
Dragonfruit: 1 entries
Persimmon: 41 entries
Mulberry: 1 entries
Cranberry: 1593 entries
Cantaloupe: 73 entries
Honeydew: 37 entries
Starfruit: 2 entries
Durian: 2 entries
Jackfruit: 5 entries
Passionfruit: 30 entries
Rambutan: 1 entries
Longan: 1 entries
Soursop: 3 entries
Gooseberry: 24 entries
Sugar Apple: 6 entries
Elderberry: 6 entries
Quince: 11 entries
Tamarind: 55 entries
Breadfruit: 9 entries
Jujube: 2 entries
Currant: 95 entries
Acerola: 1 entries