# Library & Load Dataset Nutrition

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Load dataset
df = pd.read_csv(r'..\data\nutrition.csv')
df


Unnamed: 0.1,Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8784,8784,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",100 g,125,3.5g,1.4g,62mg,54.00 mg,64.5 mg,4.00 mcg,...,3.50 g,1.353 g,1.554 g,0.244 g,62.00 mg,0.0 g,1.11 g,0.00 mg,0.00 mg,72.51 g
8785,8785,"Lamb, cooked, separable lean only, composite o...",100 g,206,8.9g,3.9g,109mg,50.00 mg,0,0.00 mcg,...,8.86 g,3.860 g,3.480 g,0.520 g,109.00 mg,0,1.60 g,0,0,59.95 g
8786,8786,"Lamb, raw, separable lean and fat, composite o...",100 g,277,23g,12g,78mg,39.00 mg,0,1.00 mcg,...,22.74 g,11.570 g,8.720 g,0.980 g,78.00 mg,0,0.92 g,0,0,59.80 g
8787,8787,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",100 g,121,3g,1.1g,60mg,53.00 mg,64.2 mg,4.00 mcg,...,3.04 g,1.086 g,1.266 g,0.233 g,60.00 mg,0.0 g,1.10 g,0.00 mg,0.00 mg,73.43 g


# Data Cleaning 

## Ganti Isi Data Kolom yang ada string dipindahkan ke Kolom Header seperti g, mg, dll

In [3]:
# Fungsi untuk memisahkan angka dan satuan, lalu hanya menyimpan angka
def extract_number(value):
    if pd.isnull(value):  # Jika value kosong, kembalikan None
        return None, None
    # Regex untuk memisahkan angka dan satuan
    match = re.match(r"([\d\.]+)\s*([a-zA-Z]*)", str(value))
    if match:
        number = match.group(1)
        unit = match.group(2)
        return (float(number), unit) if number else (None, None)
    return (None, None)

# Mendeteksi kolom yang memerlukan pembersihan
columns_to_process = []
units_found = {}

# Menentukan kolom mana yang perlu diproses berdasarkan isinya
for col in df.columns:
    has_string = False
    unit_set = set()
    
    for value in df[col].dropna():
        num, unit = extract_number(value)
        if unit:  # Jika menemukan satuan, tandai kolom ini perlu diproses
            has_string = True
            unit_set.add(unit)
    
    # Jika kolom mengandung satuan, tambahkan ke daftar untuk diproses
    if has_string and len(unit_set) == 1:  # Pastikan hanya ada satu jenis satuan
        columns_to_process.append(col)
        units_found[col] = unit_set.pop()

# Memproses kolom yang terdeteksi
for col in columns_to_process:
    unit = units_found[col]
    # Bersihkan data dan ubah menjadi angka saja, pastikan penanganan None
    df[col] = df[col].apply(lambda x: extract_number(x)[0] if extract_number(x)[0] is not None else x)
    # Ubah nama kolom dengan menambahkan satuan
    df.rename(columns={col: f"{col} ({unit})"}, inplace=True)

# Menampilkan hasil akhir
print("Kolom yang diproses:", columns_to_process)
print("\n\n\nKolom Sekarang:",list(df.columns))
df

Kolom yang diproses: []



Kolom Sekarang: ['Unnamed: 0', 'name', 'serving_size (g)', 'calories', 'total_fat (g)', 'saturated_fat (g)', 'cholesterol (mg)', 'sodium (mg)', 'choline (mg)', 'folate (mcg)', 'folic_acid (mcg)', 'niacin (mg)', 'pantothenic_acid (mg)', 'riboflavin (mg)', 'thiamin (mg)', 'vitamin_a', 'vitamin_a_rae (mcg)', 'carotene_alpha (mcg)', 'carotene_beta (mcg)', 'cryptoxanthin_beta (mcg)', 'lutein_zeaxanthin (mcg)', 'lucopene', 'vitamin_b12 (mcg)', 'vitamin_b6 (mg)', 'vitamin_c (mg)', 'vitamin_d (IU)', 'vitamin_e (mg)', 'tocopherol_alpha (mg)', 'vitamin_k (mcg)', 'calcium (mg)', 'copper (mg)', 'irom (mg)', 'magnesium (mg)', 'manganese (mg)', 'phosphorous (mg)', 'potassium (mg)', 'selenium (mcg)', 'zink (mg)', 'protein (g)', 'alanine (g)', 'arginine (g)', 'aspartic_acid (g)', 'cystine (g)', 'glutamic_acid (g)', 'glycine (g)', 'histidine (g)', 'hydroxyproline (g)', 'isoleucine (g)', 'leucine (g)', 'lysine (g)', 'methionine (g)', 'phenylalanine (g)', 'proline (g)', 'serine

Unnamed: 0.1,Unnamed: 0,name,serving_size (g),calories,total_fat (g),saturated_fat (g),cholesterol (mg),sodium (mg),choline (mg),folate (mcg),...,fat (g),saturated_fatty_acids (g),monounsaturated_fatty_acids (g),polyunsaturated_fatty_acids (g),fatty_acids_total_trans (mg),alcohol (g),ash (g),caffeine (mg),theobromine (mg),water (g)
0,0,Cornstarch,100.0,381,0.1,,0.0,9.0,0.4,0.0,...,0.05,0.009,0.016,0.025,0.0,0.0,0.09,0.0,0.0,8.32
1,1,"Nuts, pecans",100.0,691,72.0,6.2,0.0,0.0,40.5,22.0,...,71.97,6.180,40.801,21.614,0.0,0.0,1.49,0.0,0.0,3.52
2,2,"Eggplant, raw",100.0,25,0.2,,0.0,2.0,6.9,22.0,...,0.18,0.034,0.016,0.076,0.0,0.0,0.66,0.0,0.0,92.30
3,3,"Teff, uncooked",100.0,367,2.4,0.4,0.0,12.0,13.1,0.0,...,2.38,0.449,0.589,1.071,0.0,0.0,2.37,0.0,0.0,8.82
4,4,"Sherbet, orange",100.0,144,2.0,1.2,1.0,46.0,7.7,4.0,...,2.00,1.160,0.530,0.080,1.0,0.0,0.40,0.0,0.0,66.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8784,8784,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",100.0,125,3.5,1.4,62.0,54.0,64.5,4.0,...,3.50,1.353,1.554,0.244,62.0,0.0,1.11,0.0,0.0,72.51
8785,8785,"Lamb, cooked, separable lean only, composite o...",100.0,206,8.9,3.9,109.0,50.0,0.0,0.0,...,8.86,3.860,3.480,0.520,109.0,0.0,1.60,0.0,0.0,59.95
8786,8786,"Lamb, raw, separable lean and fat, composite o...",100.0,277,23.0,12.0,78.0,39.0,0.0,1.0,...,22.74,11.570,8.720,0.980,78.0,0.0,0.92,0.0,0.0,59.80
8787,8787,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",100.0,121,3.0,1.1,60.0,53.0,64.2,4.0,...,3.04,1.086,1.266,0.233,60.0,0.0,1.10,0.0,0.0,73.43


In [3]:
# Menghilangkan kolom yang tidak relevan (bisa disesuaikan)
columns_to_keep = ['name', 'calories', 'protein (g)', 'total_fat (g)', 'serving_size (g)']
df = df[columns_to_keep]
df

Unnamed: 0,name,calories,protein (g),total_fat (g),serving_size (g)
0,Cornstarch,381,0.26,0.1,100.0
1,"Nuts, pecans",691,9.17,72.0,100.0
2,"Eggplant, raw",25,0.98,0.2,100.0
3,"Teff, uncooked",367,13.30,2.4,100.0
4,"Sherbet, orange",144,1.10,2.0,100.0
...,...,...,...,...,...
8784,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",125,23.45,3.5,100.0
8785,"Lamb, cooked, separable lean only, composite o...",206,29.59,8.9,100.0
8786,"Lamb, raw, separable lean and fat, composite o...",277,16.74,23.0,100.0
8787,"Beef, raw, all grades, trimmed to 0"" fat, sepa...",121,23.37,3.0,100.0


In [17]:
# Mengecek missing values
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values)

Missing values:
 name                0
calories            0
protein (g)         0
total_fat (g)       0
serving_size (g)    0
dtype: int64


In [18]:
# Menambahkan kolom BMI (Body Mass Index)
def calculate_bmi(weight_kg, height_cm):
    height_m = height_cm / 100
    return weight_kg / (height_m ** 2)

# Contoh data pengguna (bisa diambil dari input)
user_weight = 70  # berat badan dalam kg
user_height = 170  # tinggi badan dalam cm

user_bmi = calculate_bmi(user_weight, user_height)
print(f"BMI Anda: {user_bmi:.2f}")

# Membuat kategori berdasarkan BMI
def diet_category(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 24.9:
        return 'Healthy'
    elif 25 <= bmi < 29.9:
        return 'Overweight'
    else:
        return 'Obese'

user_category = diet_category(user_bmi)
print(f"Kategori diet Anda: {user_category}")

BMI Anda: 24.22
Kategori diet Anda: Healthy


In [19]:
# Memisahkan fitur dan label
X = df[['calories', 'protein (g)', 'total_fat (g)']]
y = df['name']

# Normalisasi fitur
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Membangun model KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Evaluasi model
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.00
Classification Report:
                                                                                                                                      precision    recall  f1-score   support

                                                                                             ANDREA'S, Gluten Free Soft Dinner Roll       0.00      0.00      0.00       0.0
                                                                                               APPLEBEE'S, 9 oz house sirloin steak       0.00      0.00      0.00       0.0
                                                                                                   APPLEBEE'S, Double Crunch Shrimp       0.00      0.00      0.00       0.0
                                                                                                APPLEBEE'S, chicken tenders platter       0.00      0.00      0.00       0.0
                                                                                               

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# 2. Drop kolom yang terlalu banyak missing values
threshold = 0.4  # Drop kolom jika lebih dari 40% data hilang
df = df.dropna(axis=1, thresh=int((1-threshold) * len(df)))

# 3. Mengisi missing values dengan median
df.fillna(df.median(), inplace=True)

# 4. Memilih fitur yang relevan untuk model
selected_features = [
    'calories', 'protein (g)', 'total_fat (g)', 'carbohydrate (g)', 'fiber (g)', 'sugars (g)',
    'serving_size'
]
df = df[selected_features]

# 5. Normalisasi fitur
scaler = StandardScaler()
df[selected_features] = scaler.fit_transform(df[selected_features])

print("\nDataset after preprocessing:\n", df.head())

TypeError: Cannot convert [['Cornstarch' 'Nuts, pecans' 'Eggplant, raw' ...
  'Lamb, raw, separable lean and fat, composite of trimmed retail cuts, frozen, imported, New Zealand'
  'Beef, raw, all grades, trimmed to 0" fat, separable lean only, boneless, eye of round roast, round'
  'Beef, raw, all grades, trimmed to 0" fat, separable lean only, boneless, eye of round steak, round']] to numeric