In [1]:
import pandas as pd
import requests
import json
import math

# Custom User-Agent header
headers = {'User-Agent': "Dash Application German Food - Version 1.0 - http://127.0.0.1:8050/"}

# List of categories to fetch
categories = ['snacks']

# Base URL for API requests
base_url = "https://world.openfoodfacts.org/category/{}.json"
fields = "code,product_name,brands,brands_tags,quantity,serving_size,categories,categories_tags,countries,countries_tags,origins,origins_tags,ingredients_text,no_nutrition_data,energy-kj_100g,energy-kcal_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,vitamin-d_100g,vitamin-c_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,vitamin-b12_100g,pantothenic-acid_100g,potassium_100g,calcium_100g,iron_100g,caffeine_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,nutriscore_score,nutriscore_grade,nova_group"

# Number of results per page
page_size = 100

# Desired number of total results
total_results = 3000

# Calculate the total number of pages needed
total_pages = math.ceil(total_results / page_size)

# Function to fetch data for a specific category and page
def fetch_data_for_page(category, page):
    url = f"{base_url.format(category)}?fields={fields}&page_size={page_size}&page={page}"
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        data_json = json.loads(response.text)
        if 'products' in data_json and data_json['products']:
            data_products = data_json['products']
            df = pd.json_normalize(data_products, sep='_')
            df['category'] = category
            return df
        else:
            return None
    else:
        print(f"Failed to retrieve data for category: {category}, page: {page}, Status Code: {response.status_code}")
        return None

# Fetch data for the specified number of pages
dataframes = []

for category in categories:
    for page in range(1, total_pages + 1):
        df = fetch_data_for_page(category, page)
        if df is not None:
            dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
if dataframes:
    final_df = pd.concat(dataframes, ignore_index=True)
    
    # Reorder columns to make 'category' the first column
    final_df = final_df[['category'] + [col for col in final_df.columns if col != 'category']]

    # Print the final DataFrame shape and data types
    print(final_df.shape)
    print(final_df.dtypes)
    print(final_df.head())

else:
    print("No data retrieved for the specified pages.")



(3000, 43)
category                              object
alcohol_100g                         float64
brands                                object
brands_tags                           object
carbohydrates_100g                   float64
categories                            object
categories_tags                       object
code                                  object
countries                             object
countries_tags                        object
energy-kcal_100g                     float64
energy-kj_100g                       float64
fat_100g                             float64
fiber_100g                           float64
ingredients_text                      object
no_nutrition_data                     object
nova_group                           float64
nutriscore_grade                      object
nutriscore_score                      object
origins                               object
origins_tags                          object
product_name                          object

In [2]:
final_df.to_csv("~/food_data2.csv", index=False)

In [3]:

# Data cleaning 1   --------------------------------------------

# Load the data
file_path = '~/food_data2.csv'
final_df = pd.read_csv(file_path)

# Initial number of rows
initial_rows = final_df.shape[0]
current_rows = initial_rows

print(f"Initial number of rows: {initial_rows}")

# Step 0.5: Convert code column to string
final_df['code'] = final_df['code'].astype(str)

# Step 0.6: Remove duplicates
final_df = final_df.drop_duplicates()
rows_removed_step_06 = current_rows - final_df.shape[0]
current_rows = final_df.shape[0]
print(f"After removing duplicates: {rows_removed_step_06} rows removed")

# Step 1: Encode the nutriscore grade to ordinal values and add it to the dataframe
nutriscore_mapping = {
    'a': 1,
    'b': 2,
    'c': 3,
    'd': 4,
    'e': 5
}
final_df['nutriscore_grade_encoded'] = final_df['nutriscore_grade'].map(nutriscore_mapping)

# Step 2: Remove every row that does not have a nutriscore or is unknown or not-applicable
initial_rows_step_2 = final_df.shape[0]
final_df = final_df[final_df['nutriscore_grade'].isin(nutriscore_mapping.keys())]
rows_removed_step_2 = initial_rows_step_2 - final_df.shape[0]
current_rows = final_df.shape[0]
print(f"After removing rows without a valid nutriscore: {rows_removed_step_2} rows removed")

# Step 3: Remove every row that does not have an energy-kcal_100g score
initial_rows_step_3 = final_df.shape[0]
final_df = final_df.dropna(subset=['energy-kcal_100g'])
rows_removed_step_3 = initial_rows_step_3 - final_df.shape[0]
current_rows = final_df.shape[0]
print(f"After removing rows without energy-kcal_100g: {rows_removed_step_3} rows removed")

# Final number of rows
final_rows = final_df.shape[0]

# Calculate and print the number of rows removed and the percentage
total_rows_removed = initial_rows - final_rows
percentage_removed = (total_rows_removed / initial_rows) * 100

print(f"Initial number of rows: {initial_rows}")
print(f"Final number of rows: {final_rows}")
print(f"Total number of rows removed: {total_rows_removed}")
print(f"Percentage of rows removed: {percentage_removed:.2f}%")

# Display the first few rows of the cleaned dataframe
final_df_cleaned = final_df
print(final_df_cleaned.head())


Initial number of rows: 3000
After removing duplicates: 0 rows removed
After removing rows without a valid nutriscore: 85 rows removed
After removing rows without energy-kcal_100g: 28 rows removed
Initial number of rows: 3000
Final number of rows: 2887
Total number of rows removed: 113
Percentage of rows removed: 3.77%
  category  alcohol_100g                brands               brands_tags  \
0   snacks           0.0           Lu,Mondelez        ['lu', 'mondelez']   
1   snacks           NaN                Gerblé                ['gerble']   
2   snacks           NaN  Chocolaterie Monbana  ['chocolaterie-monbana']   
3   snacks           NaN                 Lindt                 ['lindt']   
4   snacks           NaN                 Lindt                 ['lindt']   

   carbohydrates_100g                                         categories  \
0                69.0    Snacks,Sweet snacks,Biscuits and cakes,Biscuits   
1                64.0  Snacks, Snacks sucrés, Biscuits et gâteaux, Bi.

In [4]:
final_df.to_csv("~/food_data3_cleaned.csv", index=False)

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

# Load the dataset
file_path = '~/food_data3_cleaned.csv'
data = pd.read_csv(file_path)

# Select relevant columns for model training
selected_columns = [
    'energy-kj_100g', 'energy-kcal_100g', 'fat_100g', 'saturated-fat_100g',
    'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 
    'salt_100g', 'sodium_100g', 'nutriscore_grade'
]

data_selected = data[selected_columns]

# Convert 'nutriscore_grade' to a numerical format for model training
nutriscore_mapping = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
data_selected['nutriscore_grade_encoded'] = data_selected['nutriscore_grade'].map(nutriscore_mapping)

# Drop the original 'nutriscore_grade' column
data_selected = data_selected.drop('nutriscore_grade', axis=1)

# Handle missing values by filling with the mean of each column
data_selected.fillna(data_selected.mean(), inplace=True)

# Separate features and target variable
X = data_selected.drop('nutriscore_grade_encoded', axis=1)
y = data_selected['nutriscore_grade_encoded']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_predictions = rf_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, rf_predictions)
conf_matrix = confusion_matrix(y_test, rf_predictions)
class_report = classification_report(y_test, rf_predictions)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Save the model and scaler
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('rf_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_selected['nutriscore_grade_encoded'] = data_selected['nutriscore_grade'].map(nutriscore_mapping)


Accuracy: 0.884083044982699
Confusion Matrix:
[[ 24   8   0   0   0]
 [  1  34   6   1   0]
 [  3   5  86  15   0]
 [  0   1  12 168   5]
 [  0   1   0   9 199]]
Classification Report:
              precision    recall  f1-score   support

           1       0.86      0.75      0.80        32
           2       0.69      0.81      0.75        42
           3       0.83      0.79      0.81       109
           4       0.87      0.90      0.89       186
           5       0.98      0.95      0.96       209

    accuracy                           0.88       578
   macro avg       0.84      0.84      0.84       578
weighted avg       0.89      0.88      0.88       578

