#### Import Required Libraries

Import the necessary libraries for data preprocessing, model training, hyperparameter tuning, and evaluation.

In [1]:
# Import necessary libraries for preprocessing, model selection, and evaluation
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib


 #### Load the Dataset
Load the preprocessed and cleaned dataset, ready for training.

In [2]:
# Load the cleaned dataset
file_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_cleaned_data.csv"
df = pd.read_csv(file_path)

# Preview the first few rows of the dataset
print("First few rows:")
print(df.head())

# Get a summary of the dataset
print("\nSummary statistics:")
print(df.describe(include='all'))

# Check for data types and non-null counts for each feature
print("\nInformation about each feature:")
print(df.info())


First few rows:
                             categories          pnns_groups_1  \
0                    Dark chocolate bar          sugary snacks   
1                                Gemüse  fruits and vegetables   
2  Beverages and beverages preparations              beverages   
3       Plant-based foods and beverages              beverages   
4                             Beverages              beverages   

                      pnns_groups_2                       food_groups  \
0                chocolate products                chocolate-products   
1                        vegetables                        vegetables   
2  artificially sweetened beverages  artificially-sweetened-beverages   
3  artificially sweetened beverages  artificially-sweetened-beverages   
4  artificially sweetened beverages  artificially-sweetened-beverages   

  nutriscore_grade  energy-kcal_100g   fat_100g  saturated-fat_100g  \
0                e          578.0000  40.900000           21.500000   
1     

In [3]:
# List all features (columns) in the DataFrame
features = df.columns.tolist()
print("List of features in the DataFrame:")
print(features)


List of features in the DataFrame:
['categories', 'pnns_groups_1', 'pnns_groups_2', 'food_groups', 'nutriscore_grade', 'energy-kcal_100g', 'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'fruits-vegetables-nuts-estimate-from-ingredients_100g', 'nutrition-score-fr_100g']


In [4]:
# Remove the 'categories' and 'nutrition-score-fr_100g' columns
df = df.drop([ 'nutrition-score-fr_100g'], axis=1)

# Save the modified DataFrame to a new CSV file
output_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_csv_2.csv"
df.to_csv(output_path, index=False)


In [5]:
# List all features (columns) in the DataFrame
features = df.columns.tolist()
print("List of features in the DataFrame:")
print(features)

List of features in the DataFrame:
['categories', 'pnns_groups_1', 'pnns_groups_2', 'food_groups', 'nutriscore_grade', 'energy-kcal_100g', 'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'fruits-vegetables-nuts-estimate-from-ingredients_100g']


In [6]:
import pandas as pd

# Load the dataset
file_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_cleaned_data.csv"
df = pd.read_csv(file_path)

# Remove the specified columns
columns_to_remove = ['pnns_groups_1', 'pnns_groups_2', 'food_groups', 'nutrition-score-fr_100g']
df_modified_v2 = df.drop(columns=columns_to_remove, errors='ignore')

# Save the modified DataFrame to a new CSV file
output_path_v2 = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_csv_3.csv"
df_modified_v2.to_csv(output_path_v2, index=False)


#### Define the Preprocessing Function

The preprocessing function includes:

Sampling the data (optional).
Separating features and target variable.
One-hot encoding of categorical features.
Balancing the classes using SMOTE.
Standardizing the features.
Splitting the data into training and testing sets.

In [7]:
def preprocess_data(df, target_column="nutriscore_grade", test_size=0.2, sample_size=0.2):
    # Step 1: Take a sample of the data if specified
    df_sample = df.sample(frac=sample_size, random_state=42) if sample_size < 1.0 else df.copy()

    # Step 2: Separate features and target variable
    X = df_sample.drop(columns=[target_column])
    y = df_sample[target_column]

    # Step 3: One-hot encode categorical features
    X = pd.get_dummies(X, drop_first=True)

    # Step 4: Handle imbalanced classes using SMOTE
    smote = SMOTE(random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Step 5: Standardize the features
    scaler = StandardScaler()
    X_balanced = scaler.fit_transform(X_balanced)

    # Step 6: Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=test_size, random_state=42)

    return X_train, X_test, y_train, y_test


#### Apply Preprocessing to the Dataset

In [8]:
X_train, X_test, y_train, y_test = preprocess_data(df)


 #### Define Models and Hyperparameters for GridSearch

We define three models: Logistic Regression, Random Forest, and SVM, along with a set of hyperparameters for each model to optimize using GridSearchCV.

In [9]:
'''# Define models and hyperparameters
models = {
    "Logistic Regression": (LogisticRegression(max_iter=1000, random_state=42), {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear', 'lbfgs']
    }),
    "Random Forest": (RandomForestClassifier(random_state=42), {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5]
    }),
    "SVM": (SVC(random_state=42), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    })
}
''''

SyntaxError: unterminated string literal (detected at line 17) (4170738250.py, line 17)

#### Hyperparameter Tuning with GridSearchCV

We perform hyperparameter tuning on each model using GridSearchCV. For each model, we identify the best parameters that yield the highest accuracy.

In [9]:
'''# Set n_jobs and adjust error_score to capture issues
best_models = {}
for model_name, (model, params) in models.items():
    print(f"\nTuning {model_name} with GridSearchCV...")
    
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=3, scoring='accuracy', n_jobs=1, error_score='raise')
    
    try:
        grid_search.fit(X_train, y_train)
        
        # Capture the best model and parameters
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        
        print(f"Best Parameters for {model_name}: {best_params}")
        print(f"Best Score for {model_name}: {grid_search.best_score_}\n")
        
        # Save the best model
        best_models[model_name] = best_model
        
    except Exception as e:
        print(f"An error occurred during GridSearch for {model_name}: {e}")
'''


Tuning Logistic Regression with GridSearchCV...


KeyboardInterrupt: 

#### Evaluate the Best Model from Each Algorithm

Evaluate each of the best models from GridSearchCV using the test set. Metrics include accuracy, classification report, and confusion matrix

In [None]:
'''for model_name, model in best_models.items():
    print(f"\nEvaluating {model_name}...")
    y_pred = model.predict(X_test)
    
    # Evaluation metrics
    print(f"{model_name} Performance:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))''''


#### Save the Best Model

Based on the evaluation metrics, we choose the best overall model and save it using Joblib. This model will be used for deployment in our Flask application.

In [None]:
'''# Choose the best model based on highest accuracy or other criteria
best_overall_model = max(best_models.items(), key=lambda item: accuracy_score(y_test, item[1].predict(X_test)))[1]

# Save the chosen model
model_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/trained_models/best_model.joblib"
joblib.dump(best_overall_model, model_path)
print(f"Best model saved successfully at: {model_path}")'''
