In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np

In [2]:
df = pd.read_csv("diet_recommendations_dataset.csv")
df.head()

Unnamed: 0,Patient_ID,Age,Gender,Weight_kg,Height_cm,BMI,Disease_Type,Severity,Physical_Activity_Level,Daily_Caloric_Intake,Cholesterol_mg/dL,Blood_Pressure_mmHg,Glucose_mg/dL,Dietary_Restrictions,Allergies,Preferred_Cuisine,Weekly_Exercise_Hours,Adherence_to_Diet_Plan,Dietary_Nutrient_Imbalance_Score,Diet_Recommendation
0,P0001,56,Male,58.4,160,22.8,Obesity,Moderate,Moderate,3079,173.3,133,116.3,,Peanuts,Mexican,3.1,96.6,3.1,Balanced
1,P0002,69,Male,101.2,169,35.4,Diabetes,Mild,Moderate,3032,199.2,120,137.1,,Peanuts,Chinese,4.5,63.2,0.6,Low_Carb
2,P0003,46,Female,63.5,173,21.2,Hypertension,Mild,Sedentary,1737,181.0,121,109.6,,Peanuts,Chinese,3.8,57.5,4.6,Low_Sodium
3,P0004,32,Male,58.1,164,21.6,,Mild,Moderate,2657,168.2,144,159.4,,,Mexican,4.3,54.5,0.4,Balanced
4,P0005,60,Male,79.5,197,20.5,Diabetes,Moderate,Sedentary,3496,200.4,172,182.3,Low_Sugar,,Italian,9.8,78.2,4.7,Low_Carb


# Observing Data

In [3]:
df.describe()

Unnamed: 0,Age,Weight_kg,Height_cm,BMI,Daily_Caloric_Intake,Cholesterol_mg/dL,Blood_Pressure_mmHg,Glucose_mg/dL,Weekly_Exercise_Hours,Adherence_to_Diet_Plan,Dietary_Nutrient_Imbalance_Score
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,49.857,84.6024,174.817,28.1916,2475.064,199.7179,144.993,136.8676,5.166,74.8843,2.4692
std,18.114267,20.088121,14.33377,8.040136,565.017032,29.080614,20.245712,37.934819,2.847995,14.82638,1.459631
min,18.0,50.0,150.0,13.0,1500.0,150.4,110.0,70.2,0.0,50.0,0.0
25%,35.0,66.6,162.0,22.075,1984.75,174.3,128.0,105.0,2.8,62.0,1.2
50%,50.0,85.2,175.0,27.45,2470.5,199.85,145.0,138.0,5.2,74.2,2.4
75%,66.0,102.0,187.0,33.425,2937.25,224.85,163.0,170.65,7.6,88.2,3.7
max,79.0,119.7,199.0,52.4,3498.0,249.9,179.0,200.0,10.0,100.0,5.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Patient_ID                        1000 non-null   object 
 1   Age                               1000 non-null   int64  
 2   Gender                            1000 non-null   object 
 3   Weight_kg                         1000 non-null   float64
 4   Height_cm                         1000 non-null   int64  
 5   BMI                               1000 non-null   float64
 6   Disease_Type                      796 non-null    object 
 7   Severity                          1000 non-null   object 
 8   Physical_Activity_Level           1000 non-null   object 
 9   Daily_Caloric_Intake              1000 non-null   int64  
 10  Cholesterol_mg/dL                 1000 non-null   float64
 11  Blood_Pressure_mmHg               1000 non-null   int64  
 12  Glucose

In [5]:
df['Diet_Recommendation'].describe()
df['Diet_Recommendation'].unique()

array(['Balanced', 'Low_Carb', 'Low_Sodium'], dtype=object)

In [6]:
df['Disease_Type'].describe()
df['Disease_Type'].unique()

array(['Obesity', 'Diabetes', 'Hypertension', nan], dtype=object)

In [7]:
df['Severity'].describe()
# df['Severity'].unique()

count       1000
unique         3
top       Severe
freq         344
Name: Severity, dtype: object

In [8]:
df['Physical_Activity_Level'].describe()

count         1000
unique           3
top       Moderate
freq           335
Name: Physical_Activity_Level, dtype: object

### Age, gender weight_kg, height_cm, BMI, disease_type, Severity, physical_activity_level, daily_chalorie_intake, weekly_exercise_hours
These are some features which Iam going to take in consideration since the user cannot provide some features which are in dataset

In [9]:
df["Gender"] = df["Gender"].astype("category", copy="False")
df["Disease_Type"] = df["Disease_Type"].astype("category", copy="False")
df["Severity"] = df["Severity"].astype("category", copy="False")
df["Physical_Activity_Level"] = df["Physical_Activity_Level"].astype("category", copy="False")

In [10]:
cateogorical_features = ['Gender', "Disease_Type", "Severity", "Physical_Activity_Level"]
numeric_features = ['Age', "Weight_kg", "Height_cm", "BMI", "Daily_Caloric_Intake", "Weekly_Exercise_Hours", "Cholesterol_mg/dL"
,"Blood_Pressure_mmHg","Glucose_mg/dL"]

In [11]:
features = ['Gender', "Disease_Type", "Severity", "Physical_Activity_Level", 'Age', "Weight_kg", "Height_cm", "BMI", "Daily_Caloric_Intake", "Weekly_Exercise_Hours", "Cholesterol_mg/dL"
,"Blood_Pressure_mmHg","Glucose_mg/dL"]
target = "Diet_Recommendation"

In [12]:
X = df[features]
y = df[target]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data preprocssing Pipeline

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [15]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), numeric_features)
    ]
)

# Preprocessing + Randomforest

In [16]:
from sklearn.pipeline import Pipeline

In [17]:
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', max_depth=10))
])

In [18]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
y_pred = model.predict(X_test)

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Balanced       0.47      0.63      0.53        86
    Low_Carb       0.23      0.13      0.16        47
  Low_Sodium       0.36      0.31      0.34        67

    accuracy                           0.41       200
   macro avg       0.35      0.36      0.35       200
weighted avg       0.38      0.41      0.38       200



In [21]:
import joblib

In [23]:
joblib.dump(model, 'Model103.joblib')

print(f"Model saved")

Model saved
