In [None]:
import pickle
import pprint
import warnings
import numpy as np 
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
warnings.filterwarnings('ignore')

# Exploratory Data Analysis (EDA)

In [None]:
# Read the raw profile table from disk; the cells below build on `df`.
datapath = 'diabetes_user_profiles_with_mealID.csv'
df = pd.read_csv(filepath_or_buffer=datapath)
df

In [None]:
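# Optional helper (disabled): export the first row of `df` as JSON, e.g. to
# obtain a payload shaped like the `sample_json` dict used for inference.
# sample_json = df.iloc[0].to_json()
# with open('sample.json', 'w') as f:
#     f.write(sample_json)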

In [None]:
### NaN Value Distribution
# Percentage of missing values per column (two decimals), restricted to the
# columns that contain at least one NaN and sorted from most to least missing.
nan_dist = (
    df.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
    .div(df.shape[0])
    .mul(100)
    .round(2)
)

# plot
plt.figure(figsize=(10, 5))
nan_ax = sns.barplot(x=nan_dist.values, y=nan_dist.index, palette='viridis')
nan_ax.set_title('NaN Value Distribution')
nan_ax.set_xlabel('Percentage')
nan_ax.set_ylabel('Columns')
plt.show()

In [None]:
# Columns excluded from the analysis; the same list is dropped again inside
# `data_pipeline` and `inference_meal`.
dropped_columns = [
    'RecordID', 'Name',
    'OtherConditions',
    'FoodsAvoided',
    'Intolerances',
    'TriggerFoods',
    'DietFollowed',
    'Allergies',
]

# Rebind instead of mutating in place, and tolerate already-removed columns,
# so that re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=dropped_columns, errors='ignore')
df

In [None]:
# Target column (kept as a one-element list so it can be concatenated with
# the feature lists).
output_column = ['MealID']

# Categorical (string-valued) profile columns.
cat_columns = [
    'Gender',
    'Location',
    'Occupation',
    'DiabetesType',
    'FavoriteFoods',
    'HealthGoals',
    'DietChallenges',
    'TraditionalFoods',
    'CookingFrequency',
    'CookingMethods',
]

# Numeric profile columns.
num_columns = [
    'Age',
    'Height',
    'Weight',
    'DiagnosedYearsAgo',
    'FastingGlucose',
    'PostprandialGlucose',
]

In [None]:
## plot cat column distribution 5 x 2
# One count plot per categorical column; `axes.flat` walks the 5 x 2 grid
# row by row, i.e. in the same order as indexing with [i // 2, i % 2].
fig, axes = plt.subplots(5, 2, figsize=(20, 20))
for ax, column in zip(axes.flat, cat_columns):
    sns.countplot(x=column, data=df, ax=ax, palette='viridis')
    ax.set_title(f'{column} Distribution')
    ax.set_xlabel(column)

plt.tight_layout()
plt.show()

In [None]:
## plot num column distribution 3 x 2
# One histogram per numeric column; `axes.flat` walks the 3 x 2 grid row by
# row, i.e. in the same order as indexing with [i // 2, i % 2].
fig, axes = plt.subplots(3, 2, figsize=(20, 15))
for ax, column in zip(axes.flat, num_columns):
    sns.histplot(df[column], ax=ax, palette='viridis')
    ax.set_title(f'{column} Distribution')
    ax.set_xlabel(column)

plt.tight_layout()
plt.show()

In [None]:
def data_pipeline(
                    datapath = 'diabetes_user_profiles_with_mealID.csv',
                    cat_columns = ['Gender', 'Location', 'Occupation', 'DiabetesType', 'FavoriteFoods', 'HealthGoals', 'DietChallenges', 'TraditionalFoods', 'CookingFrequency', 'CookingMethods'],
                    num_columns = ['Age', 'Height', 'Weight', 'DiagnosedYearsAgo', 'FastingGlucose', 'PostprandialGlucose'],
                    output_column = ['MealID']
                    ):
    """Load the profile CSV, label-encode it and return model-ready arrays.

    Parameters
    ----------
    datapath : str
        Path of the CSV file to read.
    cat_columns : list of str
        Categorical columns; each one is label-encoded with its own
        ``LabelEncoder``.
    num_columns : list of str
        Numeric columns. Accepted for interface compatibility; the function
        does not use it to select columns.
    output_column : list of str
        Target column(s); label-encoded like the categorical columns.

    Returns
    -------
    X : numpy.ndarray
        Every remaining column except the target, in the file's column
        order, with categorical columns replaced by their encoded values.
    Y : numpy.ndarray
        1-D array of encoded target labels.
    encoder : collections.defaultdict
        Mapping ``column name -> fitted LabelEncoder``.

    Side effects
    ------------
    Writes the fitted encoders to ``encoder_meal.pkl`` in the working
    directory.
    """
    df = pd.read_csv(datapath)
    df = df.drop(
            columns=[
                    'RecordID', 'Name',
                    'OtherConditions',
                    'FoodsAvoided',
                    'Intolerances',
                    'TriggerFoods',
                    'DietFollowed',
                    'Allergies'
                    ]
            )

    # One encoder per column, created lazily by the defaultdict.
    encoder = defaultdict(LabelEncoder)
    for col in cat_columns + output_column:
        df[col] = encoder[col].fit_transform(df[col])

    with open('encoder_meal.pkl', 'wb') as f:
        pickle.dump(encoder, f)

    # Keep the encoded categorical columns as features: only the target is
    # removed. Dropping `cat_columns` here would discard the columns that were
    # just encoded and train the model on a different feature set than the
    # one `inference_meal` builds (all encoded columns, in payload order).
    X = df.drop(columns = output_column)
    Y = df[output_column]

    X, Y = np.array(X), np.array(Y).ravel()
    return X, Y, encoder

In [None]:
# Build the feature matrix / target vector, then hold out 20 % of the rows
# (fixed seed) for evaluation.
X, Y, encoder_meal = data_pipeline()

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Report the size of each split, one per line after a leading blank line.
print(
    f"\nX_train shape: {X_train.shape}",
    f"Y_train shape: {Y_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"Y_test shape: {Y_test.shape}",
    sep="\n",
)

In [None]:
# Multi-class CatBoost model for predicting the encoded MealID.
cat = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    loss_function='MultiClass',
    depth=6,
)

# Fit on the training split only. Fitting on the full (X, Y) would include
# the rows of (X_test, Y_test), so the eval-set metrics and the test reports
# in later cells would be computed on data the model has already seen.
cat.fit(
    X_train, Y_train,
    eval_set=(X_test, Y_test),
    verbose=100,
)

In [None]:
# Persist the fitted classifier next to the notebook for the inference cells.
with open('model_meal.pkl', 'wb') as model_file:
    pickle.dump(cat, model_file)

In [None]:
# Predicted (encoded) meal labels for the training and the held-out split.
P_train, P_test = (cat.predict(features) for features in (X_train, X_test))

In [None]:
# Human-readable class names, in encoder order.
target_names = encoder_meal['MealID'].classes_
target_names = [f'Meal ID : {name}' for name in target_names]

# LabelEncoder encodes its classes as 0..n_classes-1. Passing that full label
# set keeps the report aligned with `target_names` even when a class is
# missing from a split; without it scikit-learn infers the labels from the
# data and raises if their count differs from len(target_names).
report_labels = np.arange(len(target_names))

print("---------------------- Train CLS REPORT ----------------------")
clf_report = classification_report(
    Y_train,
    P_train,
    labels=report_labels,
    target_names=target_names,
)
print(clf_report)

print("---------------------- Test CLS REPORT ----------------------")
clf_report = classification_report(
    Y_test,
    P_test,
    labels=report_labels,
    target_names=target_names,
)
print(clf_report)

In [None]:
# Row-normalised confusion matrix for the training split (each row shows how
# the samples of one true class were distributed over the predicted classes).
# The explicit label set keeps the matrix the same size as `target_names`
# even if a class does not occur in this split.
cm = confusion_matrix(Y_train, P_train, labels=np.arange(len(target_names)))
row_totals = cm.sum(axis=1, keepdims=True)
# Rows without any true samples stay at 0 instead of becoming NaN (0 / 0).
cm = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)
cm = np.round(cm, 2)

plt.figure(figsize=(15, 10))
sns.heatmap(cm, annot=True, fmt=".2f", cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Train Confusion Matrix')
plt.show()

In [None]:
# Row-normalised confusion matrix for the held-out split (each row shows how
# the samples of one true class were distributed over the predicted classes).
# The explicit label set keeps the matrix the same size as `target_names`
# even if a class does not occur in this split.
cm = confusion_matrix(Y_test, P_test, labels=np.arange(len(target_names)))
row_totals = cm.sum(axis=1, keepdims=True)
# Rows without any true samples stay at 0 instead of becoming NaN (0 / 0).
cm = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)
cm = np.round(cm, 2)

plt.figure(figsize=(15, 10))
sns.heatmap(cm, annot=True, fmt=".2f", cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Test Confusion Matrix')
plt.show()

### Inference

In [None]:
# Reload the artefacts written by the training cells.
# NOTE: unpickling executes code embedded in the file; only load pickle
# files that this notebook (or another trusted source) produced.
with open('encoder_meal.pkl', 'rb') as encoder_file:
    encoder_meal = pickle.load(encoder_file)

with open('model_meal.pkl', 'rb') as model_file:
    model_meal = pickle.load(model_file)

In [None]:
def inference_meal(
                    sample_json,
                    meal_path = 'sri_lankan_meal_dataset.csv',
                    cat_columns = ['Gender', 'Location', 'Occupation', 'DiabetesType', 'FavoriteFoods', 'HealthGoals', 'DietChallenges', 'TraditionalFoods', 'CookingFrequency', 'CookingMethods']
                    ):
    """Predict a meal for one user profile and return its meal records.

    Relies on the module-level ``encoder_meal`` (per-column label encoders)
    and ``model_meal`` (fitted classifier).

    Parameters
    ----------
    sample_json : dict
        One user profile, keyed by column name.
    meal_path : str
        CSV file with a ``MealID`` column describing the meals.
    cat_columns : list of str
        Profile columns to label-encode before prediction.

    Returns
    -------
    dict
        ``{'MealID': <predicted id>, 'Meal': [<row dict>, ...]}`` where
        ``'Meal'`` holds the rows of ``meal_path`` whose ``MealID`` equals the
        prediction (without the ``MealID`` column itself).
    """
    df = pd.DataFrame([sample_json])
    # These fields are removed before prediction, so a payload that omits
    # some of them must not fail with a KeyError.
    df = df.drop(
        columns=[
                'RecordID', 'Name',
                'OtherConditions',
                'FoodsAvoided',
                'Intolerances',
                'TriggerFoods',
                'DietFollowed',
                'Allergies'
                ],
        errors='ignore'
    )

    for col in cat_columns:
        df[col] = encoder_meal[col].transform(df[col])

    # NOTE(review): the feature vector follows the key order of `sample_json`;
    # it has to match the column order the model was trained on - confirm
    # against the training pipeline.
    x = df.values
    p = model_meal.predict(x)
    p = int(p.squeeze())
    p = encoder_meal['MealID'].inverse_transform([p])[0]
    # inverse_transform yields a NumPy scalar; convert it to the matching
    # built-in type so the response can be serialised (e.g. with json).
    if isinstance(p, np.generic):
        p = p.item()

    df_meal = pd.read_csv(meal_path)
    df_meal = df_meal[df_meal['MealID'] == p].drop(columns='MealID')

    meal_dict = df_meal.to_dict(orient='records')

    response = {
        'MealID': p,
        'Meal': meal_dict,
    }
    return response
    

In [None]:
# Example profile for a single user. The key order is kept exactly as is
# because `inference_meal` builds its feature vector in this order.
sample_json = {
    "RecordID": 1,
    "Name": "Nimal Fernando",
    "Age": 64,
    "Gender": "Male",
    "Height": 168,
    "Weight": 80,
    "Location": "Jaffna",
    "Occupation": "Homemaker",
    "DiabetesType": "Gestational",
    "DiagnosedYearsAgo": 2,
    "FastingGlucose": 138,
    "PostprandialGlucose": 235,
    "OtherConditions": "High cholesterol",
    "FavoriteFoods": "Rice and curry",
    "FoodsAvoided": None,
    "DietFollowed": "Vegetarian",
    "TriggerFoods": None,
    "Allergies": "Dairy",
    "Intolerances": "Gluten",
    "HealthGoals": "Better blood sugar control",
    "DietChallenges": "Cravings",
    "TraditionalFoods": "Pickled vegetables",
    "CookingFrequency": "Rarely",
    "CookingMethods": "Steaming",
}

In [None]:
# Run the end-to-end inference on the example profile and pretty-print the
# resulting response dict (despite its name, `df_meal` is not a DataFrame).
df_meal = inference_meal(sample_json=sample_json)
pprint.pprint(df_meal)