In [1]:
import pandas as pd
import ast
from processing_functions import round_up_to_nearest, round_down_to_nearest, filter_calories, sorted_binned_encoding, collapsing_to_priority
import warnings

ModuleNotFoundError: No module named 'processing_functions'

In [49]:
raw_df = pd.read_csv('../recipes.csv')

#dropping duplicates from recipe name (which is the label column) because some sources give the same recipes
df = raw_df.drop_duplicates('label')

In [82]:
#handling target variable first
calories_df = df['calories']

#capping the calorie count, so we will include recipes with calorie counts so that we maintain 90% of our data
filtered_calories_df = filter_calories(df, column='calories', quartile_percent=0.9)
max_calorie_cutoff = round_up_to_nearest(max(filtered_calories_df))

In [83]:
#binning the calorie count to turn this into a classification problem
bin_edges = [i for i in range(0, int(max_calorie_cutoff)+1, 300)]
labels = [f"{bin_edges[i]}-{bin_edges[i+1]-1}" for i in range(len(bin_edges)-1)]
binned_calories = pd.cut(filtered_calories_df, bins=bin_edges, labels=labels, include_lowest=True)

In [84]:
binned_calories = binned_calories.rename('binnedCalories')

In [85]:
# Assuming binned_calories is a pandas Series or DataFrame
assert binned_calories.isna().sum() == 0, "The count of NaN values in binned_calories is not equal to 0"

In [86]:
label_encoding = sorted_binned_encoding(binned_calories)
target = binned_calories.map(label_encoding)

In [87]:
binned_calories_df = df.loc[target.index]

print(binned_calories_df.shape)

(10494, 22)


In [88]:
binned_calories_df = pd.concat([binned_calories_df, target], axis=1)
binned_calories_df.shape

(10494, 23)

In [89]:
from collections import Counter

In [90]:
dishType_df = binned_calories_df.dropna(subset=['dishType'])
dishType_df = dishType_df['dishType'].apply(ast.literal_eval)

In [91]:
def get_values(df, column):
    df.loc[df[column].isna(), column] = '[]'
    labels_lst = []
    for label in df[column].apply(ast.literal_eval):
        labels_lst += label
    return labels_lst

In [92]:
Counter(get_values(binned_calories_df, 'dishType'))

Counter({'main course': 3381,
         'starter': 1757,
         'condiments and sauces': 1217,
         'desserts': 1080,
         'salad': 1023,
         'sandwiches': 416,
         'soup': 411,
         'drinks': 377,
         'bread': 251,
         'cereals': 206,
         'alcohol cocktail': 168,
         'biscuits and cookies': 116,
         'pancake': 81,
         'egg': 40,
         'preserve': 11,
         'special occasions': 11,
         'omelet': 7,
         'christmas': 7,
         'preps': 6,
         'thanksgiving': 2,
         'new year': 1,
         'cinco de mayo': 1})

In [93]:
priority_list_dish_type = [
    'main course',
    'starter',
    'salad',
    'soup',
    'drinks',
    'bread',
    'desserts',
    'condiments and sauces',
    'sandwiches',
    'cereals',
    'alcohol cocktail',
    'biscuits and cookies',
    'pancake',
    'egg',
    'preserve',
    'omelet',
    'special occasions',
    'christmas',
    'preps',
    'thanksgiving',
    'cinco de mayo'
]

In [94]:
new_dish_type_df = []
for dish_type_lst in dishType_df:
    if len(dish_type_lst) == 1:
        new_dish_type_df.append(dish_type_lst[0])
    else:
        for priority_dish_type in priority_list_dish_type:
            for dish_type in 
            if priority_dish_type in dish_type_lst:
                new_dish_type_df.append(priority_dish_type)
                break
            else:
                #if for some reason none of the dish types in the list are in the priority list, just append mainn course which is the most frequent
                new_dish_type_df.append(priority_list_dish_type[0])

SyntaxError: invalid syntax (3484438017.py, line 7)

In [95]:
def collapsing_to_priority_test(type_lst, priority_list):
    if len(type_lst) == 1:
        return type_lst[0]
    else:
        for priority_item in priority_list:
            for item in type_lst:
                if priority_item == item:
                    return item
        else:
            warnings.warn("No item in the priority list was found, returning the first priority list item.")
            return priority_list[0]

In [96]:
Counter([collapsing_to_priority_test(dish_lst, priority_list_dish_type) for dish_lst in dishType_df.to_list()])

Counter({'main course': 3381,
         'starter': 1744,
         'condiments and sauces': 1207,
         'desserts': 1068,
         'salad': 970,
         'sandwiches': 407,
         'soup': 394,
         'drinks': 377,
         'bread': 250,
         'cereals': 204,
         'alcohol cocktail': 168,
         'biscuits and cookies': 116,
         'pancake': 81,
         'preserve': 11,
         'preps': 5,
         'special occasions': 2})

In [97]:
binned_calories_df = binned_calories_df.dropna(subset=['dishType'])
dishType_df = binned_calories_df['dishType'].apply(ast.literal_eval)
dishType_df = dishType_df.rename('dishTypeLabel')

dishType_df = dishType_df.apply(lambda x: collapsing_to_priority(x, priority_list_dish_type))



In [98]:
pre_processed_df = pd.concat([binned_calories_df, dishType_df], axis=1)
print(dishType_df.shape)

(10494,)


In [99]:
pre_processed_df.columns

Index(['uri', 'label', 'image', 'source', 'url', 'shareAs', 'yield',
       'dietLabels', 'healthLabels', 'cautions', 'ingredientLines',
       'ingredients', 'calories', 'totalWeight', 'totalTime', 'cuisineType',
       'mealType', 'dishType', 'totalNutrients', 'totalDaily', 'digest',
       'tags', 'binnedCalories', 'dishTypeLabel'],
      dtype='object')

In [121]:
skewness_by_category = pre_processed_df.groupby('dishTypeLabel')['calories'].skew()
skewness_by_category

dishTypeLabel
alcohol cocktail         0.884605
biscuits and cookies    -0.204065
bread                    0.189855
cereals                  1.315521
condiments and sauces    0.739704
desserts                 0.386819
drinks                   1.554733
main course              0.520799
pancake                  0.277986
preps                    0.593710
preserve                 0.583254
salad                    0.926322
sandwiches               0.637394
soup                     0.585448
starter                  0.776458
Name: calories, dtype: float64

In [122]:
skewness_min = skewness_by_category.min()
skewness_max = skewness_by_category.max()

interval_width = (skewness_max - skewness_min) / 3
bin1_end = skewness_min + interval_width
bin2_end = bin1_end + interval_width

bins = {
    'Left Skewed (Higher Calories)': skewness_by_category[(skewness_by_category >= skewness_min) & (skewness_by_category < bin1_end)],
    'Approximately Symmetric (Normal Calories)': skewness_by_category[(skewness_by_category >= bin1_end) & (skewness_by_category < bin2_end)],
    'Right Skewed (Lower Calories)': skewness_by_category[skewness_by_category >= bin2_end]
}



In [123]:
skew_map = {}
for skew in bins.keys():
    for category in bins[skew].index:
        skew_map[category] = skew
        #print(category)

skew_map

{'biscuits and cookies': 'Left Skewed (Higher Calories)',
 'bread': 'Left Skewed (Higher Calories)',
 'pancake': 'Left Skewed (Higher Calories)',
 'alcohol cocktail': 'Approximately Symmetric (Normal Calories)',
 'condiments and sauces': 'Approximately Symmetric (Normal Calories)',
 'desserts': 'Approximately Symmetric (Normal Calories)',
 'main course': 'Approximately Symmetric (Normal Calories)',
 'preps': 'Approximately Symmetric (Normal Calories)',
 'preserve': 'Approximately Symmetric (Normal Calories)',
 'salad': 'Approximately Symmetric (Normal Calories)',
 'sandwiches': 'Approximately Symmetric (Normal Calories)',
 'soup': 'Approximately Symmetric (Normal Calories)',
 'starter': 'Approximately Symmetric (Normal Calories)',
 'cereals': 'Right Skewed (Lower Calories)',
 'drinks': 'Right Skewed (Lower Calories)'}

In [124]:
pre_processed_df['dishTypeSkewedLabels'] = pre_processed_df['dishTypeLabel'].map(skew_map)

In [125]:
pre_processed_df['dishTypeSkewedLabels'].unique()

array(['Approximately Symmetric (Normal Calories)',
       'Right Skewed (Lower Calories)', 'Left Skewed (Higher Calories)'],
      dtype=object)

In [126]:
pre_processed_df = pre_processed_df.dropna(subset=['dishTypeSkewedLabels'])
pre_processed_df = pre_processed_df.reset_index(drop=True)

In [127]:
dish_type_map = {'Approximately Symmetric (Normal Calories)': 1, 'Right Skewed (Lower Calories)': 0, 'Left Skewed (Higher Calories)': 2}

In [128]:
pre_processed_df['dishTypeSkewedLabels'].map(dish_type_map).unique()

array([1, 0, 2], dtype=int64)

In [108]:
pre_processed_df['dishTypeSkewedLabels'] = pre_processed_df['dishTypeSkewedLabels'].map(dish_type_map)

In [110]:
pre_processed_df.columns

Index(['uri', 'label', 'image', 'source', 'url', 'shareAs', 'yield',
       'dietLabels', 'healthLabels', 'cautions', 'ingredientLines',
       'ingredients', 'calories', 'totalWeight', 'totalTime', 'cuisineType',
       'mealType', 'dishType', 'totalNutrients', 'totalDaily', 'digest',
       'tags', 'binnedCalories', 'dishTypeLabel', 'dishTypeSkewedLabels'],
      dtype='object')

In [111]:
pre_processed_df.shape

(10492, 25)

In [4]:
import pandas as pd
import ast
from sklearn.preprocessing import OneHotEncoder
from utils import round_up_to_nearest, round_down_to_nearest, filter_calories, sorted_binned_encoding, collapsing_to_priority, priority_list_meal_type, priority_list_dish_type, one_hot_encode, remove_stop_words, lemmatization
from data_processing import get_target_variable, preprocess_dish_type, preprocess_meal_type

In [5]:
raw_df = pd.read_csv('../recipes.csv')
df = raw_df.drop_duplicates('label')

In [6]:
pre_processed_df = get_target_variable(df)
pre_processed_df = preprocess_dish_type(pre_processed_df)

In [22]:
mealType_df = pre_processed_df['mealType'].apply(ast.literal_eval)

In [7]:
priority_list_meal_type_var = priority_list_meal_type()

In [23]:
mealType_df = mealType_df.apply(lambda x: collapsing_to_priority(x, priority_list_meal_type_var))

replace_lst = ['brunch', 'teatime']
replacement = 'snack'
mealType_df = mealType_df.apply(lambda x: replacement if x in replace_lst else x)

In [31]:
mealType_df = mealType_df.rename('mealTypeRefined')

In [35]:
pre_processed_df = pd.concat([pre_processed_df, mealType_df], axis=1)

In [4]:

def one_hot_encode(df, column):
    onehot_encoder = OneHotEncoder()
    
    # Fit and transform the column to one-hot encoded format
    onehot_encoded = onehot_encoder.fit_transform(df[[column]])
    onehot_encoded_array = onehot_encoded.toarray()
    onehot_encoded_df = pd.DataFrame(onehot_encoded_array, columns=onehot_encoder.get_feature_names_out([column]))

    return onehot_encoded_df

In [38]:
pre_processed_df = pd.concat([pre_processed_df, onehot_encoded_df], axis=1)
pre_processed_df = pre_processed_df.drop('mealTypeRefined', axis=1)

In [40]:
pre_processed_df.columns

Index(['mealTypeRefined_breakfast', 'mealTypeRefined_lunch/dinner',
       'mealTypeRefined_snack', 'uri', 'label', 'image', 'source', 'url',
       'shareAs', 'yield', 'dietLabels', 'healthLabels', 'cautions',
       'ingredientLines', 'ingredients', 'calories', 'totalWeight',
       'totalTime', 'cuisineType', 'mealType', 'dishType', 'totalNutrients',
       'totalDaily', 'digest', 'tags', 'binnedCalories', 'dishTypeLabel',
       'dishTypeSkewedLabels'],
      dtype='object')

In [41]:
def preprocess_meal_type(pre_processed_df):
    mealType_df = pre_processed_df['mealType'].apply(ast.literal_eval)

    #converting multilabel column into single label
    priority_list_meal_type_var = priority_list_meal_type()
    mealType_df = mealType_df.apply(lambda x: collapsing_to_priority(x, priority_list_meal_type_var))

    #replacing brunch and teatime with snack, effectively combining these categories
    replace_lst = ['brunch', 'teatime']
    replacement = 'snack'
    mealType_df = mealType_df.apply(lambda x: replacement if x in replace_lst else x)

    mealType_df = mealType_df.rename('mealTypeRefined')
    pre_processed_df = pd.concat([pre_processed_df, mealType_df], axis=1)

    return pre_processed_df    

In [7]:
pre_processed_df = preprocess_meal_type(pre_processed_df)

In [8]:
onehot_encoded_df = one_hot_encode(pre_processed_df, 'mealTypeRefined')

In [9]:
pre_processed_df = pd.concat([pre_processed_df, onehot_encoded_df], axis=1)

In [10]:
pre_processed_df.columns

Index(['uri', 'label', 'image', 'source', 'url', 'shareAs', 'yield',
       'dietLabels', 'healthLabels', 'cautions', 'ingredientLines',
       'ingredients', 'calories', 'totalWeight', 'totalTime', 'cuisineType',
       'mealType', 'dishType', 'totalNutrients', 'totalDaily', 'digest',
       'tags', 'binnedCalories', 'dishTypeLabel', 'dishTypeSkewedLabels',
       'mealTypeRefined', 'mealTypeRefined_breakfast',
       'mealTypeRefined_lunch/dinner', 'mealTypeRefined_snack'],
      dtype='object')

In [11]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [12]:
english_stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [13]:
def pre_process_text(df, column, stop_words, lemmatizer, inplace=False):
    recipes = df[column].copy()  # Make a copy to avoid modifying the original DataFrame inplace

    recipes = recipes.apply(lambda x: remove_stop_words(x, stop_words))
    recipes = recipes.apply(lambda x: lemmatization(x, lemmatizer))
    recipes = recipes.apply(lambda x: word_tokenize(x))

    if inplace:
        df.loc[:, column] = recipes
        return None  # Return None if inplace=True to indicate that the DataFrame is modified inplace
    else:
        new_df = df.copy()  # Make a copy of the original DataFrame
        new_df.loc[:, column] = recipes
        return new_df

In [14]:
pre_process_text(df=pre_processed_df, column='label', stop_words=english_stop_words, lemmatizer=lemmatizer, inplace=True)

In [15]:
pre_processed_df['binnedCalories']

0        0
1        1
2        3
3        0
4        0
        ..
10378    8
10379    1
10380    5
10381    1
10382    2
Name: binnedCalories, Length: 10383, dtype: category
Categories (13, int64): [0 < 1 < 2 < 3 ... 9 < 10 < 11 < 12]

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
def get_training_testing_data(df, X_columns, y_column, test_size=0.20, random_state=42):
    X = df[X_columns]
    y = df[y_column]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    
    tfidf = TfidfVectorizer()
    
    tfidf_X_train_labels = tfidf.fit_transform(X_train['label'].str.join(' '))
    tfidf_X_test_labels = tfidf.transform(X_test['label'].str.join(' '))

    tfidf_train_df = pd.DataFrame(tfidf_X_train_labels.toarray(), columns=tfidf.get_feature_names_out())
    tfidf_test_df = pd.DataFrame(tfidf_X_test_labels.toarray(), columns=tfidf.get_feature_names_out())
    
    X_train_tfidf = pd.concat([tfidf_train_df, X_train.drop('label', axis=1)], axis=1)
    X_test_tfidf = pd.concat([tfidf_test_df, X_test.drop('label', axis=1)], axis=1)

    return X_train_tfidf, X_test_tfidf, y_train, y_test

In [18]:
X_cols = ['mealTypeRefined_breakfast', 'mealTypeRefined_lunch/dinner', 'mealTypeRefined_snack', 'label', 'dishTypeSkewedLabels']
y_col = 'binnedCalories'
X_train, X_test, y_train, y_test = get_training_testing_data(pre_processed_df, X_cols, y_col)

In [4]:
from sklearn.metrics import cohen_kappa_score, make_scorer, accuracy_score, r2_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import pandas as pd
import cupy as cp

In [5]:
cp.cuda.Device(0).use()

<CUDA Device 0>

In [6]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

In [7]:
X_train_gpu, y_train_gpu = cp.array(X_train.values), cp.array(y_train.values)

In [8]:
X_train_gpu

array([[0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 1.],
       ...,
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 1., 0., 1.]])

In [9]:
parameters = {
    'learning_rate': [0.1], 
    #'max_depth': [3, 5, 7],
    #'colsample_bytree': [0.6, 0.8, 1.0],
    #'n_estimators': [50, 100, 150]
}

xgb_clf = XGBClassifier(objective='multi:softmax', 
                             num_class=13, 
                             random_state=42, 
                             device = "cuda")

clf = GridSearchCV(xgb_clf, parameters, scoring='accuracy', n_jobs=-1)

clf.fit(X_train_gpu, y_train_gpu)

TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly.

In [51]:
clf.predict(X_train)

array([2, 2, 8, ..., 2, 1, 5])

In [10]:
# Convert dataframes to Cupy arrays
X_train_gpu = cp.array(X_train.values)
y_train_gpu = cp.array(y_train.values)

# Initialize XGBoost regressor
clf = XGBClassifier(objective='multi:softmax', 
                             num_class=13, 
                             random_state=42, 
                             device = "cuda")

# Fit the model
clf.fit(X_train_gpu, y_train_gpu)

In [11]:
# Now you can use predict on GPU data
clf.predict(X_train_gpu)

array([9, 2, 8, ..., 4, 1, 5])

In [13]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score
import os

In [14]:
mlflow_directory = "mlruns"  # Relative path to the mlruns folder
latest_run_id = max(os.listdir(mlflow_directory))

In [17]:
# Load the MLflow model for the latest run
model_path = f"mlruns/{latest_run_id}/xgboost_model"
model_path = "mlruns/0/03506decd126452d9ec8b6280ce463f6/artifacts/xgboost_model"

In [18]:
loaded_model = mlflow.sklearn.load_model(model_path)

In [20]:
max(os.listdir("mlruns/0"))

'meta.yaml'

In [28]:
mlruns_path = "mlruns/0"

# Get a list of directories in mlruns/0
run_directories = [d for d in os.listdir(mlruns_path) if os.path.isdir(os.path.join(mlruns_path, d))]

# Sort directories based on creation time (modification time)
latest_run_directory_id = max(run_directories, key=lambda d: os.path.getmtime(os.path.join(mlruns_path, d)))

In [29]:
latest_run_directory_id

'03506decd126452d9ec8b6280ce463f6'

In [37]:
model_path = f"mlruns/0/{latest_run_directory_id}/artifacts/xgboost_model/"

In [39]:
loaded_model = mlflow.sklearn.load_model(model_path)

In [51]:
sample_input = X_test.iloc[2].to_numpy().reshape(1, -1)
sample_input

array([[0., 0., 0., ..., 1., 0., 1.]])

In [52]:
loaded_model.predict(sample_input)

array([2])

In [1]:
import sys
import os

In [14]:
sys.path.append(r'C:\Users\RaviB\GitHub\FlavorQuasar\calorie_predicter')

In [2]:
os.path.abspath(os.path.join(os.getcwd(), ".."))

'C:\\Users\\RaviB\\GitHub\\FlavorQuasar'

In [3]:
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [4]:
from calorie_predicter.utils import (
    round_up_to_nearest,
    filter_calories,
    sorted_binned_encoding,
    collapsing_to_priority,
    priority_list_dish_type,
    priority_list_meal_type,
    one_hot_encode,
    pre_process_text
)

In [6]:
from nltk.corpus import stopwords

In [9]:
english_stop_words = stopwords.words('english')

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RaviB\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True