In [3]:
# test_my_module.py
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))  # Add the parent directory to the Python path
#sys.path.insert(0, basedir)
from data_processing import get_target_variable, preprocess_dish_type, preprocess_meal_type, get_training_testing_data, pre_process_text
from utils import one_hot_encode
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
def sample_data(df, target='calories', sample_size=3, to_csv=False,
                bin_width=100, output_path='sample_recipes.csv'):
    """
    Keep the first few rows of every value bin of a numeric column.

    Each value in `target` is floored to a multiple of `bin_width`
    (with the default width: 0-99 -> 0, 100-199 -> 100, ...). For every
    resulting bin the first `sample_size` rows are kept, in their original
    order and with their original index. The selection uses `head`, so it is
    deterministic rather than random.

    Parameters:
        df (DataFrame): Input DataFrame containing recipe data. It is not modified.
        target (str): Name of the column whose values are binned. Default is 'calories'.
        sample_size (int): Maximum number of rows to keep per bin. Default is 3.
        to_csv (bool): If True, write the result to `output_path` (without the
            index) and return None. If False, return the result.
        bin_width (int or float): Width of each bin; must be positive. Default is 100.
        output_path (str): Destination used when `to_csv` is True.
            Default is 'sample_recipes.csv'.
    Returns:
        DataFrame or None: The selected rows (same columns as `df`) if `to_csv`
        is False, None otherwise.
    Raises:
        ValueError: If `bin_width` is not a positive number.
    """
    if not bin_width > 0:
        raise ValueError("bin_width must be a positive number")

    # Vectorized flooring; no per-row Python lambda needed.
    bin_floor = df[target] // bin_width * bin_width

    # Group by the derived Series directly instead of attaching a temporary
    # column to the frame: this cannot collide with (or drop) a column that
    # already exists in `df`, and the result keeps exactly the input columns.
    sample_df = df.groupby(bin_floor, group_keys=False).head(sample_size)

    if to_csv:
        sample_df.to_csv(output_path, index=False)
        return None
    return sample_df

# Load the full recipe table from two directories above the notebook's working directory.
raw_df = pd.read_csv('../../recipes.csv')
# With to_csv=True the sample is written to 'sample_recipes.csv' and the call
# returns None, so this cell produces no output.
sample_data(raw_df, to_csv=True)

In [13]:
# Reload the sample CSV written by sample_data(..., to_csv=True) so the rest of
# the notebook works on the small file rather than the full recipe table.
df = pd.read_csv('sample_recipes.csv')
# Recorded output: (401, 22) -> 401 rows, 22 columns.
df.shape

(401, 22)

In [14]:
# get_target_variable is imported from the project's data_processing module; its
# body is not visible in this notebook. Per the recorded shapes the result has
# 36 fewer rows and one more column than the input (401x22 -> 365x23).
# presumably the added column is 'binnedCalories', which later cells read — TODO confirm
pre_proc_df = get_target_variable(df)
pre_proc_df.shape

(365, 23)

In [15]:
# preprocess_dish_type (data_processing) returns three values; only the first,
# the processed frame, is kept here and the other two are discarded.
# Per the recorded shapes: 6 fewer rows and two more columns (365x23 -> 359x25).
# presumably one of the added columns is 'dishTypeSkewedLabels', used later — TODO confirm
pre_proc_df, _, _ = preprocess_dish_type(pre_proc_df)
pre_proc_df.shape

(359, 25)

In [16]:
# preprocess_meal_type (data_processing) — per the recorded shapes it adds one
# column and keeps the row count (359x25 -> 359x26).
# presumably the new column is 'mealTypeRefined', which the next cell encodes — TODO confirm
pre_proc_df = preprocess_meal_type(pre_proc_df)
pre_proc_df.shape

(359, 26)

In [17]:
# one_hot_encode (utils) returns the encoded columns as a frame plus the fitted
# encoder object; the encoder is kept in `onehot_encoder`.
onehot_encoded_df, onehot_encoder = one_hot_encode(pre_proc_df, 'mealTypeRefined')
# Column-wise concat aligns the two frames on their index. The recorded shape
# (359, 29) shows the row count unchanged and three columns added.
# NOTE(review): earlier steps dropped rows; this relies on onehot_encoded_df
# carrying the same index as pre_proc_df — confirm against one_hot_encode.
# NOTE(review): pre_proc_df is rebuilt from itself, so re-running only this cell
# would append the encoded columns a second time.
pre_proc_df = pd.concat([pre_proc_df, onehot_encoded_df], axis=1)
pre_proc_df.shape

(359, 29)

In [18]:
# Text-cleaning resources from NLTK, handed to the project's pre_process_text
# helper, which is applied to the 'label' column of the working frame.
lemmatizer = WordNetLemmatizer()
english_stop_words = stopwords.words('english')

pre_proc_df = pre_process_text(
    df=pre_proc_df,
    column='label',
    stop_words=english_stop_words,
    lemmatizer=lemmatizer,
    tokenizer=word_tokenize,
)

In [19]:
# Shape check after text preprocessing: the recorded output (359, 29) equals the
# shape recorded before pre_process_text was applied.
pre_proc_df.shape

(359, 29)

In [20]:
# Cast the dish-type label column to int in place (same column name).
pre_proc_df['dishTypeSkewedLabels'] = pre_proc_df['dishTypeSkewedLabels'].astype(int)
# Store an int copy of 'binnedCalories' under a new name; 'binnedCalories'
# itself is left as it was.
# NOTE(review): no visible cell reads 'calorieLabels' (the split below uses
# y_col = 'binnedCalories') — confirm which column is the intended target.
pre_proc_df['calorieLabels'] = pre_proc_df['binnedCalories'].astype(int)

In [21]:
# Feature columns: the three one-hot meal-type columns, the 'label' text column
# and the integer dish-type label.
X_cols = ['mealTypeRefined_breakfast', 'mealTypeRefined_lunch/dinner', 'mealTypeRefined_snack', 'label', 'dishTypeSkewedLabels']
# NOTE(review): the target is the original 'binnedCalories' column, not the
# int-cast 'calorieLabels' created in the previous cell — confirm this is intended.
y_col = 'binnedCalories'
# get_training_testing_data (data_processing) returns five values; the fifth is
# discarded here.
# presumably the fifth value is a fitted text vectorizer: the recorded X shapes
# have 463 columns although only five input columns are selected — TODO confirm
X_train, X_test, y_train, y_test, _ = get_training_testing_data(pre_proc_df, X_cols, y_col)

In [22]:
# Report the shapes of the four split parts, one per line, in the order
# X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(287, 463)
(72, 463)
(287,)
(72,)


In [25]:
import predict

In [28]:
# Call the project's predict.post_process with the value 3. The recorded output
# is a dict mapping the integers 0..3 to calorie-range strings of width 300
# ('0-299' ... '900-1199').
# presumably the argument is the highest bin label to include — TODO confirm against predict.py
predict.post_process(3)

{0: '0-299', 1: '300-599', 2: '600-899', 3: '900-1199'}