In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [206]:
# Load the datasets
# Both CSV files are read from the `Datasets` directory, relative to the
# notebook's working directory.
recipes_csv = 'Datasets/recipes.csv'
reviews_csv = 'Datasets/reviews.csv'

recipes_df = pd.read_csv(recipes_csv)
reviews_df = pd.read_csv(reviews_csv)

In [207]:
# Show the dimensions of recipes_df as a (rows, columns) tuple.
recipes_df.shape

(522517, 28)

In [208]:
# Parsing Time Duration Strings
def parse_duration(duration_str):
    """Convert an ISO-8601-style duration string into a whole number of minutes.

    Recognised components are days (``<n>D``), hours (``<n>H``) and minutes
    (``<n>M``), e.g. ``'PT1H30M'`` -> 90 and ``'P1DT2H'`` -> 1560. Components
    that are absent count as zero; any seconds component is ignored.

    Parameters
    ----------
    duration_str : str or missing value
        Duration text such as ``'PT45M'``. Missing values (anything for which
        ``pd.isnull`` is true) are passed through as ``None``.

    Returns
    -------
    int or None
        Total minutes, or ``None`` when the input is missing.

    Notes
    -----
    The ``M`` designator is always read as minutes; a months component in the
    date part of a duration (e.g. ``'P1M'``) is not distinguished from it.
    """
    if pd.isnull(duration_str):
        return None

    # (pattern, minutes per unit). Days were previously dropped, so a value
    # like 'P1DT2H' was reported as 120 minutes instead of 1560.
    minutes_per_unit = (
        (r'(\d+)D', 24 * 60),
        (r'(\d+)H', 60),
        (r'(\d+)M', 1),
    )

    total_minutes = 0
    for pattern, factor in minutes_per_unit:
        match = re.search(pattern, duration_str)
        if match:
            total_minutes += int(match.group(1)) * factor

    return total_minutes

In [209]:
def parsing_values(values_str):
    """Flatten an R-style ``c("a", "b")`` vector literal into plain text.

    The ``c(`` vector opener is removed, then every character that is not a
    word character, whitespace, a comma or ``&`` is stripped. That single
    clean-up already removes parentheses, brackets and quotes.

    Parameters
    ----------
    values_str : str or missing value
        Raw cell content, e.g. ``'c("Dessert", "Low Protein")'``.

    Returns
    -------
    str or missing value
        The cleaned text (``'Dessert, Low Protein'`` for the example above).
        Missing values are returned unchanged.
    """
    if pd.notna(values_str):
        # Only drop a 'c(' that is not the tail of a longer word. A blanket
        # str.replace('c(', '(') also turned 'garlic(minced)' into
        # 'garli(minced)', silently corrupting the text.
        values_str = re.sub(r'(?<!\w)c\(', '', values_str)
        # Remove non-alphanumeric characters except commas, spaces, and '&'
        values_str = re.sub(r'[^\w\s,&]', '', values_str)

    return values_str

In [210]:
def parsing_imageURL(values_imageURL):
    """Return the first http(s) URL found in an image cell, or ``pd.NA``.

    Parameters
    ----------
    values_imageURL : str or missing value
        Raw cell content: a single quoted URL, a ``c("url1", "url2", ...)``
        vector literal, or the literal ``'character(0)'`` for "no images".

    Returns
    -------
    str or pandas.NA
        The first URL with any trailing ``"`` or ``,`` removed. ``pd.NA`` is
        returned when the value is missing, equals ``'character(0)'``, or
        contains no URL at all.
    """
    if pd.isna(values_imageURL) or values_imageURL.strip() == 'character(0)':
        return pd.NA

    urls = re.findall(r'https?://[^\s"]+', values_imageURL)
    if not urls:
        # Previously this case indexed an empty list and raised IndexError.
        return pd.NA

    return urls[0].rstrip('",')

In [211]:
# Map each raw duration column to the minute-valued column derived from it.
duration_columns = {
    'CookTime': 'CookTimeMin',
    'PrepTime': 'PrepTimeMin',
    'TotalTime': 'TotalTimeMin',
}
for raw_column, minutes_column in duration_columns.items():
    recipes_df[minutes_column] = recipes_df[raw_column].apply(parse_duration)

# Reduce the image cell to a single URL and clean the text-list columns in place.
recipes_df['Images'] = recipes_df['Images'].apply(parsing_imageURL)
for text_column in ('Keywords', 'RecipeInstructions'):
    recipes_df[text_column] = recipes_df[text_column].apply(parsing_values)

# The raw duration strings are no longer needed once the minute columns exist.
recipes_df.drop(columns=list(duration_columns), inplace=True)

In [212]:
# Get unique values of 'Images' column and convert to list.
# Missing entries are dropped first so the output file contains only URLs,
# not a '<NA>' placeholder line.
unique_images_list = recipes_df['Images'].dropna().unique().tolist()

# Specify the file path where you want to save the text file
file_path = "unique_images.txt"

# Write the list to the text file, one URL per line. The encoding is explicit
# so the result does not depend on the platform default.
with open(file_path, 'w', encoding='utf-8') as file:
    file.writelines(f"{item}\n" for item in unique_images_list)


In [213]:
# Impute missing 'CookTimeMin' values with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected Series: that chained form is deprecated
# and leaves recipes_df unchanged when pandas Copy-on-Write is active.
cook_time_median = recipes_df['CookTimeMin'].median()
recipes_df['CookTimeMin'] = recipes_df['CookTimeMin'].fillna(cook_time_median)

# Drop NA for 'Description' and 'Keywords' columns
recipes_df.dropna(subset=['Description', 'Keywords'], inplace=True)

# Assign 'Other' to missing 'RecipeCategory'
recipes_df['RecipeCategory'] = recipes_df['RecipeCategory'].fillna('Other')

# Drop rows with a missing 'RecipeIngredientQuantities' value
# (rows with missing 'Images' are kept; see the disabled step below).
recipes_df.dropna(subset=['RecipeIngredientQuantities'], inplace=True)

# --- Disabled steps, kept for reference -------------------------------------
# recipes_df.dropna(subset=['Images'], inplace=True)


# recipes_df['AggregatedRating'].fillna(recipes_df['AggregatedRating'].median(), inplace=True)
# recipes_df['ReviewCount'].fillna(0, inplace=True)

# # 'RecipeServings' and 'RecipeYield' set to median and placeholder value
# recipes_df['RecipeServings'].fillna(recipes_df['RecipeServings'].median(), inplace=True)
# recipes_df['RecipeYield'].fillna('Varies', inplace=True)

# # Reviews Dataset Handling

# # Drop NA reviews
# reviews_df.dropna(subset=['Review'], inplace=True)

In [107]:
# Keep a single row per recipe: the first occurrence of each RecipeId wins.
recipes_df.drop_duplicates(subset='RecipeId', keep='first', inplace=True)

In [106]:
# NOTE(review): `unique_rows_df_new` is not defined in any visible cell, so
# this line raised NameError on a fresh kernel. It is rebuilt here as the
# RecipeId-deduplicated view of recipes_df, which is presumably what the
# removed cell produced. Confirm before relying on it.
unique_rows_df_new = recipes_df.drop_duplicates(subset=['RecipeId'], keep='first')
unique_rows_df_new.shape

(522517, 29)

In [108]:
# Show the dimensions of recipes_df as a (rows, columns) tuple.
recipes_df.shape


(522517, 29)

In [65]:
# Raw health score: protein minus fat, expressed relative to calories.
# Rows with zero or missing calories yield inf/NaN at this stage.
protein_minus_fat = recipes_df['ProteinContent'].sub(recipes_df['FatContent'])
recipes_df['HealthScore'] = protein_minus_fat.div(recipes_df['Calories'])

In [71]:
# Requires recipes_df to already carry the raw 'HealthScore' column.
# Step 1: sanitise the score. Infinities become NaN, NaN becomes 0, and
# negative scores are floored at 0.
health_score = (
    recipes_df['HealthScore']
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .clip(lower=0)
)

# Step 2: min-max normalisation.
min_score = health_score.min()
max_score = health_score.max()
scaled_health_score = (health_score - min_score) / (max_score - min_score)

# Step 3: split the normalised range at its midpoint into two labelled bins.
bins = [0, 0.5, 1]  # bin edges
labels = ['unhealthy', 'healthy']  # one label per bin
recipes_df['HealthCategory'] = pd.cut(
    scaled_health_score,
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# Only the category is kept on the frame; the raw score column is removed.
recipes_df.drop(columns=['HealthScore'], inplace=True)

# Show the resulting category column.
print(recipes_df['HealthCategory'])


0         unhealthy
1         unhealthy
2         unhealthy
3         unhealthy
4         unhealthy
            ...    
522512    unhealthy
522513    unhealthy
522514    unhealthy
522515    unhealthy
522516    unhealthy
Name: HealthCategory, Length: 522517, dtype: category
Categories (2, object): ['unhealthy' < 'healthy']


In [72]:
# Distinct health categories present in the data, in order of first appearance.
list(recipes_df['HealthCategory'].unique())

['unhealthy', 'healthy']

In [None]:
# Select relevant columns
# NOTE(review): 'DifficultyLevel' and 'Rating' are not created in any visible
# cell. Confirm they exist on recipes_df before running this selection.
relevant_columns = ['Name', 'Description', 'Keywords',
                    'RecipeInstructions', 'DifficultyLevel', 'HealthCategory', 'Images', 'Rating']
# .copy() makes the subset an independent frame, so adding 'text_data' below
# does not write into a slice of the original.
recipes_df = recipes_df[relevant_columns].copy()


def _format_recipe_text(row):
    """Render one recipe row as tagged, newline-separated text.

    Only columns listed in ``relevant_columns`` are read. The earlier template
    looked up 'HealthScore' and 'FormattedImages', neither of which survives
    the column selection above, so it raised KeyError.
    """
    return (
        f"<name> {row['Name']}\n"
        f"<description> {row['Description']}\n"
        f"<keywords> {row['Keywords']}\n"
        f"<instructions> {row['RecipeInstructions']}\n"
        f"<difficulty> {row['DifficultyLevel']}\n"
        f"<health> {row['HealthCategory']}\n"
        f"<rating> {row['Rating']}\n"
        f"<images> {row['Images']}"
    )


# Main text data to be fed into transformer model
recipes_df['text_data'] = recipes_df.apply(_format_recipe_text, axis=1)
