# Rating recipies 

**Name(s)**: Varnika Chandra

**Website Link**: https://varnikachandra.github.io/Recipes_and_Ratings_Analysis/

In [1]:
import pandas as pd
import numpy as np
import ast
import os
import seaborn as sns
os.makedirs("assets", exist_ok=True)
import plotly.express as px


# from lec_utils import * # Feel free to uncomment and use this. It'll make your plotly graphs look like ours in lecture!

## Step 1: Introduction

In [2]:
'''
Some questions that interested me about the dataset include:
What types of recipes tend to have the most calories?
What types of recipes tend to have higher average ratings?
What is the relationship between the cooking time and average rating of recipes?
What is the relationship between nutrition and cooking time?
What is the relationship between nutrition and average ratings of recipes? 

The question I chose to investigate further is: 
What makes a recipe healthy? 

'''
#Load the datasets
recipes_df = pd.read_csv("data/RAW_recipes.csv")
ratings_df = pd.read_csv("data/RAW_interactions.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'data/RAW_recipes.csv'

## Step 2: Data Cleaning and Exploratory Data Analysis

In [None]:
#Rename 'id' to 'recipe_id' to merge
recipes_df = recipes_df.rename(columns={'id': 'recipe_id'})

#Normalize column names to be consistent
def normalize_col_names(df):
    df.columns=(
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(' ', '_')
        .str.replace(r'[^\w\s]', '', regex=True)
    )
    return df
recipes_df = normalize_col_names(recipes_df)
ratings_df = normalize_col_names(ratings_df)

#Drop duplicate reviews if same user reviewed a recipe twice
ratings_df = ratings_df.drop_duplicates(subset=['user_id', 'recipe_id'])

#Remove ratings of 0
ratings_df = ratings_df[ratings_df['rating'] != 0]

#Replace 'minutes'=0 with NaN, then fill with median of reasonable durations (< 48 hours)
reasonable_minutes=recipes_df[recipes_df['minutes']<2880]['minutes']
median_minutes=reasonable_minutes.median()
recipes_df['minutes'] = recipes_df['minutes'].replace(0, np.nan)
recipes_df['minutes'] = recipes_df['minutes'].fillna(median_minutes)
recipes_df=recipes_df[recipes_df['minutes'] <= 2880]

#Fill any missing descriptions with "NA"
recipes_df['description']=recipes_df['description'].fillna("NA")

#Convert 'submitted' column to datetime and extract the year into a new column
recipes_df['submitted']=pd.to_datetime(recipes_df['submitted'], errors='coerce')
recipes_df['year_submitted']=recipes_df['submitted'].dt.year

#Compute average rating and review count per recipe from interactions
rating_summary=ratings_df.groupby('recipe_id').agg(
    avg_rating=('rating', 'mean'),
    num_reviews=('rating', 'count')
).reset_index()

#Convert stringified lists to actual lists
recipes_df['tags']=recipes_df['tags'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])

#Keywords for healthy recipies
health_keywords=['healthy', 'low-fat', 'low sodium', 'anti-inflammatory', 'low calorie', 'gluten-free', 'sugar-free', 'high-protein', 'paleo']

#Function to check if any health keyword is in a recipe's tag list
def has_health_tag(tags):
    return any(kw in tags for kw in health_keywords)
    
#Create a binary column for whether a recipe is "health-tagged"
recipes_df['is_health_tagged']=recipes_df['tags'].apply(has_health_tag).astype(int)

#Convert string-formatted nutrition list into actual Python lists for all the elements
recipes_df['nutrition']=recipes_df['nutrition'].apply(ast.literal_eval)

#Define nutrition column names and expand them into separate columns
nutrition_cols=['calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']
nutrition_df=pd.DataFrame(recipes_df['nutrition'].tolist(), columns=nutrition_cols)

#Merge nutrition columns back with recipes_df and drop original 'nutrition' column
recipes_df=pd.concat([recipes_df.drop(columns=['nutrition']), nutrition_df], axis=1)

#Merge recipe-level rating summary into recipes_df using 'recipe_id'
recipes_df=recipes_df.merge(rating_summary, on='recipe_id', how='left')

#Ensure nutrition columns are numeric and fill any missing values with median
recipes_df[nutrition_cols]=recipes_df[nutrition_cols].apply(pd.to_numeric, errors='coerce')
recipes_df[nutrition_cols]=recipes_df[nutrition_cols].fillna(recipes_df[nutrition_cols].median())

#Create additional numerical features
recipes_df['calories_per_ingredient']=recipes_df['calories']/recipes_df['n_ingredients']

#If the recipe is considered quick
recipes_df['is_quick']=(recipes_df['minutes']<= 30).astype(int)

#Popular recipies
recipes_df['is_popular']=(recipes_df['num_reviews'] >= 100).astype(int)

#Protien to calories ratio
recipes_df['protein_per_calorie']=recipes_df['protein']/recipes_df['calories']

#Sugar to calories ratio
recipes_df['sugar_per_calorie']=recipes_df['sugar']/recipes_df['calories']

recipes_df.head(10).to_html("preview_table.html", index=False)

In [None]:
#Univariate Analysis
healthy_df=recipes_df[recipes_df['is_health_tagged'] == 1]
healthy_df=healthy_df[healthy_df['minutes'] <= 2880]
def cap_percentile(df, column, percentile=95):
    return np.percentile(df[column], percentile)


fig=px.box(recipes_df, y="minutes", title="Box Plot of Recipe Preparation Time")
fig.update_yaxes(range=[0, 180])  # Optional zoom
fig.show()

#Plot: Calories
fig=px.box(healthy_df, y='calories', title='Box Plot of Calories in Health-Tagged Recipes')
fig.update_yaxes(range=[100, 1800])
fig.show()

#zoomed in verion
fig.update_yaxes(range=[100, 1000])
fig.show()

#Plot: Sugar
fig_sugar=px.box(healthy_df, y='sugar',
                         title='Distribution of Sugar (Health-Tagged Recipes)')
fig_sugar.update_yaxes(range=[0, 200])
fig_sugar.show()

#Plot: Protein
protein_cap =cap_percentile(healthy_df, 'protein')
fig_protein = px.histogram(healthy_df, x='protein', nbins=150,
                           title='Distribution of Protein (Health-Tagged Recipes)')
fig_protein.update_traces(marker_line_color='white', marker_line_width=1.2)
fig_protein.update_layout(xaxis_range=[0, protein_cap])
fig_protein.update_layout(
    xaxis_title='Protein (grams)',
    yaxis_title='Count',
    bargap=0.02,
)
fig_protein.show()
fig_protein.write_html("assets/protein_dist.html", include_plotlyjs="cdn")

#Plot: Total Fat

fig_fat=px.histogram(healthy_df, x='total_fat', nbins=200,
                       title='Distribution of Total Fat (Health-Tagged Recipes)')
fig_fat.update_traces(marker_line_color='white', marker_line_width=1.5)
fig_fat.update_xaxes(range=[0, 150])
fig_fat.update_layout(
    xaxis_title='Total Fat',
    yaxis_title='Count',
    bargap=0.05,
)
fig_fat.show()

In [None]:
# Bivariate Analysis
#Calories vs. Minutes (Scatter)
healthy_df=healthy_df[healthy_df['minutes'] <= 2880]
fig = px.scatter(
    recipes_df,
    x='minutes',
    y='calories',
    title='Calories vs. Preparation Time for All Recipes',
    opacity=0.4,
    labels={'minutes': 'Prep Time (minutes)', 'calories': 'Calories'},
    color_discrete_sequence=['#AB63FA']
)
fig.show()

#healthy v. unhealthy by rating
healthy_df=recipes_df[recipes_df['is_health_tagged']==1].copy()
unhealthy_df=recipes_df[recipes_df['is_health_tagged']==0].copy()

# 2. Add a column to indicate the group
healthy_df['group'] = 'Healthy'
unhealthy_df['group'] = 'Unhealthy'

# 3. Combine the data
combined_df=pd.concat([healthy_df, unhealthy_df])

# 4. Plot side-by-side box plots
fig = px.box(
    combined_df,
    x='group',
    y='avg_rating',
    color='group',
    title='Average rating: Healthy vs. Unhealthy',
    labels={'group': 'Recipe Type', 'Average rating': 'Rating per Recipe'},
    color_discrete_map={'Healthy': 'light green', 'Unhealthy': 'pink'}
)

fig.show()
fig.write_html("assets/rating_healthy_v_unhealthy.html", include_plotlyjs="cdn")
#fat healthy v unhealthy 
fig=px.box(
    combined_df,
    x='group',
    y='total_fat',
    color='group',
    title='Fat Content: Healthy vs. Unhealthy',
    labels={'group': 'Recipe Type', 'Fat Content': 'Fat per Recipe'},
    color_discrete_map={'Healthy': 'light green', 'Unhealthy': 'pink'}
)

fig.show()
fig.update_yaxes(range=[0, 500])
fig.show()

In [None]:
#Interesting Aggregates
#What are the most common descriptive tags used for recipes labeled as healthy?
#Most common tags found in healthy tags seen
healthy_tags_df=recipes_df[recipes_df['is_health_tagged'] == 1][['tags']].explode('tags')
#Count the frequency of each tag among healthy recipes
tag_counts=healthy_tags_df.value_counts().reset_index(name='count')
# Get the top 5 most frequent tags
top_healthy_tags = tag_counts.nlargest(5, 'count')
display(top_healthy_tags)

#top ingredients in healthy v unhealthy
healthy_ings = (
    recipes_df[recipes_df['is_health_tagged'] == 1.0]
    .assign(ingredients=lambda df:df['ingredients'].apply(ast.literal_eval))
    .explode('ingredients')['ingredients']
    .value_counts()
    .head(30)
)
unhealthy_ings = (
    recipes_df[recipes_df['is_health_tagged'] == 0.0]
    .assign(ingredients=lambda df: df['ingredients'].apply(ast.literal_eval))
    .explode('ingredients')['ingredients']
    .value_counts()
    .head(30)
)
comparison_df = pd.DataFrame({
    'Healthy': healthy_ings,
    'Unhealthy': unhealthy_ings
})

display(comparison_df)
#average content of different nutritional values in healthy versus unhealthy 
nutrition_cols = ['calories', 'total_fat', 'sugar', 'protein']
recipes_df.groupby('is_health_tagged')[nutrition_cols].mean()
nutrition_cols=['calories', 'total_fat', 'sugar', 'protein']
grouped_nutrition=recipes_df.groupby('is_health_tagged')[nutrition_cols].mean().round(2)
print(grouped_nutrition.reset_index().to_markdown(index=False))

## Step 3: Framing a Prediction Problem

In [None]:
#Predict whether a recipe is labeled as “healthy” based on its ingredients.
'''
I aim to predict a recipe's average user rating using the information about its preperation,
such as the time required (minutes), the number of steps, and the number of ingredients. 
Since the response variable is a numeric score on a continuous scale, this is a regression problem. 
'''


## Step 4: Baseline Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score


features=['calories', 'protein']
target='is_health_tagged'
X = recipes_df[features]
y = recipes_df[target]
valid_idx = y.notna()
X=X[valid_idx]
y=y[valid_idx]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
numerical_features = ['calories','protein']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ]
)

baseline_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000))
])

baseline_pipeline.fit(X_train, y_train)

# 8. Predict and evaluate
y_pred = baseline_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


## Step 5: Final Model

In [272]:
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.ensemble import RandomForestClassifier

features=['calories_per_ingredient', 'protein_per_calorie', 'sugar_per_calorie', 'saturated_fat']
target='is_health_tagged'

#Drop rows with missing data in any feature or label
df=recipes_df.dropna(subset=features + [target])

#Split data
X=df[features]
y=df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing: scale or transform different features
preprocessor = ColumnTransformer(transformers=[
    ('standard', StandardScaler(), ['calories_per_ingredient']),
    ('quantile', QuantileTransformer(), ['sugar_per_calorie']),
    ('pass_through', 'passthrough', ['protein_per_calorie', 'saturated_fat'])
])

# Pipeline with classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Grid search for hyperparameters
param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [5, 10, None]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate
y_pred = grid_search.predict(X_test)
final_model = grid_search.best_estimator_
final_model.fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print("F1 Score:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Best hyperparameters: {'classifier__max_depth': None, 'classifier__n_estimators': 50}
F1 Score: 0.05245734657499363
              precision    recall  f1-score   support

         0.0       0.79      0.98      0.87     13189
         1.0       0.26      0.03      0.05      3524

    accuracy                           0.78     16713
   macro avg       0.52      0.50      0.46     16713
weighted avg       0.68      0.78      0.70     16713

