### Import 

In [2]:
import string
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from database import db
from model.dbModels import Recipe

* 'orm_mode' has been renamed to 'from_attributes'


### Load Data 

In [3]:
# Fetch the NLTK resources used by preprocess_text: the stop-word list, the
# tokenizer models used by nltk.word_tokenize, and the WordNet data used by
# WordNetLemmatizer.
nltk.download("stopwords")
nltk.download("punkt")
# Recent NLTK releases load the tokenizer models from "punkt_tab" instead of
# the pickled "punkt" package; fetching both keeps the notebook runnable on
# old and new NLTK versions.
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [37]:
# Maximum number of recipes kept for the rest of the notebook.
SAMPLE_SIZE = 100000
RECIPE_COLUMNS = ["RecipeId", "Name", "Description",
                  "RecipeCategory", "Keywords",
                  "RecipeIngredientParts", "AggregatedRating"]

# Create a connection to PostgreSQL
# (create_engine, sessionmaker, db and Recipe come from the import cell at the
# top of the notebook.)
engine = create_engine(db.DATABASE_URL)
Session = sessionmaker(bind=engine)
session = Session()
try:
    recipes = session.query(Recipe.RecipeId, Recipe.Name, Recipe.Description,
                            Recipe.RecipeCategory, Recipe.Keywords,
                            Recipe.RecipeIngredientParts, Recipe.AggregatedRating).all()
finally:
    # Release the database connection even when the query fails.
    session.close()

df = pd.DataFrame(recipes, columns=RECIPE_COLUMNS)
# DataFrame.sample raises ValueError when n is larger than the number of rows
# (sampling without replacement), so cap n at the table size.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

In [38]:
# Keep only the recipes that have an aggregated rating; rows where that single
# column is missing are discarded.
required_columns = ["AggregatedRating"]
df = df.dropna(axis="index", subset=required_columns)

### Preprocessing function

In [39]:
# Lazily filled cache of the NLTK resources used by preprocess_text, so they
# are built once instead of on every call.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached stop-word set, punctuation set and lemmatizer, building them on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES.update(
            stop_words=frozenset(stopwords.words("english")),
            punctuation=frozenset(string.punctuation),
            lemmatizer=WordNetLemmatizer(),
        )
    return _TEXT_RESOURCES


def preprocess_text(text):
    """Normalise a piece of free text into a space-separated string of lemmas.

    The text is lower-cased, tokenised with ``nltk.word_tokenize``, stripped of
    English stop words and of tokens that are a single punctuation character,
    and each remaining token is lemmatised with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str or missing value
        Raw text. Missing values (anything ``pd.isna`` reports as NA) are
        accepted.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or ``""`` for a missing
        value.
    """
    if pd.isna(text):
        return ""

    # Fetched after the NA check so missing values never touch the NLTK data.
    resources = _get_text_resources()
    stop_words = resources["stop_words"]
    punctuation = resources["punctuation"]
    lemmatizer = resources["lemmatizer"]

    tokens = nltk.word_tokenize(text.lower())

    # `punctuation` holds single characters only, so multi-character
    # punctuation tokens are kept, exactly as before.
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and token not in punctuation
    ]

    return " ".join(kept)


### Apply preprocessing to the columns of interest

In [40]:
# Cleaned column name -> raw text column it is derived from.
cleaned_text_columns = {
    "cleaned_ingredient": "RecipeIngredientParts",
    "cleaned_category": "RecipeCategory",
    "cleaned_description": "Description",
}

# Run every raw text column through preprocess_text, one row at a time.
for cleaned_name, raw_name in cleaned_text_columns.items():
    df[cleaned_name] = df[raw_name].apply(preprocess_text)

### Weighted Features

In [41]:
# One document per recipe: the cleaned ingredients followed by the cleaned
# description, separated by a single space.
recipe_documents = df["cleaned_ingredient"] + " " + df["cleaned_description"]

# Learn the vocabulary and IDF weights from those documents and vectorise them.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(recipe_documents)

In [42]:
# Pairwise cosine similarity between every pair of recipe TF-IDF vectors.
# NOTE(review): with dense_output=True the result is a dense
# (n_recipes x n_recipes) array, and no later cell visible here reads
# `cosine_sim` -- confirm it is still needed before paying for it.
cosine_sim = cosine_similarity(tfidf_matrix, Y=None, dense_output=True)

In [43]:
# Encode each recipe category as an integer id. The cast to str makes every
# value (including missing ones) a plain string before it is encoded.
category_names = df["RecipeCategory"].astype(str)
label_encoder = LabelEncoder()
df["category_encoded"] = label_encoder.fit_transform(category_names)

In [44]:
# Number of whitespace-separated tokens in the raw ingredient text.
# Missing values count as 0 tokens instead of raising AttributeError on
# `.split()`, matching preprocess_text, which also accepts missing text for
# this column.
df["ingredient_length"] = (
    df["RecipeIngredientParts"]
    .fillna("")
    .str.split()
    .str.len()
    .astype(int)
)


In [45]:
# Highest integer relevance grade handed to the ranker (grades run 0..RELEVANCE_LEVELS).
RELEVANCE_LEVELS = 4

# Normalize rating by the largest rating in the sample.
normalized_rating = df["AggregatedRating"] / df["AggregatedRating"].max()

# Scale before rounding. Rounding the normalized value directly can only yield
# 0 or 1, which throws away the graded relevance that LambdaRank / NDCG use.
df["relevance"] = (normalized_rating * RELEVANCE_LEVELS).round().astype(int)  # Integer labels

# Model inputs (X) and ranking target (y).
features = ["category_encoded", "ingredient_length"]
X = df[features]
y = df["relevance"]

In [46]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

#### Ranking Model and Recommendation Function

In [47]:
# Settings for the LightGBM ranker: LambdaRank objective evaluated with NDCG,
# gradient-boosted trees, and gain-based feature importances.
ranker_params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "boosting_type": "gbdt",
    "n_estimators": 100,
    "learning_rate": 0.1,
    "importance_type": "gain",
}
ltr_model = lgb.LGBMRanker(**ranker_params)

In [48]:
# Group sizes
# The ranker expects one size per query group, and the sizes must add up to
# the number of training rows. Rows are cut into consecutive chunks of
# `group_size`, with one shorter trailing chunk when the row count is not a
# multiple of it.
# NOTE(review): these chunks are positional slices of the shuffled training
# rows, not real query groups -- confirm this is the intended grouping.
group_size = 100
full_groups, remainder = divmod(len(X_train), group_size)
group_sizes = [group_size] * full_groups
if remainder:
    group_sizes.append(remainder)
num_groups = len(group_sizes)

# An empty training set now yields no groups instead of one bogus group of 100.
assert sum(group_sizes) == len(X_train), "group sizes must cover every training row"

In [49]:
# Train the ranker on the training rows; `group` tells LightGBM how many
# consecutive rows belong to each query group.
ltr_model.fit(X=X_train, y=y_train, group=group_sizes)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000882 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 278
[LightGBM] [Info] Number of data points in the train set: 41375, number of used features: 2


In [50]:
def get_recommendations_ltr(dish_name, df, ltr_model, top_n=10, feature_columns=None):
    """Return the names of the top-ranked recipes other than ``dish_name``.

    The first row whose ``Name`` equals ``dish_name`` exactly is removed, every
    remaining row is scored with ``ltr_model.predict`` and the ``top_n``
    highest-scoring names are returned.

    NOTE(review): the score is computed from each candidate's own feature
    columns only, so apart from the excluded row the ranking does not depend
    on which dish was queried.

    Parameters
    ----------
    dish_name : str
        Exact value to look up in ``df["Name"]``.
    df : pandas.DataFrame
        Recipes; must contain ``"Name"`` and the feature columns.
    ltr_model : object
        Fitted model exposing ``predict(X)``.
    top_n : int, default 10
        Number of names to return.
    feature_columns : list of str, optional
        Columns passed to the model. Defaults to the notebook-level
        ``features`` list.

    Returns
    -------
    list of str or str
        Recommended recipe names, or the string
        ``"Dish not found in dataset"`` when no row matches ``dish_name``.
        ``df`` itself is not modified.
    """
    matches = df.index[df["Name"] == dish_name]
    # Explicit check instead of a blanket `except IndexError`, so an unrelated
    # IndexError raised while scoring is no longer reported as "not found".
    if len(matches) == 0:
        return "Dish not found in dataset"

    if feature_columns is None:
        feature_columns = features  # notebook-level default feature list

    # Exclude the current dish
    candidates = df.drop(matches[0])
    if candidates.empty:
        return []

    # Predict relevance scores on a copy so the caller's frame stays untouched.
    scores = ltr_model.predict(candidates[list(feature_columns)])
    ranked = candidates.assign(rank_score=scores).sort_values(
        by="rank_score", ascending=False
    )

    return ranked["Name"].head(top_n).tolist()

In [51]:
# Example query. The lookup is an exact match on the Name column, so a dish
# that is not present verbatim yields the "not found" message.
query_dish = "Pizza"
recommendations = get_recommendations_ltr(dish_name=query_dish, df=df, ltr_model=ltr_model)
print("Top Recommendations:", recommendations)

Top Recommendations: Dish not found in dataset
