In [None]:
import plotly.graph_objects as go
import ast
import re
import numpy as np
import json
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix, classification_report

from sentence_transformers import SentenceTransformer, util
# Shared sentence-embedding model; later cells call model.encode(...) on cuisine names and ingredient text.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
def plot_confusion_matrix(cf_matrix_df):
    """Render a confusion-matrix DataFrame as an interactive Plotly heatmap.

    Args:
        cf_matrix_df: DataFrame whose index supplies the y-axis ("True Label")
            entries, whose columns supply the x-axis ("Predicted Label")
            entries and whose cells are the values to colour.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    cell_values = cf_matrix_df.values

    heatmap = go.Heatmap(
        z=cell_values,
        x=cf_matrix_df.columns,
        y=cf_matrix_df.index,
        colorscale='Viridis',
        colorbar={'title': 'Count'},
        zmin=0,
        # The top of the colour scale follows the largest cell in the matrix.
        zmax=cell_values.max(),
        hoverongaps=False,
    )

    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title='Confusion Matrix',
        xaxis={'title': 'Predicted Label'},
        # Reversed y axis puts the first row at the top, like a printed matrix.
        yaxis={'title': 'True Label', 'autorange': 'reversed'},
        width=800,
        height=800,
    )
    fig.show()

## Cosine Similarity

In [None]:
""" train_df = pd.read_json("../data/C2/train.json")
train_df

cuisine_embeddings = {}
for cuisine in train_df["cuisine"].unique().tolist():
    cuisine_embeddings[cuisine] = list(model.encode(f"{cuisine} cuisine")) """

In [None]:
def find_best_matching_cuisine(cuisine_embeddings, ingredient_list):
    """Pick the cuisine whose embedding is closest to a recipe's ingredients.

    Args:
        cuisine_embeddings: Mapping of cuisine name to its embedding.
        ingredient_list: Iterable of ingredient strings for one recipe.

    Returns:
        The key of ``cuisine_embeddings`` with the highest cosine similarity
        to the embedded ingredient prompt.
    """
    prompt = "A cuisine with dishes that uses ingredients such as: " + " ".join(ingredient_list)
    query_embedding = model.encode(prompt)

    # Score every cuisine against the same query embedding.
    scores = {
        name: util.cos_sim(query_embedding, cuisine_vector)[0]
        for name, cuisine_vector in cuisine_embeddings.items()
    }

    return max(scores, key=lambda name: scores[name])

def make_predictions_df(df, cuisine_embeddings):
    """Add a "predicted_cuisine" column by embedding-matching every recipe.

    Args:
        df: DataFrame with an "ingredients" column holding one list of
            ingredient strings per row.
        cuisine_embeddings: Mapping of cuisine name to its embedding, passed
            through to ``find_best_matching_cuisine``.

    Returns:
        A new DataFrame built from the rows of ``df`` with an extra
        "predicted_cuisine" column.
    """
    records = df.to_dict(orient="records")

    # tqdm shows progress while each recipe is matched in turn.
    for record in tqdm(records):
        record["predicted_cuisine"] = find_best_matching_cuisine(cuisine_embeddings, record["ingredients"])

    return pd.DataFrame(records)

In [None]:
# One-off generation of the cached predictions, kept for provenance:
#predictions_df = make_predictions_df(train_df, cuisine_embeddings)
#predictions_df.to_csv("../data/C2/cos_similarity_predictions.csv", index=False)
predictions_df = pd.read_csv("../data/C2/cos_similarity_predictions.csv")
# Ingredient lists come back from the CSV as strings; parse those back into lists
# and leave any non-string value as it is.
predictions_df["ingredients"] = pd.Series(
    [
        ast.literal_eval(raw) if isinstance(raw, str) else raw
        for raw in predictions_df["ingredients"]
    ],
    index=predictions_df.index,
)

In [None]:
# How often each cuisine was predicted by the cosine-similarity matcher.
predictions_df["predicted_cuisine"].value_counts()

In [None]:
# Do not pass target_names here: the labels are already cuisine strings, so the
# report names its rows from the (sorted) labels themselves. Supplying
# `unique()` (first-appearance order) as target_names would print each row's
# metrics under the wrong cuisine name.
print(classification_report(predictions_df['cuisine'], predictions_df['predicted_cuisine']))

In [None]:
# Row-normalised confusion matrix for the cosine-similarity predictions; rows and
# columns follow the order in which cuisines first appear in the data.
cuisine_labels = predictions_df['cuisine'].unique()
cos_similarity_cm = confusion_matrix(
    predictions_df['cuisine'],
    predictions_df['predicted_cuisine'],
    labels=cuisine_labels,
    normalize='true',
)

axis_names = [str(label) for label in cuisine_labels]
cm_df = pd.DataFrame(cos_similarity_cm, index=axis_names, columns=axis_names)

plot_confusion_matrix(cm_df)

### Cajun Creole Arc

In [None]:
#predictions_df.to_csv("../data/C2/cajun_creole_prediction.csv")
predictions_df = pd.read_csv("../data/C2/cajun_creole_prediction.csv")
# read_csv returns the ingredient lists as plain strings; parse them back into
# lists so that later " ".join(...) calls join ingredients rather than characters.
predictions_df['ingredients'] = predictions_df['ingredients'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [None]:
# Inspect the reloaded predictions table.
predictions_df

In [None]:
def plot_embedding_points(sampled_values):
    """Plot cuisine-name and ingredient-prompt embeddings on one 2-D PCA plane.

    Args:
        sampled_values: List of records, each with a "cuisine" string and an
            "ingredients" iterable of strings.

    Returns:
        None. The scatter plot is displayed with ``fig.show()``.
    """
    prompt_prefix = "A cuisine with dishes that uses ingredients such as: "
    cuisines = [record["cuisine"] for record in sampled_values]
    ingredient_prompts = [
        prompt_prefix + " ".join(record["ingredients"]) for record in sampled_values
    ]

    # Cuisine vectors first, ingredient vectors second: a single PCA fit covers
    # both groups and the projection is split back by position afterwards.
    stacked = list(model.encode(cuisines)) + list(model.encode(ingredient_prompts))
    projected = PCA(n_components=2).fit_transform(stacked)

    n_cuisines = len(cuisines)
    cuisine_xy = projected[:n_cuisines]
    ingredient_xy = projected[n_cuisines:]

    cuisine_trace = go.Scatter(
        x=cuisine_xy[:, 0],
        y=cuisine_xy[:, 1],
        mode='markers+text',
        marker={'size': 15, 'color': 'blue', 'symbol': 'circle'},
        text=cuisines,
        textposition='top center',
        name='Cuisines',
    )
    ingredient_trace = go.Scatter(
        x=ingredient_xy[:, 0],
        y=ingredient_xy[:, 1],
        mode='markers+text',
        marker={'size': 12, 'color': 'red', 'symbol': 'diamond'},
        # Each ingredient point is labelled with the cuisine of its own record.
        text=[f"Ingredients for {cuisine}" for cuisine in cuisines],
        textposition='bottom center',
        name='Ingredients',
    )

    fig = go.Figure()
    fig.add_trace(cuisine_trace)
    fig.add_trace(ingredient_trace)
    fig.update_layout(
        title="Semantic Embedding Visualization",
        xaxis_title="PCA Component 1",
        yaxis_title="PCA Component 2",
        width=700,
        height=500,
    )
    fig.show()

In [None]:
# Sample: the first 10 rows plus the first 3 rows labelled cajun_creole.
creoles = predictions_df.query("cuisine == 'cajun_creole'").head(3).to_dict(orient="records")
sampled_values = predictions_df.head(10).to_dict(orient="records")

sampled_values.extend(creoles)

# For every sampled recipe, compare how close its ingredient prompt is to its own
# cuisine name versus the "cajun_creole" name.
cajun_creole_embedding = model.encode("cajun_creole")
for value in sampled_values:
    cuisine_embedding = model.encode(value["cuisine"])
    ingredients_embedding = model.encode("A cuisine with dishes that uses ingredients such as: " + " ".join(value["ingredients"]))
    own_similarity = util.cos_sim(cuisine_embedding, ingredients_embedding)
    creole_similarity = util.cos_sim(cajun_creole_embedding, ingredients_embedding)
    # Single quotes inside the f-string: reusing the enclosing double quotes is a
    # SyntaxError on Python versions before 3.12.
    print(f"Cosine Similarity for {value['cuisine']}: {own_similarity} vs for cajun_creole: {creole_similarity}")

plot_embedding_points(sampled_values)

In [None]:
# Distribution of predicted cuisines in the reloaded cajun_creole prediction file.
predictions_df["predicted_cuisine"].value_counts()

## Training-Test Models

In [None]:
""" rows = pd.read_json("../data/C2/train.json").to_dict(orient="records")
for row in tqdm(rows):
    embedding = model.encode(" ".join(row["ingredients"]))
    row["embedded_ingredients"] = json.dumps(embedding.tolist())

pd.DataFrame(rows).to_csv("../data/C2/train_with_embeddings.csv")

rows = pd.read_json("../data/C2/test.json").to_dict(orient="records")
for row in tqdm(rows):
    embedding = model.encode(" ".join(row["ingredients"]))
    row["embedded_ingredients"] = json.dumps(embedding.tolist())

pd.DataFrame(rows).to_csv("../data/C2/test_with_embeddings.csv") """

In [None]:
# Load the cached training embeddings; each vector is stored as a JSON string in
# the CSV, so decode the column back into Python lists.
train_df_with_embeddings = (
    pd.read_csv("../data/C2/train_with_embeddings.csv")
    .assign(embedded_ingredients=lambda frame: frame["embedded_ingredients"].map(json.loads))
)

In [None]:
def evaluate_predictions(df, model=1, random_state=None):
    """Train one classifier on the ingredient embeddings and score it on a hold-out split.

    The data is split 80/20 (stratified by cuisine), the training part is
    oversampled with SMOTE, the selected classifier is fitted, and its
    predictions on the hold-out part are summarised.

    Args:
        df: DataFrame with an "embedded_ingredients" column (one vector per
            row) and a "cuisine" column holding the target labels.
        model: Classifier selector: 1 = logistic regression, 2 = random
            forest, 3 = XGBoost, any other value = LightGBM. Inside this
            function the name shadows the module-level sentence-embedding
            ``model``.
        random_state: Seed for the train/test split and for the random
            forest, XGBoost and LightGBM estimators. SMOTE always uses the
            fixed seed 42.

    Returns:
        Tuple ``(cf_matrix_df, report)``: the row-normalised confusion matrix
        as a DataFrame indexed and labelled by cuisine name, and the text
        produced by ``classification_report``.
    """
    X = np.vstack(df['embedded_ingredients'].values)
    y = df["cuisine"].tolist()

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    class_names = label_encoder.classes_
    # Encoded ids run 0..n_classes-1 in the same order as class_names; passing
    # them explicitly keeps matrix rows and report names aligned even if a class
    # is absent from the hold-out predictions.
    class_ids = np.arange(len(class_names))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=random_state, stratify=y
    )

    # Oversample the training split only; the hold-out split is left untouched.
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

    if model == 1:
        # lbfgs already fits a multinomial model for multi-class targets, so the
        # deprecated `multi_class` argument is not passed.
        clf = LogisticRegression(max_iter=1000, solver='lbfgs')
    elif model == 2:
        clf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=random_state)
    elif model == 3:
        clf = xgb.XGBClassifier(
            objective='multi:softmax',
            num_class=len(class_names),
            eval_metric='mlogloss',
            max_depth=6,
            n_estimators=100,
            random_state=random_state
        )
    else:
        clf = lgb.LGBMClassifier(
            objective='multiclass',
            num_class=len(class_names),
            learning_rate=0.1,
            n_estimators=100,
            max_depth=6,
            random_state=random_state,
            verbose=-1
        )

    clf.fit(X_train_res, y_train_res)

    y_pred = clf.predict(X_test)
    cf_matrix_df = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=class_ids, normalize='true'),
        index=class_names,
        columns=class_names,
    )
    report = classification_report(y_test, y_pred, labels=class_ids, target_names=class_names)

    return cf_matrix_df, report

In [None]:
# Fixed seed makes the hold-out split and the reported metrics reproducible across runs.
logistic_regression_cf, logistic_regression_report = evaluate_predictions(train_df_with_embeddings, model=1, random_state=42)
print(logistic_regression_report)

In [None]:
# Heatmap of the row-normalised confusion matrix for logistic regression.
plot_confusion_matrix(logistic_regression_cf)

In [None]:
# Fixed seed makes the hold-out split, the forest and the reported metrics reproducible across runs.
random_forest_cf, random_forest_report = evaluate_predictions(train_df_with_embeddings, model=2, random_state=42)
print(random_forest_report)

In [None]:
# Heatmap of the row-normalised confusion matrix for the random forest.
plot_confusion_matrix(random_forest_cf)

In [None]:
# Fixed seed makes the hold-out split, the booster and the reported metrics reproducible across runs.
xgboost_cf, xgboost_report = evaluate_predictions(train_df_with_embeddings, model=3, random_state=42)
print(xgboost_report)

In [None]:
# Heatmap of the row-normalised confusion matrix for XGBoost.
plot_confusion_matrix(xgboost_cf)

In [None]:
# Fixed seed makes the hold-out split, the booster and the reported metrics reproducible across runs.
lightgbm_cf, lightgbm_report = evaluate_predictions(train_df_with_embeddings, model=4, random_state=42)
print(lightgbm_report)

In [None]:
# Heatmap of the row-normalised confusion matrix for LightGBM.
plot_confusion_matrix(lightgbm_cf)

## TF-IDF

In [None]:
# Reload the raw recipes and collect every distinct ingredient string.
df = pd.read_json("../data/C2/train.json")
ingredients_set = {ingredient for recipe in df["ingredients"] for ingredient in recipe}
len(ingredients_set)

In [None]:
# Spot-check the embedding similarity between ingredient variants and a base name.
for variant, base in (("chopped onion", "onion"), ("onion soup", "onion"), ("whole milk", "milk")):
    print(util.cos_sim(model.encode(variant), model.encode(base)))

In [None]:
""" ingredient_encoding = {}
for ingredient in tqdm(ingredients_set):
    ingredient_encoding[ingredient] = np.array(model.encode(ingredient).tolist()) """

In [None]:
""" ingredient_substitution = {}

ingredients = list(ingredients_set)
for i in tqdm(range(len(ingredients))):

    if ingredients[i] not in ingredients_set:
        continue

    ingredient_substitution[ingredients[i]] = [ingredients[i]]

    for j in range(i+1, len(ingredients)):
        if util.cos_sim(ingredient_encoding[ingredients[i]], ingredient_encoding[ingredients[j]]) > 0.79:
            ingredient_substitution[ingredients[i]].append(ingredients[j])

            ingredients_set.discard(ingredients[j]) """

In [None]:
""" with open("../data/C2/ingredient_sub.json", "w") as file:
    json.dump(ingredient_substitution, file) """

# Load the cached substitution groups (group key -> list of ingredient names).
with open("../data/C2/ingredient_sub.json") as sub_file:
    ingredient_substitution = json.load(sub_file)

In [None]:
# Number of substitution groups loaded from the JSON file.
len(ingredient_substitution)

In [None]:
# Map every ingredient name to the key of the substitution group that lists it.
reverse_map = {v: k for k, vals in ingredient_substitution.items() for v in vals}
df["ingredients"] = df['ingredients'].apply(lambda lst: [reverse_map.get(item, item) for item in lst])
# Join the words of each ingredient with underscores so that one ingredient becomes
# one TF-IDF token. The pattern is `\s+`, not `\s{2,}`: single spaces must be
# replaced too, which is also what classify_ingredients does to its input.
df["ingredients"] = df['ingredients'].apply(lambda lst: [re.sub(r'\s+', '_', token.strip()) for token in lst])

In [None]:
# Hold out 20% of the recipes, stratified by cuisine; the fixed seed keeps the
# split (and every metric derived from it) reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['cuisine'], random_state=42)

# One "document" per cuisine: all ingredient tokens of its training recipes,
# separated by spaces.
cuisine_documents = train_df.groupby('cuisine')['ingredients'].apply(
    lambda lists: ' '.join(token for sublist in lists for token in sublist)
)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cuisine_documents)

# Dense view of the cuisine x term matrix, labelled for inspection.
train_tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=cuisine_documents.index,
    columns=vectorizer.get_feature_names_out()
)

In [None]:
def classify_ingredients(ingredient_list, vectorizer, tfidf_matrix, label_index, ingredient_substitution):
    """Predict the cuisine whose TF-IDF document is most similar to a recipe.

    Args:
        ingredient_list: Iterable of ingredient strings for one recipe.
        vectorizer: Fitted vectorizer; its ``transform`` is applied to the
            recipe document.
        tfidf_matrix: Matrix with one row per cuisine document, compared
            against the recipe vector with cosine similarity.
        label_index: Sequence mapping a row position of ``tfidf_matrix`` to
            its cuisine label.
        ingredient_substitution: Mapping of group key to the list of
            ingredient names that are replaced by that key.

    Returns:
        Tuple ``(best_label, similarity)``: the label of the most similar
        cuisine row and its cosine similarity to the recipe.
    """
    reverse_map = {
        variant: group_key
        for group_key, variants in ingredient_substitution.items()
        for variant in variants
    }

    # Replace each ingredient by its substitution-group key where one exists.
    substituted = [reverse_map.get(item, item) for item in ingredient_list]
    # Build the tokens from the substituted names (not from the raw input, which
    # would discard the substitution) and replace whitespace with underscores.
    cleaned_tokens = [re.sub(r'\s+', '_', token.strip()) for token in substituted]

    doc = ' '.join(cleaned_tokens)
    doc_vector = vectorizer.transform([doc])

    # A single query row, so take row 0 of the similarity matrix.
    similarities = cosine_similarity(doc_vector, tfidf_matrix)[0]

    best_label_idx = similarities.argmax()
    return label_index[best_label_idx], similarities[best_label_idx]

In [None]:
label_index = cuisine_documents.index

# NOTE(review): `correct` and `total` are not used in the visible cells; kept in
# case a later cell reads them.
correct = 0
total = 0

# Classify every held-out recipe against the per-cuisine TF-IDF documents.
rows = test_df.to_dict(orient="records")
for row in rows:
    pred_label, _ = classify_ingredients(row['ingredients'], vectorizer, tfidf_matrix, label_index, ingredient_substitution)
    row['predicted_cuisine'] = pred_label

predictions_df = pd.DataFrame(rows)
# Do not pass target_names here: the labels are already cuisine strings, so the
# report names its rows from the (sorted) labels themselves. Supplying
# `unique()` (first-appearance order) as target_names would print each row's
# metrics under the wrong cuisine name.
print(classification_report(predictions_df['cuisine'], predictions_df['predicted_cuisine']))

In [None]:
# Row-normalised confusion matrix for the TF-IDF predictions; rows and columns
# follow the order in which cuisines first appear in the test records.
predictions_df = pd.DataFrame(rows)
cuisine_labels = predictions_df['cuisine'].unique()
cos_similarity_cm = confusion_matrix(
    predictions_df['cuisine'],
    predictions_df['predicted_cuisine'],
    labels=cuisine_labels,
    normalize='true',
)

axis_names = [str(label) for label in cuisine_labels]
cm_df = pd.DataFrame(cos_similarity_cm, index=axis_names, columns=axis_names)

plot_confusion_matrix(cm_df)