In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.colors as pc
import plotly.express as px
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

# Identifier handed to SentenceTransformer; kept in one place so the embedding
# model can be swapped without hunting through the notebook.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Location of the training split, relative to the notebook's working directory.
TRAIN_JSON_PATH = "../data/C2/train.json"

# Load the training records; later cells read the `cuisine` and `ingredients`
# columns of this frame.
train_df = pd.read_json(TRAIN_JSON_PATH)
train_df

In [None]:
# Print a structural summary of the training frame (columns, dtypes,
# non-null counts, memory usage) to spot missing values early.
train_df.info()

In [None]:
def plot_counting_graph(column, counted_value):
    """Display a bar chart of how often each distinct value occurs in ``column``.

    Args:
        column: Object exposing ``value_counts()`` (the notebook passes pandas
            Series); its distinct values become the bars.
        counted_value: Human-readable name of the thing being counted. It is
            used as the x-axis / colour column label and as the prefix of the
            chart title (``"<counted_value> Distribution"``).

    Returns:
        None. The figure is rendered through ``show()`` as a side effect.
    """
    # value_counts() + reset_index() yields exactly two columns (value,
    # frequency) whose default names vary; relabel them to known names so the
    # chart options below can refer to them.
    tally = column.value_counts().reset_index().set_axis([counted_value, 'Count'], axis=1)

    bar_options = dict(
        x=counted_value,
        y='Count',
        color=counted_value,
        color_discrete_sequence=pc.qualitative.Prism,
        title=f'{counted_value} Distribution',
    )
    px.bar(tally, **bar_options).show()

In [None]:
# How many recipes does each cuisine contribute to the training data?
plot_counting_graph(column=train_df["cuisine"], counted_value="Cuisine")

In [None]:
# Summary statistics of the number of ingredients listed per recipe.
train_df["ingredients"].map(len).describe()

In [None]:
# Distribution of recipe sizes: one bar per distinct ingredient count.
plot_counting_graph(
    column=train_df["ingredients"].map(len),
    counted_value="Ingredient Amount",
)

In [None]:
# Preview (at most 20 rows) the recipes that list only one, two or three
# ingredients.
short_recipe_mask = train_df["ingredients"].map(len).isin([1, 2, 3])
filtered_df = train_df.loc[short_recipe_mask].head(20)
filtered_df

In [None]:
# Draw four recipes with a fixed seed so the sample is the same on every run,
# then turn each row into a plain dict for the embedding demo below.
sampled_rows = train_df.sample(n=4, random_state=1)
sampled_values = sampled_rows.to_dict(orient="records")
sampled_values

In [None]:
# Turn each sampled recipe into two texts: its cuisine label and its
# ingredient list flattened into a single space-separated string.
cuisines = [recipe["cuisine"] for recipe in sampled_values]
ingredients = [" ".join(recipe["ingredients"]) for recipe in sampled_values]

# Embed both groups and stack them cuisines-first, so that one PCA fit projects
# labels and ingredient strings into the same 2-D space.
embeddings = [*model.encode(cuisines), *model.encode(ingredients)]

pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# Undo the stacking: the first len(cuisines) rows belong to the cuisine labels,
# the remaining rows to the ingredient strings (same recipe order).
cuisine_points, ingredient_points = (
    reduced_embeddings[:len(cuisines)],
    reduced_embeddings[len(cuisines):],
)

fig = go.Figure()

# Cuisine labels: blue circles, captioned above the marker.
fig.add_scatter(
    x=cuisine_points[:, 0],
    y=cuisine_points[:, 1],
    mode='markers+text',
    marker={'size': 15, 'color': 'blue', 'symbol': 'circle'},
    text=cuisines,
    textposition='top center',
    name='Cuisines',
)

# Ingredient strings: red diamonds, captioned below the marker with the
# cuisine of the recipe they came from.
fig.add_scatter(
    x=ingredient_points[:, 0],
    y=ingredient_points[:, 1],
    mode='markers+text',
    marker={'size': 12, 'color': 'red', 'symbol': 'diamond'},
    text=[f"Ingredients for {cuisine}" for cuisine in cuisines],
    textposition='bottom center',
    name='Ingredients',
)

fig.update_layout(
    title="Semantic Embedding Visualization",
    xaxis_title="PCA Component 1",
    yaxis_title="PCA Component 2",
    width=700,
    height=500,
)

fig.show()