In [None]:
import pandas as pd
from google.colab import files

# Upload the dataset CSV by hand through Colab's upload helper.
# The result of `files.upload()` is kept in `uploaded`; the loading cell
# reads "thailand_foods.csv" from the working directory, so the uploaded
# file has to be saved under exactly that name.
uploaded = files.upload()



Saving thailand_foods.csv to thailand_foods.csv


In [None]:

# Load the raw dish table; the CSV is expected in the working directory.
df = pd.read_csv("thailand_foods.csv")

# Keep only rows that are usable for training:
#   * rows with a missing ingredient list or dish name are dropped first --
#     NaN != "Unknown" evaluates to True, so without this step they would
#     pass the filter below and fail later on `.split("+")` / label encoding;
#   * rows whose ingredients are the literal placeholder "Unknown" are dropped.
usable_rows = df.dropna(subset=["ingredients", "th_name"])
usable_rows = usable_rows[usable_rows["ingredients"] != "Unknown"]

# Keep only necessary columns (.copy() gives df_cleaned its own data, so
# later column assignments do not write through to `df`).
df_cleaned = usable_rows[["ingredients", "th_name"]].copy()

# Show dataset after cleaning
df_cleaned.head()

Unnamed: 0,ingredients,th_name
0,Beef+lime juice+fish sauce+chili+herbs,ก้อย
1,Pork belly+rice noodles+soy sauce+garlic,กวยจั๊บ
2,Rice noodles+pork+bean sprouts+garlic+fish sauce,ก๋วยเตี๋ยว
3,Beef+noodles+curry powder+coconut milk,ก๋วยเตี๋ยวแขก
4,Chicken+noodles+soy sauce+garlic,ก๋วยเตี๋ยวคั่วไก่


In [None]:
# Define dish grouping: individual Thai dish names -> coarse category label.
# Dishes that are not listed keep their original name as their label.
dish_mapping = {
    "ก๋วยเตี๋ยวเรือ": "Noodle Dish",
    "เย็นตาโฟ": "Noodle Dish",
    "ก๋วยจั๊บ": "Noodle Dish",
    # Variant spelling of the dish above: this is the form that appears in the
    # cleaned-data preview, and map() only matches exact strings, so without
    # this key that row would never be grouped.
    "กวยจั๊บ": "Noodle Dish",
    "แกงเขียวหวาน": "Curry Dish",
    "พะแนง": "Curry Dish",
    "แกงมัสมั่น": "Curry Dish",
    "ส้มตำ": "Spicy Salad",
    "ยำทะเล": "Spicy Salad",
    "ลาบหมู": "Spicy Salad",
    "ผัดกะเพรา": "Fried Dish",
    "ไข่เจียว": "Fried Dish",
    "ทอดมันปลา": "Fried Dish",
}

# Apply mapping: map() yields NaN for names missing from the dict, and
# fillna() puts the original name back for those rows.
df_cleaned["th_name"] = df_cleaned["th_name"].map(dish_mapping).fillna(df_cleaned["th_name"])

# Check unique categories
df_cleaned["th_name"].value_counts()


Unnamed: 0_level_0,count
th_name,Unnamed: 1_level_1
Curry Dish,3
โรตี,2
ข้าวจี่,2
Noodle Dish,2
Fried Dish,2
...,...
แกงสะแล,1
แกงเหลือง,1
แกงฮังเล,1
ไก่ห่อใบเตย,1


In [None]:
import random

def augment_ingredients(ingredients):
    """Return the "+"-separated ingredient string with its items in random order.

    The items are shuffled in place with the module-level ``random``
    generator, so the result depends on that generator's current state.
    The set of items is unchanged; only their order may differ.
    """
    tokens = ingredients.split("+")
    random.shuffle(tokens)
    shuffled = "+".join(tokens)
    return shuffled

# Seed the module-level RNG that augment_ingredients shuffles with, so the
# augmented rows (and everything trained on them) are identical on every run.
random.seed(42)

# Build the augmented copy on its own frame instead of adding a helper column
# to df_cleaned: df_cleaned stays untouched, re-running this cell is safe, and
# no half-empty "augmented_ingredients" column ends up in the training table.
df_augmented = df_cleaned[["ingredients", "th_name"]].copy()
df_augmented["ingredients"] = df_augmented["ingredients"].apply(augment_ingredients)

# Append augmented data to the original dataset.
# ignore_index=True renumbers the combined rows 0..n-1, so neither input
# needs its index reset beforehand.
df_expanded = pd.concat(
    [df_cleaned[["ingredients", "th_name"]], df_augmented],
    ignore_index=True,
)

# Check the new dataset
df_expanded.head()


Unnamed: 0,ingredients,th_name,augmented_ingredients
0,Beef+lime juice+fish sauce+chili+herbs,ก้อย,herbs+chili+lime juice+Beef+fish sauce
1,Pork belly+rice noodles+soy sauce+garlic,กวยจั๊บ,Pork belly+rice noodles+soy sauce+garlic
2,Rice noodles+pork+bean sprouts+garlic+fish sauce,ก๋วยเตี๋ยว,bean sprouts+garlic+pork+fish sauce+Rice noodles
3,Beef+noodles+curry powder+coconut milk,ก๋วยเตี๋ยวแขก,noodles+coconut milk+curry powder+Beef
4,Chicken+noodles+soy sauce+garlic,ก๋วยเตี๋ยวคั่วไก่,noodles+soy sauce+Chicken+garlic


In [None]:
from gensim.models import Word2Vec
import numpy as np

# Prepare data for Word2Vec: each row becomes one "sentence" whose tokens are
# the individual ingredients (split on "+", no further normalisation).
ingredient_lists = [ing.split("+") for ing in df_expanded["ingredients"]]

# Train Word2Vec Model
# min_count=1 keeps every ingredient in the vocabulary, even ones seen once.
# seed is set explicitly and workers=1 is used because gensim documents that
# a deterministic run requires a single worker thread; with several workers
# the embeddings differ from run to run.
# NOTE(review): gensim also mentions PYTHONHASHSEED for reproducibility across
# interpreter launches -- set it in the runtime if bit-identical models matter.
word2vec = Word2Vec(
    sentences=ingredient_lists,
    vector_size=100,
    window=5,
    min_count=1,
    seed=42,
    workers=1,
)

# Convert Ingredients into Numerical Features using Word2Vec
def vectorize_ingredients(ingredients):
    """Turn a "+"-separated ingredient string into one fixed-length vector.

    Each ingredient that is in the vocabulary of the module-level ``word2vec``
    model is looked up, and the element-wise mean of those vectors is
    returned. Ingredients missing from the vocabulary are skipped.

    Returns:
        A 1-D array of length ``word2vec.wv.vector_size``. If none of the
        ingredients is known, an all-zero vector of that length is returned.
    """
    wv = word2vec.wv
    vectors = [wv[token] for token in ingredients.split("+") if token in wv]
    if not vectors:
        # Size the fallback from the model rather than a literal 100, so it
        # always matches the embedding width the model was trained with.
        return np.zeros(wv.vector_size)
    return np.mean(vectors, axis=0)

# Feature matrix: one averaged embedding per row of df_expanded, in row order.
feature_rows = list(map(vectorize_ingredients, df_expanded["ingredients"]))
X = np.array(feature_rows)

# Target: the dish label (original name or grouped category) of each row.
y = df_expanded["th_name"]


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Encode the dish labels (Thai names / category names) as integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Number of rows per class id (position i holds the count for class id i).
class_counts = np.bincount(y_encoded)

# Class ids that occur fewer than twice. The per-class split further down
# does its own length check and does not read this array.
(rare_classes,) = np.where(class_counts < 2)

# Per-class split: every class with more than one row is split on its own, so
# it contributes rows to both the train and the test set; a class with a
# single row goes to the train set only and is therefore absent from test.
train_indices = []
test_indices = []

for label in set(y_encoded):
    # Row positions of this class inside X / y_encoded
    indices = np.where(y_encoded == label)[0]

    if len(indices) > 1:
        # Enough rows: hold out 20% of this class (at least one row)
        train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)
        train_indices.extend(train_idx)
        test_indices.extend(test_idx)
    else:
        # If only one sample, force it into the training set
        train_indices.extend(indices)

# Convert to NumPy index arrays. The integer dtype is explicit because an
# empty list would otherwise become a float64 array, which NumPy refuses to
# use as an index (this happens when no class has more than one row).
train_indices = np.array(train_indices, dtype=np.intp)
test_indices = np.array(test_indices, dtype=np.intp)

# Now split X and y
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y_encoded[train_indices], y_encoded[test_indices]

# Report class coverage; the test count is lower than the train count
# whenever single-row classes exist.
print("Unique classes in train set:", len(set(y_train)))
print("Unique classes in test set:", len(set(y_test)))

from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Train XGBoost Classifier on the encoded class ids.
xgb_params = dict(n_estimators=300, learning_rate=0.05, max_depth=10, random_state=42)
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Predict class ids for the held-out rows, and keep a human-readable version
# of the predictions (dish names) alongside them.
y_pred = model.predict(X_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Share of held-out rows whose predicted class id equals the true one.
# NOTE(review): in this notebook df_expanded is every row plus a reordered
# copy of it, and vectorize_ingredients averages token vectors, which does not
# depend on token order. A held-out row therefore has essentially the same
# features as its training twin, so this figure looks like a memorisation
# check rather than accuracy on unseen dishes -- confirm before reporting it.
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Improved Model Accuracy: {accuracy:.2%}")


Unique classes in train set: 219
Unique classes in test set: 219
✅ Improved Model Accuracy: 79.09%


In [None]:
import joblib
from google.colab import files

# Artifacts to persist, keyed by output file name (written in this order).
artifacts = {
    "thai_dish_model_final.pkl": model,
    "label_encoder.pkl": label_encoder,
}

# Save trained model & encoder with joblib
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

# Download files for use in Streamlit (after both have been written)
for artifact_path in artifacts:
    files.download(artifact_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from gensim.models import Word2Vec
from google.colab import files

# Persist the trained Word2Vec model with the model's own save() method.
# NOTE(review): the file carries a .pkl extension but is not written through
# joblib like the other artifacts -- presumably the consumer must load it with
# Word2Vec.load; confirm on the Streamlit side.
word2vec_path = "word2vec_model.pkl"
word2vec.save(word2vec_path)

# Download the model to your local machine
files.download(word2vec_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>