In [2]:
import pandas as pd
import numpy as np
import re

import nltk
#nltk.download('wordnet')
#nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import joblib

<h3> Pre-processing </h3>

In [3]:
def normalize_ingreds(ingredients: list[str]):
    skip_words = ["crushed", "crumbles", "ground", "minced", "powder",
                  "chopped", "sliced", "grilled", "boneless", "skinless", "steamed"]
    
    def remove_verbs(ingredient: str) :
        pattern = "|".join(skip_words)
        return re.sub(pattern, "", ingredient)
    
    def lemmatize(ingredient: str) :
        lemmatizer = WordNetLemmatizer()
        return " ".join([lemmatizer.lemmatize(word.lower()) for word in ingredient.split()])
    
    ingredients = [remove_verbs(ingredient) for ingredient in ingredients]
    ingredients = [lemmatize(ingredient) for ingredient in ingredients]
    ingredients = [re.sub("[^A-Za-z ]", "", ingredient) for ingredient in ingredients]
    ingredients = [re.sub(" +", " ", ingredient) for ingredient in ingredients]
    ingredients = [ingredient.strip().replace(" ", "_") for ingredient in ingredients]
    
    return ", ".join(ingredients)

In [4]:
data = pd.read_json(r"C:\Users\Vichare's Laptop\Downloads\Text Analytics - DSA 5293\Project 2\yummly.json")
data

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [5]:
data["ingredients"] = data["ingredients"].map(normalize_ingreds)
data = data[~data.duplicated(["cuisine", "ingredients"], keep="first")]
data

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"romaine_lettuce, black_olive, grape_tomato, ga..."
1,25693,southern_us,"plain_flour, pepper, salt, tomato, black_peppe..."
2,20130,filipino,"egg, pepper, salt, mayonaise, cooking_oil, gre..."
3,22213,indian,"water, vegetable_oil, wheat, salt"
4,13162,indian,"black_pepper, shallot, cornflour, cayenne_pepp..."
...,...,...,...
39769,29109,irish,"light_brown_sugar, granulated_sugar, butter, w..."
39770,11462,italian,"kraft_zesty_italian_dressing, purple_onion, br..."
39771,2238,irish,"egg, citrus_fruit, raisin, sourdough_starter, ..."
39772,41882,chinese,"chicken_thigh, garlic, white_rice, baking, cor..."


<h3> Linear SVC </h3>

In [6]:
y = data["cuisine"]
x = data.drop(["cuisine"], axis=1)

le = LabelEncoder()
y_transformed = le.fit_transform(data["cuisine"])

x_train, x_test , y_train, y_test = train_test_split(
    x, y_transformed, test_size=0.2)

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('vectorizer', TfidfVectorizer(
            ngram_range=(1,1), stop_words="english"), "ingredients")
    ])

lsvc_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('estimator', CalibratedClassifierCV(LinearSVC(C=0.9, penalty='l2')))
])

cross_val_score(lsvc_pipe, x, y_transformed, cv=10)

array([0.78704637, 0.78805444, 0.79763105, 0.79385081, 0.79309476,
       0.78931452, 0.77993446, 0.79228636, 0.7917822 , 0.80060499])

In [8]:
lsvc_pipe.fit(x_train, y_train)
y_pred = lsvc_pipe.predict(x_test)

print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.63      0.69       102
           1       0.60      0.46      0.52       163
           2       0.78      0.72      0.75       315
           3       0.80      0.83      0.82       530
           4       0.74      0.68      0.71       164
           5       0.64      0.65      0.64       538
           6       0.81      0.70      0.75       218
           7       0.84      0.91      0.88       574
           8       0.71      0.50      0.59       132
           9       0.79      0.90      0.84      1550
          10       0.89      0.75      0.81       103
          11       0.87      0.68      0.76       293
          12       0.83      0.77      0.80       155
          13       0.91      0.92      0.91      1308
          14       0.80      0.77      0.78       153
          15       0.67      0.49      0.56        94
          16       0.71      0.78      0.75       864
   