In [5]:

!pip install --quiet category_encoders scikit-multilearn lightgbm catboost  # optional

import pandas as pd
import numpy as np
import io, os
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt
%matplotlib inline


In [6]:
# upload or mount Drive then set file_path accordingly
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

try:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]), engine='python', on_bad_lines='skip')
    print("Loaded CSV")
except Exception as e:
    print("CSV failed:", e, "Trying read_excel")
    df = pd.read_excel(io.BytesIO(uploaded[file_name]))
    print("Loaded Excel")

print("Shape:", df.shape)
display(df.head())


Saving Dataset  (2).csv to Dataset  (2).csv
CSV failed: 'utf-8' codec can't decode byte 0xe3 in position 14: invalid continuation byte Trying read_excel
Loaded Excel
Shape: (9551, 21)


Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [7]:
# inspect cuisines column
c_col = "Cuisines"
if c_col not in df.columns:
    raise ValueError(f"Column '{c_col}' not found. Check the dataset column names: {df.columns.tolist()}")

print("Sample cuisines values:")
display(df[c_col].astype(str).head(20))


sample = df[c_col].dropna().astype(str).sample(min(500, len(df)), random_state=42)
has_commas = sample.apply(lambda s: ',' in s).mean()
print(f"Fraction of sampled rows containing comma: {has_commas:.3f}")

if has_commas > 0.1:
    print("Likely multi-label (many rows contain multiple cuisines). Use MultiLabel approach.")
    multi_label = True
else:
    print("Likely single-label (one cuisine per row). Use single-label approach.")
    multi_label = False


Sample cuisines values:


Unnamed: 0,Cuisines
0,"French, Japanese, Desserts"
1,Japanese
2,"Seafood, Asian, Filipino, Indian"
3,"Japanese, Sushi"
4,"Japanese, Korean"
5,Chinese
6,"Asian, European"
7,"Seafood, Filipino, Asian, European"
8,"European, Asian, Indian"
9,Filipino


Fraction of sampled rows containing comma: 0.648
Likely multi-label (many rows contain multiple cuisines). Use MultiLabel approach.


In [8]:
#  normalize
def normalize_cui(s):
    if pd.isna(s):
        return ""
    s = str(s).strip()

    s = s.replace(';', ',').replace('|', ',')
    # remove repeated spaces
    s = " ".join(s.split())
    return s

df[c_col] = df[c_col].apply(normalize_cui)
# drop rows with empty cuisines
df = df[df[c_col] != ""].reset_index(drop=True)
print("After drop empty cuisines, shape:", df.shape)


After drop empty cuisines, shape: (9542, 21)


In [9]:
# create y labels
if multi_label:

    df['cuis_list'] = df[c_col].apply(lambda s: [x.strip().lower() for x in s.split(',') if x.strip()!=''])
    # keep only top-K frequent cuisines to limit output size
    from collections import Counter
    all_cuis = Counter([c for row in df['cuis_list'] for c in row])
    print("Unique cuisines (total):", len(all_cuis))
    top_k = 20
    top_labels = set([x for x,_ in all_cuis.most_common(top_k)])
    print("Top labels:", top_labels)
    # map rare labels to 'other' OR drop them from label list
    def keep_top(row):
        return [c for c in row if c in top_labels]
    df['cuis_list_top'] = df['cuis_list'].apply(keep_top)
    df = df[df['cuis_list_top'].map(len) > 0].reset_index(drop=True)
    mlb = MultiLabelBinarizer(classes=sorted(list(top_labels)))
    Y = mlb.fit_transform(df['cuis_list_top'])
    label_names = mlb.classes_
    print("Label matrix shape:", Y.shape)
else:
    # single-label: pick the first cuisine if comma exists
    df['cuis_single'] = df[c_col].apply(lambda s: s.split(',')[0].strip().lower())
    top_k = 20
    top = df['cuis_single'].value_counts().nlargest(top_k).index.tolist()
    df['cuis_single_top'] = df['cuis_single'].apply(lambda x: x if x in top else 'other')
    le = LabelEncoder()
    Y = le.fit_transform(df['cuis_single_top'])
    label_names = le.classes_
    print("Unique classes used:", len(label_names), label_names)


Unique cuisines (total): 145
Top labels: {'street food', 'american', 'ice cream', 'burger', 'north indian', 'beverages', 'mithai', 'cafe', 'bakery', 'south indian', 'mexican', 'italian', 'asian', 'thai', 'mughlai', 'desserts', 'chinese', 'fast food', 'continental', 'pizza'}
Label matrix shape: (8913, 20)


In [10]:
# feature engineering
num_cols = ['Average Cost for two', 'Votes', 'Longitude', 'Latitude', 'Price range', 'Country Code']
cat_cols = ['Currency', 'Has Table booking', 'Has Online delivery', 'Rating text']  # adjust to actual

# Keep only columns that exist
num_cols = [c for c in num_cols if c in df.columns]
cat_cols = [c for c in cat_cols if c in df.columns]

# Frequency-encode high-card columns
def freq_encode(series):
    s = series.fillna("___missing___").astype(str)
    freqs = s.value_counts(normalize=True)
    return s.map(freqs).astype(float)

X = pd.DataFrame()
# numeric
for c in num_cols:
    X[c] = pd.to_numeric(df[c], errors='coerce').fillna(df[c].median() if df[c].dtype!='object' else 0)

# categorical freq encode or one-hot small ones
for c in cat_cols:
    nunique = df[c].nunique(dropna=False)
    if nunique > 25:
        X[c + "_freq"] = freq_encode(df[c])
    else:
        # one hot small cardinality
        dummies = pd.get_dummies(df[c].fillna('___missing___').astype(str), prefix=c)
        X = pd.concat([X, dummies], axis=1)


if 'Restaurant Name' in df.columns:
    tfv = TfidfVectorizer(max_features=500, ngram_range=(1,2))
    name_tfidf = tfv.fit_transform(df['Restaurant Name'].astype(str))

    tfidf_df = pd.DataFrame(name_tfidf.toarray()[:, :100], columns=[f"nm_tfidf_{i}" for i in range(100)])
    X = pd.concat([X.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

print("Feature matrix shape:", X.shape)
display(X.head())


Feature matrix shape: (8913, 128)


Unnamed: 0,Average Cost for two,Votes,Longitude,Latitude,Price range,Country Code,Currency_Botswana Pula(P),Currency_Brazilian Real(R$),Currency_Dollar($),Currency_Emirati Diram(AED),...,nm_tfidf_90,nm_tfidf_91,nm_tfidf_92,nm_tfidf_93,nm_tfidf_94,nm_tfidf_95,nm_tfidf_96,nm_tfidf_97,nm_tfidf_98,nm_tfidf_99
0,1100,314,121.027535,14.565443,3,162,True,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4000,270,121.056831,14.581404,4,162,True,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1000,336,121.056314,14.583764,3,162,True,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2000,520,120.979667,14.531333,4,162,True,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2000,677,120.979333,14.54,4,162,True,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
#  split
if multi_label:
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
else:
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)
print("Train shape:", X_train.shape, y_train.shape, "Test shape:", X_test.shape, y_test.shape)


Train shape: (7130, 128) (7130, 20) Test shape: (1783, 128) (1783, 20)


In [12]:
#  define & train models
results = {}

# Logistic Regression
if multi_label:
    lr = OneVsRestClassifier(LogisticRegression(max_iter=500, class_weight='balanced'))
    lr.fit(X_train, y_train)
    preds_lr = lr.predict(X_test)
    results['LR'] = {'preds': preds_lr, 'model': lr}
else:
    lr = LogisticRegression(max_iter=500, class_weight='balanced')
    lr.fit(X_train, y_train)
    preds_lr = lr.predict(X_test)
    results['LR'] = {'preds': preds_lr, 'model': lr}

# Random Forest
if multi_label:
    rf = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, n_jobs=1, class_weight=None, random_state=42))
    rf.fit(X_train, y_train)
    preds_rf = rf.predict(X_test)
    results['RF'] = {'preds': preds_rf, 'model': rf}
else:
    rf = RandomForestClassifier(n_estimators=200, n_jobs=1, random_state=42, class_weight='balanced')
    rf.fit(X_train, y_train)
    preds_rf = rf.predict(X_test)
    results['RF'] = {'preds': preds_rf, 'model': rf}

print("Trained models.")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Trained models.


In [13]:
# evaluation for single-label
if not multi_label:
    for name, res in results.items():
        y_pred = res['preds']
        print(f"\nModel: {name}")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Precision (macro):", precision_score(y_test, y_pred, average='macro', zero_division=0))
        print("Recall (macro):", recall_score(y_test, y_pred, average='macro', zero_division=0))
        print("F1 (macro):", f1_score(y_test, y_pred, average='macro', zero_division=0))
        print("\nClassification Report:\n")
        print(classification_report(y_test, y_pred, zero_division=0, target_names=label_names))
        print("Confusion matrix:")
        cm = confusion_matrix(y_test, y_pred)
        display(pd.DataFrame(cm, index=label_names, columns=label_names))


In [14]:
#  evaluation for multi-label
if multi_label:
    from sklearn.metrics import average_precision_score, roc_auc_score
    for name, res in results.items():
        y_pred = res['preds']
        # micro/macro averaged metrics
        print(f"\nModel: {name}")
        print("Precision (micro):", precision_score(y_test, y_pred, average='micro', zero_division=0))
        print("Recall (micro):", recall_score(y_test, y_pred, average='micro', zero_division=0))
        print("F1 (micro):", f1_score(y_test, y_pred, average='micro', zero_division=0))
        print("Precision (macro):", precision_score(y_test, y_pred, average='macro', zero_division=0))
        print("Recall (macro):", recall_score(y_test, y_pred, average='macro', zero_division=0))
        print("F1 (macro):", f1_score(y_test, y_pred, average='macro', zero_division=0))
        # Per-class metrics:
        print("\nPer-class report (precision, recall, f1):")
        for i, lbl in enumerate(label_names):
            p = precision_score(y_test[:,i], y_pred[:,i], zero_division=0)
            r = recall_score(y_test[:,i], y_pred[:,i], zero_division=0)
            f = f1_score(y_test[:,i], y_pred[:,i], zero_division=0)
            print(f"{lbl:20s}  P:{p:.3f}  R:{r:.3f}  F1:{f:.3f}")



Model: LR
Precision (micro): 0.19542857142857142
Recall (micro): 0.6574977817213842
F1 (micro): 0.30130116562754133
Precision (macro): 0.18715747996286908
Recall (macro): 0.7183822128659739
F1 (macro): 0.26680200774908946

Per-class report (precision, recall, f1):
american              P:0.117  R:0.861  F1:0.206
asian                 P:0.097  R:0.745  F1:0.172
bakery                P:0.224  R:0.546  F1:0.318
beverages             P:0.065  R:0.717  F1:0.119
burger                P:0.079  R:0.826  F1:0.145
cafe                  P:0.224  R:0.756  F1:0.346
chinese               P:0.439  R:0.593  F1:0.504
continental           P:0.303  R:0.826  F1:0.443
desserts              P:0.115  R:0.646  F1:0.196
fast food             P:0.324  R:0.677  F1:0.439
ice cream             P:0.048  R:0.900  F1:0.092
italian               P:0.223  R:0.713  F1:0.340
mexican               P:0.053  R:0.833  F1:0.099
mithai                P:0.147  R:0.728  F1:0.244
mughlai               P:0.209  R:0.692  F1:0.321

In [15]:
# quick analysis of per-cuisine performance and class imbalance
if multi_label:
    # compute support
    support = Y.sum(axis=0) if 'Y' in globals() else None
    print("Support per label (how many instances in whole dataset):")
    for lbl, sup in zip(label_names, support):
        print(lbl, int(sup))
else:
    # class counts
    counts = pd.Series(Y).value_counts()
    print("Class counts (encoded labels):")
    display(pd.DataFrame({'class': label_names, 'count': [counts.get(i,0) for i in range(len(label_names))]}))

# Typical bias checks to document:
print("\nBias / challenge checklist (document these in your report):")
print("- Class imbalance: small classes have poor recall.")
print("- Popularity bias: common cuisines in dataset dominate predictions.")
print("- Geographic bias: cuisines over/under-represented by city/country.")
print("- Label noise: mixed or multi-meaning cuisine names (e.g., 'Asian') may confuse model.")
print("- Missing or sparse features: missing 'Cuisines' in some rows, or many features absent.")


Support per label (how many instances in whole dataset):
american 390
asian 233
bakery 744
beverages 228
burger 251
cafe 703
chinese 2733
continental 736
desserts 653
fast food 1986
ice cream 226
italian 764
mexican 181
mithai 380
mughlai 994
north indian 3960
pizza 381
south indian 636
street food 562
thai 234

Bias / challenge checklist (document these in your report):
- Class imbalance: small classes have poor recall.
- Popularity bias: common cuisines in dataset dominate predictions.
- Geographic bias: cuisines over/under-represented by city/country.
- Label noise: mixed or multi-meaning cuisine names (e.g., 'Asian') may confuse model.
- Missing or sparse features: missing 'Cuisines' in some rows, or many features absent.


In [17]:
# suggestions to improve model
# Use class weighting (we used class_weight='balanced' for LogisticRegression and RandomForest where applicable).
# Over/under-sampling (SMOTE) for single-label rare classes, or use stratified sampling.
# For multi-label, consider threshold tuning per-class to balance precision/recall.
# Use CatBoost / LightGBM which handle categorical features and imbalance better.
# Use target/mean encoding for high-card columns (with cross-validation to avoid leakage).
# Enrich features: parse 'Cuisines' text more (split to multi-hot of top cuisine tokens), add location clustering, textual features from reviews (TF-IDF or embeddings).
# Use hierarchical classification: first predict high-level cuisine group (e.g., "Asian", "European") then sub-class.

In [18]:
#  save model and label encoders / mlb
import joblib
best_model = results['LR']['model'] if 'LR' in results else list(results.values())[0]['model']
joblib.dump(best_model, "/content/cuisine_model.pkl")

if multi_label:
    joblib.dump(mlb, "/content/mlb_labels.pkl")
else:
    joblib.dump(le, "/content/label_encoder.pkl")

print("Saved model and encoders to /content/")
files.download("/content/cuisine_model.pkl")


Saved model and encoders to /content/


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>