<a href="https://colab.research.google.com/github/Savanth114/ML-PROJECTS/blob/main/3_Cuisine_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 3 : Cuisine Classification

**STEP-1 :** Preprocess the dataset by handling missing values
and encoding categorical variables

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from IPython.display import display

# Load the raw restaurant table (Colab upload location).
df = pd.read_csv("/content/Dataset.csv")

# Rows without a cuisine have no target label and cannot be used for training.
df = df.dropna(subset=['Cuisines']).copy()

# Fill remaining gaps in text columns only. A blanket fillna('') would also
# write empty strings into numeric columns, silently turning them into
# object columns that the later encoding step would treat as categories.
text_cols = df.select_dtypes(include='object').columns
df[text_cols] = df[text_cols].fillna('')

# Normalise casing/whitespace so that the same value is not counted as
# several distinct categories.
df['Cuisines'] = df['Cuisines'].str.lower().str.strip()
df['City'] = df['City'].str.lower().str.strip()
df['Currency'] = df['Currency'].str.upper().str.strip()

# The prediction target is the first cuisine listed for each restaurant.
df['Primary_Cuisine'] = df['Cuisines'].str.split(',').str[0].str.strip()

# Encode the primary cuisine as an integer class label.
target_encoder = LabelEncoder()
df['Cuisine_Label'] = target_encoder.fit_transform(df['Primary_Cuisine'])

# Lookup table: integer label -> primary cuisine name.
label_table = pd.DataFrame({
    'Label': range(len(target_encoder.classes_)),
    'Cuisine Name': target_encoder.classes_
})

print("🔎 Encoded Cuisine Classes:")
display(label_table)

# Columns used for modelling, plus the encoded target.
df_model = df[['City', 'Currency', 'Price range', 'Aggregate rating', 'Votes',
               'Rating text', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Cuisine_Label']]
print("\n Final Processed Dataset:")
display(df_model.head())

🔎 Encoded Cuisine Classes:


Unnamed: 0,Label,Cuisine Name
0,0,afghani
1,1,african
2,2,american
3,3,andhra
4,4,arabian
...,...,...
114,114,turkish
115,115,turkish pizza
116,116,vietnamese
117,117,western



 Final Processed Dataset:


Unnamed: 0,City,Currency,Price range,Aggregate rating,Votes,Rating text,Has Online delivery,Is delivering now,Switch to order menu,Cuisine_Label
0,makati city,BOTSWANA PULA(P),3,4.8,314,Excellent,No,No,No,39
1,makati city,BOTSWANA PULA(P),3,4.5,591,Excellent,No,No,No,55
2,mandaluyong city,BOTSWANA PULA(P),4,4.4,270,Very Good,No,No,No,97
3,mandaluyong city,BOTSWANA PULA(P),4,4.9,365,Excellent,No,No,No,55
4,mandaluyong city,BOTSWANA PULA(P),4,4.8,229,Excellent,No,No,No,55


**STEP-2 :** Split the data into training and testing sets.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Feature matrix and encoded target taken from the preprocessed frame.
features = df[['City', 'Currency', 'Price range', 'Aggregate rating', 'Votes',
    'Rating text',  'Has Online delivery', 'Is delivering now', 'Switch to order menu']]
target = df['Cuisine_Label']

# A stratified split needs at least two members per class, so classes with a
# single member are identified here and merged into one bucket below.
class_counts = target.value_counts()
rare_classes = class_counts[class_counts < 2].index

# Cuisine_Label encodes Primary_Cuisine, so that is the column to report.
# Showing the full multi-cuisine 'Cuisines' string would mislabel the class.
rare_mask = df['Cuisine_Label'].isin(rare_classes)
rare_cuisine_names = df.loc[rare_mask, 'Primary_Cuisine'].unique()
rare_df = pd.DataFrame({'Rare Cuisines (Support = 1)': rare_cuisine_names})
print(f"📌 Identified {len(rare_classes)} rare classes with only one member:\n")
display(rare_df)

# Merge every singleton class into one new label (one past the largest
# existing label). NOTE: if exactly one singleton class exists, the merged
# bucket still has a single member and the stratified split below will raise.
new_rare_label = target.max() + 1
target_modified = target.where(~target.isin(rare_classes), new_rare_label)

X_train, X_test, y_train, y_test = train_test_split(features, target_modified, test_size=0.5, random_state=42, stratify=target_modified)

print("\nTraining features shape :", X_train.shape)
print("Testing features shape  :", X_test.shape)
print("Training labels shape   :", y_train.shape)
print("Testing labels shape    :", y_test.shape)

# Stratification should give both halves near-identical class proportions.
print("\nTraining Class Distribution:")
display(y_train.value_counts(normalize=True).head())

print("\nTesting Class Distribution:")
display(y_test.value_counts(normalize=True).head())

📌 Identified 21 rare classes with only one member:



Unnamed: 0,Rare Cuisines (Support = 1)
0,"peruvian, latin american"
1,"gourmet fast food, burger"
2,irish
3,cajun
4,pub food
5,"cuban, spanish"
6,australian
7,"persian, arabian, lebanese, north indian"
8,"tex-mex, american"
9,"malwani, north indian, chinese, seafood"



Training features shape : (4771, 9)
Testing features shape  : (4771, 9)
Training labels shape   : (4771,)
Testing labels shape    : (4771,)

Training Class Distribution:


Unnamed: 0_level_0,proportion
Cuisine_Label,Unnamed: 1_level_1
79,0.313561
27,0.089499
36,0.070425
11,0.065185
23,0.064766



Testing Class Distribution:


Unnamed: 0_level_0,proportion
Cuisine_Label,Unnamed: 1_level_1
79,0.313561
27,0.089709
36,0.070425
11,0.064976
23,0.064557


**STEP-3 :** Select a classification algorithm (logistic
regression) and train it on the
training data.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Stack train on top of test so each categorical column is encoded with a
# single, shared set of integer codes.
X_combined = pd.concat([X_train, X_test], axis=0)
n_train_rows = len(X_train)

# One encoder per text column, kept so the codes can be mapped back later.
cat_cols = X_combined.select_dtypes(include='object').columns
label_encoders = {col: LabelEncoder() for col in cat_cols}
for col, le in label_encoders.items():
    X_combined[col] = le.fit_transform(X_combined[col].astype(str))

# Undo the stacking: the first n_train_rows rows are the training split.
X_train_encoded = X_combined.iloc[:n_train_rows].copy()
X_test_encoded = X_combined.iloc[n_train_rows:].copy()

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# Logistic regression is trained on the scaled features.
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train_scaled, y_train)

**STEP-3 (continued) :** Select a second classification algorithm (random forest) and train it on the training data. This is the model evaluated in the steps below.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest settings; the fixed seed makes repeated runs comparable.
rf_settings = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**rf_settings)
# The forest is fitted on the label-encoded (unscaled) training features.
model.fit(X_train_encoded, y_train)

**STEP-4 :** Evaluate the model's performance using
appropriate classification metrics (e.g., accuracy,
precision, recall) on the testing data.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the held-out half with the fitted random forest.
y_pred = model.predict(X_test_encoded)

# Precision/recall/F1 are weighted by class support; classes that are never
# predicted score 0 instead of emitting a warning.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_kwargs)
recall = recall_score(y_test, y_pred, **weighted_kwargs)
f1 = f1_score(y_test, y_pred, **weighted_kwargs)

# One row per metric for a compact summary table.
metrics_df = pd.DataFrame(
    [
        ('Accuracy', accuracy),
        ('Precision', precision),
        ('Recall', recall),
        ('F1-Score', f1),
    ],
    columns=['Metric', 'Score'],
)
print("Model Performance Summary:")
summary_style = metrics_df.style.format({'Score': '{:.4f}'})
summary_style = summary_style.set_table_attributes("style='display:inline'")
display(summary_style.hide(axis="index"))

Model Performance Summary:


Metric,Score
Accuracy,0.2475
Precision,0.1841
Recall,0.2475
F1-Score,0.2035


**STEP-5 :**  Analyze the model's performance across different
cuisines and identify any challenges or biases.

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Recompute predictions so this cell does not depend on the evaluation cell.
y_pred = model.predict(X_test_encoded)

# Cuisine_Label encodes the PRIMARY cuisine, so names must come from the
# fitted target encoder. Pairing labels with the full 'Cuisines' string would
# attach an arbitrary multi-cuisine combination to each label.
label_to_cuisine = dict(enumerate(target_encoder.classes_))
# Singleton classes were merged into one extra label before the split.
label_to_cuisine[int(new_rare_label)] = 'other (merged rare cuisines)'

unique_labels = np.unique(y_test)

# Per-class accuracy (share of a class's true members predicted correctly)
# and support. This equals per-class recall by definition and is kept as a
# cross-check column next to the classification_report figures.
y_true = y_test.to_numpy()
accuracy_df = (
    pd.DataFrame({'Label': y_true, 'Correct': y_pred == y_true})
    .groupby('Label')['Correct']
    .agg(Accuracy='mean', Support='size')
    .reset_index()
)
accuracy_df.insert(
    1,
    'Cuisine',
    accuracy_df['Label'].map(lambda label: label_to_cuisine.get(label, f"Label {label}")),
)

report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

# Keep only the per-class entries; the summary rows ('accuracy', 'macro avg',
# 'weighted avg') have non-numeric keys.
class_metrics = {
    int(label): metrics for label, metrics in report.items() if label.isdigit()
}

metrics_df = pd.DataFrame.from_dict(class_metrics, orient='index')
metrics_df.index.name = 'Label'
metrics_df.reset_index(inplace=True)

full_metrics_df = pd.merge(accuracy_df, metrics_df, on='Label', how='inner')

# Worst-performing cuisines first.
full_metrics_df = full_metrics_df[['Label', 'Cuisine', 'Accuracy', 'precision', 'recall', 'f1-score', 'Support']]
full_metrics_df = full_metrics_df.sort_values(by='f1-score', ascending=True)
full_metrics_df[['Accuracy', 'precision', 'recall', 'f1-score']] = full_metrics_df[['Accuracy', 'precision', 'recall', 'f1-score']].round(4)

print(" Combined Cuisine-wise Performance Report (Sorted by F1-Score):")
display(full_metrics_df.head(10))


 Combined Cuisine-wise Performance Report (Sorted by F1-Score):


Unnamed: 0,Label,Cuisine,Accuracy,precision,recall,f1-score,Support
0,0,"afghani, north indian, pakistani, arabian",0.0,0.0,0.0,0.0,3
1,1,african,0.0,0.0,0.0,0.0,1
3,3,"andhra, north indian, chinese",0.0,0.0,0.0,0.0,2
4,4,"arabian, north indian",0.0,0.0,0.0,0.0,3
7,8,assamese,0.0,0.0,0.0,0.0,1
6,7,"asian fusion, pub food, fusion, asian, filipin...",0.0,0.0,0.0,0.0,1
10,12,"bar food, modern australian",0.0,0.0,0.0,0.0,5
8,10,"awadhi, north indian",0.0,0.0,0.0,0.0,3
14,16,bihari,0.0,0.0,0.0,0.0,2
12,14,"bengali, chinese",0.0,0.0,0.0,0.0,9
