In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
df = pd.read_csv(r"Dataset .csv")

In [35]:
df = df.dropna(subset=['Cuisines'])

In [36]:
df['Primary_Cuisine'] = df['Cuisines'].apply(lambda x: x.split(',')[0].strip())

In [37]:
cuisine_counts = df['Primary_Cuisine'].value_counts()
valid_cuisines = cuisine_counts[cuisine_counts >= 100].index
df = df[df['Primary_Cuisine'].isin(valid_cuisines)]

In [38]:
label_encoder = LabelEncoder()
df['Cuisine_Label'] = label_encoder.fit_transform(df['Primary_Cuisine'])

In [39]:
FEATURE_COLUMNS = [
    'Country Code', 'Average Cost for two', 'Price range', 'Aggregate rating',
    'Has Table booking', 'Has Online delivery', 'Is delivering now', 'City'
]

In [40]:
df['Has Table booking'] = df['Has Table booking'].map({'Yes': 1, 'No': 0})
df['Has Online delivery'] = df['Has Online delivery'].map({'Yes': 1, 'No': 0})
df['Is delivering now'] = df['Is delivering now'].map({'Yes': 1, 'No': 0})

In [41]:
X = df[FEATURE_COLUMNS].copy()
y = df['Cuisine_Label']

In [42]:
numeric_features = ['Country Code', 'Average Cost for two', 'Price range', 'Aggregate rating']
binary_features = ['Has Table booking', 'Has Online delivery', 'Is delivering now']
categorical_features = ['City']

In [43]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('bin', 'passthrough', binary_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [44]:
ensemble = VotingClassifier(estimators=[
    ('lr', LogisticRegression(max_iter=1000, class_weight='balanced')),
    ('rf', RandomForestClassifier(class_weight='balanced')),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', scale_pos_weight=1))
], voting='soft')

In [45]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ensemble_model)
])

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [47]:
pipeline.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [48]:
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.38971906754333535


In [49]:
joblib.dump(pipeline, "pipeline_model.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']