### Task 3

#### Objective: Develop a machine learning model to classify restaurants based on their cuisines.

#### Steps:

1. Preprocess the dataset by handling missing values and encoding categorical variables.
2. Split the data into training and testing sets.
3. Select a classification algorithm (e.g., logistic regression, random forest) and train it on the training data.
4. Evaluate the model's performance using appropriate classification metrics (e.g., accuracy, precision, recall) on the testing data.
5. Analyze the model's performance across different cuisines and identify any challenges or biases.


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw restaurant table from the CSV in the working directory
DATASET_PATH = 'Dataset.csv'
df1 = pd.read_csv(DATASET_PATH)

# Show the column index so the available fields are visible
available_columns = df1.columns
print(available_columns)

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')


In [3]:
# Keep only the columns this notebook works with.
columns_of_interest = ['Restaurant Name', 'Cuisines', 'City', 'Longitude', 'Latitude', 'Price range']

# Drop rows with missing values in 'Cuisines'.
# Built as one non-inplace chain ending in .copy() so that `df` is an independent
# frame: the original called dropna(inplace=True) on a column selection of df1,
# which is the chained-assignment pattern pandas warns about (the warning was
# hidden by the global warnings filter). The original row index is preserved.
df = df1[columns_of_interest].dropna(subset=['Cuisines']).copy()

In [4]:
# Preview the first five rows of the filtered frame (bare last expression -> rich display)
df.head()

Unnamed: 0,Restaurant Name,Cuisines,City,Longitude,Latitude,Price range
0,Le Petit Souffle,"French, Japanese, Desserts",Makati City,121.027535,14.565443,3
1,Izakaya Kikufuji,Japanese,Makati City,121.014101,14.553708,3
2,Heat - Edsa Shangri-La,"Seafood, Asian, Filipino, Indian",Mandaluyong City,121.056831,14.581404,4
3,Ooma,"Japanese, Sushi",Mandaluyong City,121.056475,14.585318,4
4,Sambo Kojin,"Japanese, Korean",Mandaluyong City,121.057508,14.58445,4


In [5]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [6]:
# Tokenize cuisines: "French, Japanese, Desserts" -> ['French', 'Japanese', 'Desserts'].
# The isinstance guard makes the cell safe to re-run: once the column already holds
# lists, calling .split on them would raise AttributeError.
df['Cuisines'] = df['Cuisines'].apply(
    lambda x: x.split(', ') if isinstance(x, str) else x
)

# Train a Word2Vec model, treating each restaurant's cuisine list as one "sentence".
# min_count=1 keeps cuisines that occur only once so every token gets an embedding.
# seed + workers=1 make training repeatable between runs; multi-threaded training is
# not deterministic.
# NOTE(review): gensim documents that full reproducibility may additionally require
# a fixed PYTHONHASHSEED - confirm for the environment in use.
model = Word2Vec(df['Cuisines'].tolist(), min_count=1, seed=42, workers=1)

In [7]:
# Create restaurant vectors: each restaurant is represented by the element-wise
# mean of the Word2Vec embeddings of its cuisines.
df['Vector'] = df['Cuisines'].apply(lambda x: np.mean([model.wv[word] for word in x], axis=0))

# Prepare data for classification
X = np.stack(df['Vector'].values)

# Keep the fitted encoder (instead of discarding it) so integer predictions can be
# mapped back to restaurant names with label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Restaurant Name'])

# NOTE(review): the target here is the restaurant *name*. The classification report
# printed further down shows per-class support of only 0-3 test rows and scores at
# or near 0.00, i.e. most classes have too few examples to learn from. Confirm that
# this is the intended target for "classify restaurants based on their cuisines".

# Split the data (no stratify: it would fail for classes that occur only once)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier; random_state pins bootstrap/feature sampling so that the
# reported metrics are reproducible across runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Now clf can be used to predict the (encoded) restaurant name for new cuisine data

In [8]:
from sklearn.metrics import classification_report

# Score the held-out split with the fitted classifier
y_pred = clf.predict(X_test)

# Build the per-class precision / recall / F1 / support table as text, then show it
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         0
          17       0.00      0.00      0.00         0
          18       0.00      0.00      0.00         0
          22       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         0
          29       0.00      0.00      0.00         1
          35       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         1
          40       0.00      0.00      0.00         0
          41       0.00      0.00      0.00         1
          45       0.04      1.00      0.07         3
          53       0.00      0.00      0.00         1
          54       0.00    

In [9]:
new_data = ['Italian', 'Pizza']  # Example data

# Only cuisines seen while training Word2Vec have an embedding; indexing model.wv
# with an unseen token raises KeyError, so filter first and fail with a clear
# message if nothing usable remains.
known_cuisines = [word for word in new_data if word in model.wv]
if not known_cuisines:
    raise ValueError(f"None of the cuisines {new_data} are in the Word2Vec vocabulary")

# Convert the new data to a vector (mean of the known cuisine embeddings)
new_vector = np.mean([model.wv[word] for word in known_cuisines], axis=0)

# Use the trained classifier to predict the restaurant name
predicted_label = clf.predict([new_vector])

# Print the predicted label (the integer code produced by LabelEncoder)
print(predicted_label)

# Decode the integer code back to a restaurant name. LabelEncoder assigns codes from
# the sorted unique values, so refitting on the same column reproduces the mapping
# that was used to build y.
name_encoder = LabelEncoder().fit(df['Restaurant Name'])
print(name_encoder.inverse_transform(predicted_label))

[4953]


In [11]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

In [13]:
# Reduce dimensionality
# Rebuild the full-size cuisine embeddings from the Word2Vec model instead of reading
# df['Vector']: this cell overwrites df['Vector'], so reading it back would apply PCA
# to already-reduced vectors whenever the cell is re-run.
cuisine_embeddings = np.stack(
    df['Cuisines'].apply(lambda x: np.mean([model.wv[word] for word in x], axis=0)).values
)

# random_state pins the result if scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=10, random_state=42)

# Store one 10-component vector per restaurant (list of row arrays, one per row of df)
df['Vector'] = list(pca.fit_transform(cuisine_embeddings))

In [14]:
# Prepare data for classification (df['Vector'] now holds the PCA-reduced vectors)
X = np.stack(df['Vector'].values)
y = LabelEncoder().fit_transform(df['Restaurant Name'])

# Split the data with the same seed as the earlier split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier; random_state makes the forest reproducible so that differences
# from the earlier run come from the features rather than from random tree
# construction.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         0
          22       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         0
          29       0.00      0.00      0.00         1
          30       0.00      0.00      0.00         0
          33       0.00      0.00      0.00         0
          35       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         1
          45       0.04      1.00      0.07         3
          49       0.00      0.00      0.00         0
          53       0.00      0.00      0.00         1
          54       0.00    