<a href="https://colab.research.google.com/github/RPW-11/Data-Preprocessing-and-Simple-EDA/blob/main/Content_Based_Filtering_Restaurant_Recommender_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# kaggle API
!pip install -q kaggle
from google.colab import files
files.upload()
!mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# download the data set and unzip
!kaggle datasets download -d ahmedshahriarsakib/uber-eats-usa-restaurants-menus
!unzip /content/uber-eats-usa-restaurants-menus.zip

In [None]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# read the restaurants table and preview its first rows
restaurants_csv = '/content/restaurants.csv'
df = pd.read_csv(restaurants_csv)
df.head()

## EDA and Preprocessing

In [None]:
# summary of the restaurant table: columns, dtypes and non-null counts
df.info()

In [None]:
# drop unnecessary columns (not used by the rest of the notebook)
unused_columns = ['position', 'full_address', 'zip_code', 'price_range', 'lat', 'lng']
df.drop(columns=unused_columns, inplace=True)

In [None]:
# percentage of missing values per column; only columns with gaps are shown
null_counts = df.isnull().sum()
missing_df = (null_counts / df.shape[0] * 100).reset_index().rename(columns={0: 'missing %'})
has_missing = missing_df['missing %'] > 0
missing_df.loc[has_missing]

In [None]:
# handling missing numerical values: impute each column with its own mean
# The result is assigned back to the column. Calling fillna(inplace=True) on
# df[cat] mutates an intermediate object (chained assignment), which pandas 2.x
# deprecates and which leaves df unchanged when copy-on-write is enabled.
for cat in ['score', 'ratings']:
  df[cat] = df[cat].fillna(df[cat].mean())

In [None]:
# handling missing categorical value
# inspect: names of the restaurants that have no category
df.loc[df['category'].isna(), ['name']]

In [None]:
# drop rows that have no category
# subset is given as a list (a bare string is only accepted by newer pandas
# releases) and the result is rebound instead of mutating df in place.
df = df.dropna(subset=['category'])

In [None]:
# recheck: number of missing values left in each column
df.isna().sum()

### Picking the top 20 categories

In [None]:
# get category unique values
# each 'category' cell is a ', '-separated list; flatten all entries into one list
cats_dup = df['category'].tolist()
cats_list = [cat for line in cats_dup for cat in line.split(', ')]

# inspect the number of categories
# Series.value_counts replaces the deprecated top-level pd.value_counts, and the
# 30 most frequent categories are computed once and reused for the index.
cat_counts = pd.Series(cats_list).value_counts().head(30)
indexes = cat_counts.index
cat_counts

In [None]:
# top 20 cuisines by frequency, skipping American and Sandwiches
excluded = ['American', 'Sandwiches']
cats_20 = [name for name in indexes if name not in excluded][:20]
cats_20

In [None]:
# filtering out restaurants: collect the ids of every restaurant whose category
# text mentions one of the selected cuisines
ids = []
for cat in cats_20:
  # regex=False matches the cuisine name literally; with the default regex
  # matching, a name containing characters such as '+' or '(' would be read as a
  # pattern and could fail to match itself. This is still a substring match.
  mentions_cat = df['category'].str.contains(cat, regex=False)
  ids += df.loc[mentions_cat, 'id'].tolist()

# a restaurant listed under several cuisines appears several times in ids
print(f"The num of list restaurants {len(ids)}")
unique_ids = set(ids)
print(f"The num of unique restaurants {len(unique_ids)}")

In [None]:
# new dataframe: keep only the restaurants whose id was collected above
ids = list(unique_ids)
keep_mask = df['id'].isin(ids)
df = df.loc[keep_mask]
df.head()

In [None]:
# summary of the table after filtering to the selected cuisines
df.info()

### Computing a weighted score using the Bayesian average

In [None]:
# keep restaurants whose number of ratings is at or above the 40th percentile
num_ratings_threshold = df['ratings'].quantile(0.4)
print(f"the threshold {num_ratings_threshold}")

# filter the data
print("Filtering...")
print(f"shape before {df.shape}")
enough_ratings = df['ratings'] >= num_ratings_threshold
df = df.loc[enough_ratings]
print(f"shape after {df.shape}")

In [None]:
# global mean score and the smallest ratings count among the remaining rows
avg_score = df['score'].mean()
min_num_rat = df['ratings'].min()
print(f"avg score {avg_score} | min ratings {min_num_rat}")

# define the function
def bayesian_average(score, avg_score, nratings, min_ratings):
  """Return the average of `score` and `avg_score`, weighted by rating counts.

  `score` is weighted by `nratings` and `avg_score` by `min_ratings`; the
  weighted sum is divided by `nratings + min_ratings`.
  """
  weighted_sum = score * nratings + avg_score * min_ratings
  total_weight = nratings + min_ratings
  return weighted_sum / total_weight

# invoke the function
# bayesian_average is plain arithmetic, so it is evaluated on whole columns at
# once instead of row by row through DataFrame.apply. assign() returns a new
# frame, which avoids writing a column into a frame obtained by filtering
# (the source of pandas' SettingWithCopyWarning).
df = df.assign(
    weighted_score=bayesian_average(df['score'], avg_score, df['ratings'], min_num_rat)
)

In [None]:
# check the result: the raw score/ratings columns are dropped, then the last
# ten rows are shown
df = df.drop(columns=['score', 'ratings'])
df.tail(10)

### One-hot encode the features

In [None]:
# one hot encode: one 0/1 column per selected cuisine
for cat in cats_20:
  # regex=False matches the cuisine name literally rather than as a regular
  # expression (names containing '+', '(' etc. would otherwise be misread).
  # This is still a substring match on the category text.
  df[cat] = df['category'].str.contains(cat, regex=False).astype(int)

df.head()

## Merge with menu dataset

In [None]:
# read the menu table and preview its first rows
menus_csv = '/content/restaurant-menus.csv'
menu_df = pd.read_csv(menus_csv)
menu_df.head()

In [None]:
# summary of the menu table: columns, dtypes and non-null counts
menu_df.info()

In [None]:
# preprocess: remove the " USD" text from price and convert the column to float
price_text = menu_df['price'].str.replace(" USD", "")
menu_df['price'] = price_text.astype(float)
menu_df.dtypes

In [None]:
# number of missing values in each menu column
menu_df.isna().sum()

In [None]:
# mean menu price per restaurant; columns are renamed to 'id' / 'avg_price'
mean_price = menu_df.groupby('restaurant_id')[['price']].mean()
avg_menu = mean_price.reset_index().rename(
    columns={'restaurant_id':'id', 'price':'avg_price'}
)
avg_menu.head()

In [None]:
# merge: attach avg_price to each restaurant by id (default inner join, so
# restaurants without a row in avg_menu are dropped)
df = df.merge(avg_menu, on='id')
df.info()

In [None]:
# keep the identifying columns, the two numeric features and the cuisine flags
base_columns = ['id', 'name', 'category', 'weighted_score','avg_price']
df = df[base_columns + cats_20]
df.head()

## Training The Model

In [None]:
# item feature matrix: the numeric columns only
feature_columns = ['weighted_score','avg_price'] + cats_20
item_df = df[feature_columns]
item_df.head()

## Model Architecture

### Using similarity matrix

In [None]:
X_data = item_df.values
# normalize using standard scaler
scaler = StandardScaler()
X_data = scaler.fit_transform(X_data)

# Step 2: Compute Similarity
# Convert the data to TensorFlow tensors
X_tensor = tf.constant(X_data, dtype=tf.float32)

# Scale every row to unit L2 norm. The matrix product of unit rows is the cosine
# similarity; without this step the product is a raw dot product that grows with
# vector length, so a row is not guaranteed to be most similar to itself.
X_unit = tf.linalg.l2_normalize(X_tensor, axis=1)

# Compute pairwise cosine similarity using tf.matmul
similarity_matrix = tf.matmul(X_unit, X_unit, transpose_b=True)

# Convert the similarity matrix to a numpy array
similarity_matrix_np = similarity_matrix.numpy()

# Step 3: Print Similarity Results
# Print the similarity matrix
print(similarity_matrix_np)

In [None]:
# write the similarity matrix to cosine_similarity.csv
csv = pd.DataFrame(similarity_matrix_np)
csv.to_csv('cosine_similarity.csv')

In [None]:
n = 10
target = 0
# rank all rows by similarity to the target, most similar first
ranked = np.argsort(similarity_matrix_np[target])[::-1]
# Remove the target explicitly rather than assuming it sits in the top slot:
# ties (or an unnormalised similarity) can rank another row above the target,
# in which case slicing off one end would drop the wrong row.
top_n_indices = ranked[ranked != target][:n]
top_n_indices

In [None]:
# show the target row first, followed by its most similar rows
target = np.array(target)
row_positions = np.concatenate([target.reshape(-1,), top_n_indices])
df.iloc[row_positions]

### Generating User Dataframe

In [None]:
# User: synthetic interaction table. Each row pairs one restaurant position
# ('ind') with the random per-cuisine counts of the user who visited it.
n_user = 200
n_features = len(cats_20)  # one count per cuisine column
# np.random.randint excludes its upper bound, so the bound is the row count
# itself; with shape[0]-1 the last restaurant could never be drawn.
limit = df.shape[0]
per_user_blocks = []
for i in range(n_user):
  n_res = np.random.randint(50,200)                    # restaurants for this user
  res_id = np.random.randint(0,limit,(n_res,1))        # row positions in df
  # the same preference vector is repeated on every row of this user
  num_transaction = np.tile(np.random.randint(0,50, n_features), (n_res,1))
  per_user_blocks.append(np.concatenate((res_id, num_transaction),axis=1))

# stack once at the end instead of re-copying the growing array on every
# iteration; this also keeps the integer dtype of the restaurant positions
user_data = np.concatenate(per_user_blocks, axis=0)

user_df = pd.DataFrame(data=user_data, columns=['ind'] + cats_20).set_index('ind')
print(f"user_df shape {user_df.shape}")
user_df.head()

### Generating item_df based on user

In [None]:
# Fetch the item features for every row of user_df, in user_df's own row order,
# so that row i of new_df is the restaurant paired with row i of user_df.
# An inner merge on the index does not keep user_df's ordering, which breaks
# that pairing once the two tables are scaled and fed to the model side by side.
# user_df's index holds row positions of the restaurant table; item_df carries
# the default 0..N-1 index produced by the earlier merge, so positions apply.
item_positions = user_df.index.to_numpy().astype(int)
new_df = item_df.iloc[item_positions]
print(f"item shape: {new_df.shape}")
new_df.head()

In [None]:
# y labels: one random value in [1, 5) per row of new_df (stand-in for real user ratings)
n_samples = new_df.shape[0]
rating_low, rating_high = 1, 5
y = (rating_high - rating_low) * np.random.random_sample((n_samples,)) + rating_low

In [None]:
# Scale the training data
item_df = new_df

# keep references to the unscaled inputs
item_train_unscaled = item_df
user_train_unscaled = user_df
y_train_unscaled    = y

# NOTE(review): the scalers are fitted on all rows before the train/test split,
# so test rows influence the scaling statistics — consider fitting on the
# training part only.

# item and user features: zero mean / unit variance per column
scalerItem = StandardScaler()
item_train = scalerItem.fit_transform(item_df)

scalerUser = StandardScaler()
user_train = scalerUser.fit_transform(user_df)

# target: rescaled to [-1, 1]; MinMaxScaler needs a 2-D column
y_column = y.reshape(-1, 1)
scalerTarget = MinMaxScaler((-1, 1))
y_train = scalerTarget.fit_transform(y_column)

In [None]:
# split the dataset
# A single call splits all three arrays with the same shuffled indices, so the
# user / item / label rows stay paired by construction rather than by relying
# on three separate calls happening to share a random_state.
item_train, item_test, user_train, user_test, y_train, y_test = train_test_split(
    item_train, user_train, y_train, train_size=0.80, shuffle=True, random_state=1
)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

In [None]:
# MODEL ARCHITECTURE
# Two networks with the same layer stack map user features and item features to
# vectors of length num_outputs; the model output is their dot product.
num_outputs = 16
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear')
])

item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear')
])

# create the user input and point to the base network
# shape must be a tuple: (n) is just the integer n, (n,) is a 1-element tuple
input_user = tf.keras.layers.Input(shape=(user_train.shape[1],))
vu = user_NN(input_user)
# NOTE(review): calling a tf.* op directly on a Keras symbolic tensor may be
# rejected by newer Keras releases — confirm against the installed version.
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(item_train.shape[1],))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
# (both are L2-normalized, so the output lies in [-1, 1])
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

In [None]:
# compile with mean squared error and Adam (learning rate 0.01)
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss=cost_fn, optimizer=opt)

In [None]:
# train for 30 epochs on the (user, item) feature pairs
tf.random.set_seed(1)
model.fit(x=[user_train, item_train], y=y_train, epochs=30)

In [None]:
# loss on the held-out split
model.evaluate(x=[user_test, item_test], y=y_test)

In [None]:
# generate one random preference vector and repeat it once per restaurant in df
pref = np.random.randint(0,50,20)
n_items = df.shape[0]
user_vecs = np.tile(pref, (n_items,1))

# scale the user and item vectors with the scalers fitted earlier
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(df[item_df.columns])

# make a prediction
y_p = model.predict([suser_vecs, sitem_vecs])

# map the predictions back to the original target scale
y_pu = scalerTarget.inverse_transform(y_p)

# sort the results, highest prediction first (argsort on the negated values)
descending_order = np.argsort(-y_pu,axis=0)
sorted_index = descending_order.reshape(-1).tolist()
sorted_ypu   = y_pu[sorted_index]
sorted_items = df.iloc[sorted_index]  # unscaled rows, for display

In [None]:
# the random preference vector, the ten highest predictions, and their restaurants
print(pref)
top_predictions = sorted_ypu[:10]
print(top_predictions.reshape(-1,))
sorted_items[['name','category']].head(10)

In [None]:
# persist the trained model under the name 'restaurant_prediction'
# NOTE(review): newer Keras releases may require a .keras or .h5 suffix here — confirm.
model.save(filepath='restaurant_prediction')