***
## 4. MODELLING
***
In this section we will create a system that uses the datasets to recommend restaurants based on different aspects. There are different types of recommendation models; in this project we will focus on three types of recommendation systems:

 > 1. Content-Based Filtering.
 > 2. Collaborative Filtering.
 > 3. Deep Neural Networks.

### CONTENT BASED FILTERING

By utilizing restaurant features such as the types of cuisine they offer, or whether they have WiFi, Alcohol, Happy Hour, Noise Level, Restaurants Attire, Wheelchair Accessible, Restaurants TableService, etc., we are able to use cosine similarity to recommend the restaurants with the closest similarity.

In [85]:
# importing necessary libraries
import os
import warnings

import numpy as np
import pandas as pd
import requests

warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [87]:
# Loading the restaurant data from the pickled file
# NOTE: unpickling can execute code embedded in the file, so only load pickles produced by this project.
df = pd.read_pickle('pickled_files/restaurant_data.pkl')

# Overview of dataset information to understand the features we require
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38552 entries, 0 to 38551
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   business_id      38552 non-null  object 
 1   name             38552 non-null  object 
 2   address          38552 non-null  object 
 3   city             38552 non-null  object 
 4   state            38552 non-null  object 
 5   postal_code      38552 non-null  object 
 6   latitude         38552 non-null  float64
 7   longitude        38552 non-null  float64
 8   stars            38552 non-null  float64
 9   review_count     38552 non-null  int64  
 10  is_open          38552 non-null  int64  
 11  attributes       38552 non-null  object 
 12  categories       38552 non-null  object 
 13  hours            38552 non-null  object 
 14  location         38552 non-null  object 
 15  attributes_true  38552 non-null  object 
dtypes: float64(3), int64(2), object(11)
memory usage: 4.7+ MB


In [89]:
# Preprocessing function
def preprocess(df):
    """
    Builds the text column used for the content-based similarity.
    Joins 'attributes' and 'attributes_true' with a single space into a new
    'combined_features' column.
    Returns a new dataframe (the input is not modified) with a fresh 0..n-1 index
    """
    # Joining the two feature columns into one text value per restaurant
    merged_text = df['attributes'] + " " + df['attributes_true']

    # assign() works on a copy, so the caller's dataframe stays untouched;
    # the index is then renumbered from zero
    return df.assign(combined_features=merged_text).reset_index(drop=True)

In [91]:
# Vectorization function
def create_feature_vectors(df):
    """
    Turns the preprocessed dataframe into one dense feature matrix.
    The 'combined_features' text is TF-IDF vectorized (English stop words removed)
    and the 'stars' column is appended as the last column.
    Returns a 2D numpy array with one row per row of df
    """
    # TF-IDF weights for the combined text, densified so it can be stacked
    vectorizer = TfidfVectorizer(stop_words='english')
    text_block = vectorizer.fit_transform(df['combined_features']).toarray()

    # The star rating as a single numeric column
    stars_block = df[['stars']].to_numpy()

    # Text columns first, numeric column last
    return np.concatenate((text_block, stars_block), axis=1)

In [101]:
# Recommendation function
def _recommend_by_category(df, state, category):
    """
    Filters the restaurants located in `state` whose 'categories' value equals `category`.
    Returns them sorted by stars and then city (both descending) with the display columns only
    """
    in_state = df[df["state"] == state]
    matches = in_state[in_state['categories'] == category]
    matches = matches.sort_values(by=["stars", "city"], ascending=False)
    return matches[['name', 'state', 'city', 'stars', 'address', 'categories']]


def recommendation(df, state, name=None, category=None):
    """
    Creates recommendations based on name or category/cuisine using cosine similarity and filtering.

    df       : restaurant dataframe with name, state, city, stars, address, categories,
               attributes and attributes_true columns
    state    : state in which the recommended restaurants must be located
    name     : restaurant to find similar restaurants for (takes priority over category)
    category : exact 'categories' value to filter on when no name is given

    Returns a dataframe containing name, state, city, stars, address and categories:
    at most 20 rows for a name query, every match for a category query.
    Returns None when neither name nor category is given.
    Raises ValueError when `name` is not present in df.
    """
    if name:
        # Preprocessing is only needed for the similarity branch
        preprocessed = preprocess(df)

        is_seed = preprocessed['name'] == name
        if not is_seed.any():
            raise ValueError(f"Restaurant with name '{name}' not found in the filtered data.")

        # First restaurant carrying that name, kept as a one-row dataframe
        seed_row = preprocessed[is_seed].iloc[[0]]

        # Candidates are the restaurants of the requested state; the seed row is appended
        # so it is part of the vectorization even when it lives in another state
        candidates = pd.concat(
            [preprocessed[preprocessed["state"] == state], seed_row]
        ).reset_index(drop=True)

        # Position of the first row carrying the seed name after the re-index
        seed_pos = int(np.flatnonzero((candidates['name'] == name).to_numpy())[0])

        # Creating feature vectors
        features = create_feature_vectors(candidates)

        # Only the seed's similarities are needed, so compare one row against all rows
        # instead of building the full n x n matrix
        scores = cosine_similarity(features[seed_pos:seed_pos + 1], features)[0]

        # Most similar first; the stable sort keeps the original row order among ties
        ranking = np.argsort(-scores, kind="stable")

        # Dropping the seed restaurant itself from the ranked rows
        ranked = candidates.iloc[ranking]
        ranked = ranked[ranked['name'] != name]

        # Return a df with the required features
        return ranked[['name', 'state', 'city', 'stars', 'address', 'categories']].drop_duplicates(subset='name')[:20]

    if category:
        # Filter based on cuisine/category
        return _recommend_by_category(df, state, category)

    # Nothing to recommend on
    return None

In [111]:
# Example recommendations based on state, name
# Restaurants located in Indiana, ranked by similarity to "Coup de Taco"; only the first five rows are shown
restaurants = recommendation(df, state="Indiana",  name="Coup de Taco")
restaurants.head()

Unnamed: 0,name,state,city,stars,address,categories
1785,El Torito Grill,Indiana,Indianapolis,4.5,8650 Keystone Crossing,Mexican
40,Taste of China,Indiana,Whiteland,4.0,989 N US 31,Chinese
2133,Hong Kong Inn,Indiana,Indianapolis,4.0,8079 E 38th St,Chinese
2062,Diarra's Cuisine,Indiana,Indianapolis,3.5,"2989 W 71st St, Ste 3",African
458,WB Pizza,Indiana,Indianapolis,4.5,2290 W 86th St,American (Traditional)


In [113]:
# Example recommendations based on state, category/cuisine
# The category has to equal the restaurant's 'categories' value exactly (here "Italian")
cuisines = recommendation(df, state="Indiana",  category="Italian")
cuisines.head()

Unnamed: 0,name,state,city,stars,address,categories
19189,Greek’s Pizzeria- Indianapolis,Indiana,Indianapolis,5.0,1601 Columbia Ave,Italian
30169,I Tre Mori,Indiana,Indianapolis,5.0,"8220 E 106th St, Ste 200",Italian
35845,The Twisted Sicilian,Indiana,Indianapolis,5.0,Unknown,Italian
12466,Ciao by Villaggio,Indiana,Zionsville,4.5,40 S Main St,Italian
21713,Convivio Italian Artisan Cuisine - Zionsville,Indiana,Zionsville,4.5,40 S Main St,Italian


### COLLABORATIVE FILTERING

In [333]:
import pandas as pd

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import cross_validate

In [335]:
# Load the user review data that feeds the collaborative-filtering model
df_collab=pd.read_csv("data/users.csv")

In [341]:
# Inspect the column names, dtypes and non-null counts of the review data
df_collab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429771 entries, 0 to 429770
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   review_id    429771 non-null  object
 1   user_id      429771 non-null  object
 2   business_id  429771 non-null  object
 3   stars        429771 non-null  int64 
 4   useful       429771 non-null  int64 
 5   funny        429771 non-null  int64 
 6   cool         429771 non-null  int64 
 7   text         429771 non-null  object
 8   date         429771 non-null  object
dtypes: int64(4), object(5)
memory usage: 29.5+ MB


In [343]:
# Keep only the user / business / rating triplet from the review data
users = df_collab.loc[:, ["user_id", "business_id", "stars"]]
users

Unnamed: 0,user_id,business_id,stars
0,iAD32p6h32eKDVxsPHSRHA,YB26JvvGS2LgkxEKOObSAw,5
1,rYvWv-Ny16b1lMcw1IP7JQ,jfIwOEXcVRyhZjM4ISOh4g,1
2,2ntnbUia9Bna62W0fqNcxg,S-VD26LE_LeJNx5nASk_pw,5
3,j4qNLF-VNRF2DwBkUENW-w,yE1raqkLX7OZsjmX3qKIKg,5
4,H3P9EB7J9HP6PzkVjgFiOg,oQ5CPRt0R3AzFvcjNOqB1w,5
...,...,...,...
429766,JWhZs-vSggwN6WgxBHgIDw,B2xtWMBTyDtd-ndqqaIC1Q,1
429767,XJTO9x78TgWE94cmXqNduA,c3QxX3toWdqJnKQmmIliRQ,1
429768,cTozFTTWjlFYc3yusdbZmA,uMVOtr16r1ELu46pWr4HCQ,1
429769,7L7GL5Pi2cf8mbm2Dpw4zw,e_E-jq9mwm7wk75k7Yi-Xw,5


In [353]:
import pickle

In [355]:
# Persist the ratings frame (`users`) to disk as a pickle
with open('pickled_files/user_ratings.pkl', 'wb') as file:
    pickle.dump(users, file)

In [336]:
# Define a Reader object with the rating scale
reader = Reader(rating_scale=(1, 5))

# Load the data from the DataFrame (Surprise expects the columns in user, item, rating order)
data = Dataset.load_from_df(df_collab[['user_id', 'business_id', 'stars']], reader)

# Hold out 25% of the ratings for evaluation; the fixed random_state makes the split,
# and therefore the reported RMSE, reproducible across kernel restarts
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [None]:
# Scratch cell cleared: the bare name `uss` was undefined and raised NameError on Restart & Run All.

In [337]:
# Initialize the SVD algorithm; the fixed random_state makes the factor initialisation
# (and so the fitted model and its RMSE) reproducible between runs
model = SVD(random_state=42)

# Train the model on the training set
model.fit(trainset)

# Predict ratings for the test set
predictions = model.test(testset)

# Compute RMSE (printed by Surprise and also returned as the cell value)
accuracy.rmse(predictions)

RMSE: 1.3685


1.3684877636798007

In [110]:
def recommend_businesses(user_id, n=5, ratings=None, algo=None):
    """
    Recommends the businesses with the highest predicted rating for a user.

    user_id : id of the user to recommend for
    n       : maximum number of recommendations to return (values below 1 give an empty list)
    ratings : dataframe with 'user_id' and 'business_id' columns; defaults to the
              notebook-level `df_collab`
    algo    : fitted model exposing predict(user_id, business_id) -> object with an
              `est` attribute; defaults to the notebook-level `model`

    Returns a list of (business_id, estimated_rating) tuples, best first.
    Businesses the user already rated in `ratings` are never returned.
    """
    # Fall back to the notebook globals only when nothing was passed in
    ratings = df_collab if ratings is None else ratings
    algo = model if algo is None else algo

    # Set lookup keeps the "already rated" check constant-time per business
    already_rated = set(ratings.loc[ratings['user_id'] == user_id, 'business_id'])

    # Predict a rating for every business the user hasn't rated yet
    scored = [
        (business_id, algo.predict(user_id, business_id).est)
        for business_id in ratings['business_id'].unique()
        if business_id not in already_rated
    ]

    # Sort by estimated rating and return top-n (a negative n must not slice from the end)
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:max(n, 0)]



In [207]:
# NOTE(review): the user_id values shown in the ratings table are strings, so the integer 1
# likely matches no known user and these estimates are presumably not personalised —
# confirm and pass a real user_id from the data.
recommend_businesses(user_id=1)

[('ts9aktMHX8OolHRByAcKBQ', 4.851068494046912),
 ('gP_oWJykA2RocIs_GurKWQ', 4.850111152275558),
 ('DVBJRvnCpkqaYl6nHroaMg', 4.828361413275356),
 ('vHqJerp8iZN1Lje3dLa4Eg', 4.827800620730559),
 ('yAEKaEvsCNxOYv1M1kiQhg', 4.826452753481822)]

In [43]:
import requests

In [44]:
# Load your restaurant data
# NOTE: this rebinds `df`, which earlier cells loaded from pickled_files/restaurant_data.pkl
df = pd.read_csv('data/filtered_restaurants_data.csv')


# Yelp API key, read from the environment so the secret never lives in the notebook.
# SECURITY: a literal key used to be committed on this line; treat it as leaked and
# revoke/rotate it in the Yelp developer console.
API_KEY = os.environ.get('YELP_API_KEY')
if not API_KEY:
    raise RuntimeError("Set the YELP_API_KEY environment variable before running this cell.")

# Yelp Business Endpoint
YELP_BUSINESS_URL = "https://api.yelp.com/v3/businesses/"

# Headers for the API request
headers = {
    'Authorization': f'Bearer {API_KEY}',
}

def get_business_image_urls(business_id):
    """
    Fetches the photo URLs of a business from the Yelp business endpoint.

    business_id : Yelp business id appended to YELP_BUSINESS_URL

    Returns the list stored under 'photos' in the JSON response, or an empty list
    when the request fails, is rate limited (HTTP 429) or returns any other
    non-200 status. Failures are reported with print() and never raised.
    """
    try:
        # A timeout keeps a stalled connection from hanging the notebook
        response = requests.get(f'{YELP_BUSINESS_URL}{business_id}', headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for Business ID: {business_id}, Error: {exc}")
        return []

    if response.status_code == 200:
        # Extract the image URLs (a missing or null 'photos' entry becomes an empty list)
        return response.json().get('photos') or []

    # Errors are printed because Streamlit (`st`) is not available in this notebook
    if response.status_code == 429:
        print("Rate limit exceeded. Please try again later.")
    else:
        print(f"Failed to retrieve data for Business ID: {business_id}, Status Code: {response.status_code}")

    return []

In [45]:
# Quick check: fetch the photo URLs for a single business id
get_business_image_urls('Ep_jh1Pt4Ggyla21f-BQcQ')

['https://s3-media2.fl.yelpcdn.com/bphoto/WwNo_cvEzBp0GBzLSbnuBw/o.jpg',
 'https://s3-media1.fl.yelpcdn.com/bphoto/wg72_hzjKHZX2PKn5BwKOA/o.jpg',
 'https://s3-media2.fl.yelpcdn.com/bphoto/AQT1ZqoUqIrAOd-lAjm_SQ/o.jpg']