#### importing libraries

In [1]:
import pandas as pd
import bs4
import selenium
import requests
import urllib
import csv
from sklearn.metrics.pairwise import cosine_similarity

#### reading the zomato url

In [2]:
from selenium import webdriver

# Listing page that the restaurant cards are scraped from.
url1 = 'https://www.zomato.com/bangalore/great-food-no-bull'

# Start a Chrome session through the chromedriver binary named below and
# navigate it to the listing page; later cells reuse `browser`.
browser = webdriver.Chrome('chromedriver.exe')
browser.get(url1)

In [3]:
# Collect every element on the loaded page that matches the restaurant-card
# selector. NOTE: despite its name, `urls` holds page elements, not URL strings;
# the scraping loop further down pulls a link and a title out of each element.
urls = browser.find_elements_by_css_selector('div.relative.top-res-box.entity-ads-snippet-track')


#### From this URL (https://www.zomato.com/bangalore/great-food-no-bull), scrape the following information and store them in a data frame
#### Restaurant Name
#### URL to the restaurant

In [4]:
# Build the restaurant table (name + link) from the cards collected in `urls`.
#
# Records are gathered in plain lists and turned into a DataFrame once at the
# end: growing a frame with DataFrame.append inside the loop copies the whole
# frame on every row and is not available in pandas >= 2.0.
restaurant_records = []  # one dict per restaurant -> rows of urls_df
csv_rows = []            # [url, stripped name] rows destined for rest.csv

for elem in urls:
    url = elem.find_element_by_css_selector('a').get_attribute('href')
    rest_name = elem.find_element_by_css_selector('div.res_title').get_attribute('innerHTML')

    restaurant_records.append({
        'URL': url,
        # The title markup contains line breaks; drop them for the DataFrame.
        'Restaurant_Name': rest_name.replace('\n', ''),
    })
    csv_rows.append([url, rest_name.strip()])

urls_df = pd.DataFrame(restaurant_records, columns=['Restaurant_Name', 'URL'])

# Append the scraped rows to rest.csv in a single open/close instead of
# reopening the file for every restaurant. The row layout (URL, name; no header
# row) and the append mode are unchanged.
with open('rest.csv', 'a', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(csv_rows)

In [5]:
# Display the scraped restaurant names and their links.
urls_df

Unnamed: 0,Restaurant_Name,URL
0,MISU,https://www.zomato.com/bangalore/misu-st-marks...
1,Buff Buffet Buff,https://www.zomato.com/bangalore/buff-buffet-b...
2,One For The Road,https://www.zomato.com/bangalore/one-for-the-r...
3,Delhi Highway,https://www.zomato.com/bangalore/delhi-highway...
4,Red Rhino,https://www.zomato.com/bangalore/red-rhino-whi...
5,Brew and Barbeque - A...,https://www.zomato.com/bangalore/brew-and-barb...
6,Barbecue by Punjab Grill,https://www.zomato.com/bangalore/barbecue-by-p...
7,Bamboo Hut,https://www.zomato.com/bangalore/bamboo-hut-na...
8,Polar Bear,https://www.zomato.com/bangalore/restaurants/p...
9,PurpleBasil.in,https://www.zomato.com/bangalore/purplebasil-i...


#### Loop through each restaurant (at least 50) and do the following
#### scrape sample reviews, make sure you click the load more button continuously in a loop until all reviews are loaded
#### From each review extract the following
#### Reviewer name
#### Reviewer ID
#### Ratings

In [None]:
# Scrape reviewer ID, reviewer name and rating for every review of each
# restaurant listed in urls_df, then persist the cleaned table to review.csv.
REVIEW_COLUMNS = ['Restaurant_name', 'User_ID', 'User_Name', 'Rating']  # ,'Reviews'
# Header written to review.csv; the pivot-table cell further down looks the
# columns up by exactly these names.
REVIEW_CSV_HEADER = ['Restaurant_Name', 'User_ID', 'User_Name', 'Rating']
# Stop after the urls_df row with this label so that at least 50 restaurants
# are covered even when some of them fail to scrape.
LAST_RESTAURANT_LABEL = '70'

review_records = []
for a, elem1 in urls_df.iterrows():
    rest_name = elem1['Restaurant_Name']
    try:
        browser.get(elem1['URL'])

        # Keep clicking "load more" until the button can no longer be found or
        # clicked, i.e. until all reviews are on the page.
        while True:
            try:
                browser.find_element_by_css_selector('div.load-more').click()
            except Exception:
                break

        review_elems = browser.find_elements_by_css_selector('div.ui.segment.clearfix.brtop')
        for elem in review_elems:
            reviewer = elem.find_element_by_css_selector('a[data-entity_id]')
            rating_elem = elem.find_element_by_css_selector('div[aria-label]')
            review_records.append({
                'Restaurant_name': rest_name,
                'User_ID': reviewer.get_attribute('data-entity_id'),
                'User_Name': reviewer.get_attribute('innerHTML'),
                'Rating': rating_elem.get_attribute('aria-label'),
            })
    except Exception as exc:
        # Best effort: a restaurant that cannot be scraped is skipped, but the
        # failure is reported instead of being swallowed silently. Reviews
        # collected before the failure are kept.
        print('Skipping', rest_name, '->', repr(exc))

    # Checked outside the try block so that a failure on the last wanted row
    # still ends the loop (a `continue` in the handler used to bypass it).
    if str(a) == LAST_RESTAURANT_LABEL:
        break

# Build the frame once and clean each column once, rather than appending and
# re-cleaning the entire frame for every single review.
reviews_df = pd.DataFrame(review_records, columns=REVIEW_COLUMNS)
# regex=True is spelled out: newer pandas treats the pattern literally by default.
reviews_df['Rating'] = reviews_df['Rating'].str.replace('[^0-9.]', '', regex=True)
reviews_df['User_Name'] = reviews_df['User_Name'].str.replace('[^a-zA-Z]', '', regex=True)

# Write the *cleaned* values together with a header row, so that the file can be
# read back with named columns and a numeric Rating. The file is rewritten
# rather than appended to: appending would put a second header row in the
# middle of the data whenever this cell is re-run.
csv_frame = reviews_df.assign(Restaurant_name=reviews_df['Restaurant_name'].str.strip())
with open('review.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(REVIEW_CSV_HEADER)
    writer.writerows(csv_frame[REVIEW_COLUMNS].values.tolist())

#### reading the review data back from the csv file

In [None]:
# Load the scraped reviews from disk and preview the first rows.
# The pivot-table cell below expects this frame to expose the columns
# 'User_ID', 'Restaurant_Name' and 'Rating'.
reviews = pd.read_csv('review.csv', encoding='latin')
reviews.head(n=5)

#### pivot table for the reviews DF

In [None]:
# Reshape the long review table into a user x restaurant grid: one row per
# User_ID, one column per Restaurant_Name, each cell the mean Rating for that
# pair (mean is pivot_table's default aggregation; pairs with no review are NaN).
user_item_matrix = pd.pivot_table(
    reviews,
    values='Rating',
    index='User_ID',
    columns='Restaurant_Name',
    aggfunc='mean',
)
print(user_item_matrix.shape)
user_item_matrix.head()

####  Perform exploratory data analysis using the above data

In [None]:
# 1. Avg no. of ratings per user
# Count the non-missing cells in each user's row, then average those counts.
(
    user_item_matrix
    .count(axis='columns')
    .mean()
)

In [None]:
# 2. User with most no. of ratings
# Per-user count of non-missing ratings, largest first; keep only the top entry.
(
    user_item_matrix
    .count(axis='columns')
    .sort_values(ascending=False)
    .head(n=1)
)

In [None]:
# 3. Avg no. of ratings per restaurant
# count() tallies the non-missing ratings in each restaurant column and mean()
# averages those tallies, so this is the average *number of ratings* a
# restaurant received, not its average rating value.
# NOTE(review): if the average rating value per restaurant was intended,
# that would be user_item_matrix.mean() instead - confirm.

user_item_matrix.count().mean()

In [None]:
# 4. Restaurant with most no. of ratings
# Per-restaurant count of non-missing ratings, largest first; keep only the top entry.
(
    user_item_matrix
    .count(axis='index')
    .sort_values(ascending=False)
    .head(n=1)
)

In [None]:
## cosine_similarity does not accept missing values
# Subtract each user's own mean rating from that user's row, then put 0 in
# every cell that is still missing.
ui_matrix = (
    user_item_matrix
    .apply(lambda row: row.sub(row.mean()), axis='columns')
    .fillna(value=0)
)

In [None]:
# Pairwise cosine similarity between the rows of ui_matrix, wrapped in a frame
# that carries ui_matrix's index as both the row and the column labels.
sim_matrix = pd.DataFrame(
    data=cosine_similarity(ui_matrix),
    index=ui_matrix.index,
    columns=ui_matrix.index,
)

In [None]:
# Display the similarity table built in the previous cell.
sim_matrix

In [None]:
# Use the above data to build a recommendation engine using User Based Collaborative Filtering.
# Create a function which takes a User ID as input and returns 3 restaurants that you would recommend to the user.

In [None]:
def ubcf(userid, n_neighbours=7, n_recommendations=3, ratings=None, similarities=None):
    """Recommend restaurants to a user with user-based collaborative filtering.

    For every restaurant the user has not rated, the predicted rating is the
    mean of the ratings given to it by the user's most similar neighbours
    (neighbours who did not rate it are ignored). The restaurants with the
    highest predicted ratings are returned.

    Parameters
    ----------
    userid : label
        Row label of the user in the rating and similarity tables.
    n_neighbours : int, default 7
        Number of most-similar users whose ratings are averaged.
    n_recommendations : int, default 3
        Maximum number of restaurants to return.
    ratings : pandas.DataFrame, optional
        User x restaurant rating table with NaN for "not rated".
        Defaults to the notebook-level ``user_item_matrix``.
    similarities : pandas.DataFrame, optional
        User x user similarity table. Defaults to the notebook-level
        ``sim_matrix``.

    Returns
    -------
    list
        Restaurant names, best prediction first. Restaurants that none of the
        neighbours rated have a NaN prediction and sort last, so they only
        appear when fewer than ``n_recommendations`` restaurants have a real
        prediction. The list is empty when the user has already rated every
        restaurant.

    Raises
    ------
    KeyError
        If ``userid`` is not present in the tables.
    """
    if ratings is None:
        ratings = user_item_matrix
    if similarities is None:
        similarities = sim_matrix

    # Most similar users, excluding the user themselves.
    neighs_user = (
        similarities[userid]
        .drop(userid)
        .sort_values(ascending=False)
        .head(n_neighbours)
        .index
    )

    user_ratings = ratings.loc[userid]
    unrated = user_ratings.index[user_ratings.isnull()]
    if len(unrated) == 0:
        # Nothing left to recommend. Previously this case raised
        # UnboundLocalError because the result was only assigned inside the loop.
        return []

    # Column-wise mean over the neighbours' rows; NaN ratings are skipped.
    predicted = ratings.loc[neighs_user, unrated].mean()
    preds = pd.DataFrame({'rname': predicted.index,
                          'predicted_rating': predicted.values})

    # Sort once, after all predictions are known.
    rest_recom = preds.sort_values('predicted_rating', ascending=False).head(n_recommendations)['rname']
    return rest_recom.values.tolist()

#### Sample input and output

In [None]:
# Example: restaurant recommendations for the user whose ID is 19800.
ubcf(19800)