In [1]:
"""
This file loads the data from the data directory and shows you how.
Feel free to change the contents of this file!
Do ensure these functions remain functional:
    - get_business(city, business_id)
    - get_reviews(city, business_id=None, user_id=None, n=10)
    - get_user(username)
"""
import numpy as np
import pandas as pd
import os
import json
import random
from collections import Counter
import math
import geopy.distance
import sklearn.metrics.pairwise as pw


DATA_DIR = "data"


def load_cities():
    """
    Finds all cities (all directory names) in ./data
    Returns a list of city names
    """
    return os.listdir(DATA_DIR)


def load(cities, data_filename):
    """
    Given a list of city names,
        for each city extract all data from ./data/<city>/<data_filename>.json
    Returns a dictionary of the form:
        {
            <city1>: [<entry1>, <entry2>, ...],
            <city2>: [<entry1>, <entry2>, ...],
            ...
        }
    """
    data = {}
    for city in cities:
        city_data = []
        with open(f"{DATA_DIR}/{city}/{data_filename}.json", "r") as f:
            for line in f:
                city_data.append(json.loads(line))
        data[city] = city_data
    return data


def get_business(city, business_id):
    """
    Given a city name and a business id, return that business's data.
    Returns a dictionary of the form:
        {
            name:str,
            business_id:str,
            stars:str,
            ...
        }
    """
    for business in BUSINESSES[city]:
        if business["business_id"] == business_id:
            return business
    raise IndexError(f"invalid business_id {business_id}")


def get_reviews(city, business_id=None, user_id=None, n=10):
    """
    Given a city name and optionally a business id and/or auser id,
    return n reviews for that business/user combo in that city.
    Returns a dictionary of the form:
        {
            text:str,
            stars:str,
            ...
        }
    """
    def should_keep(review):
        if business_id and review["business_id"] != business_id:
            return False
        if user_id and review["user_id"] != user_id:
            return False
        return True

    reviews = REVIEWS[city]
    reviews = [review for review in reviews if should_keep(review)]
    return random.sample(reviews, min(n, len(reviews)))


def get_user(username):
    """
    Get a user by its username
    Returns a dictionary of the form:
        {
            user_id:str,
            name:str,
            ...
        }
    """
    for city, users in USERS.items():
        for user in users:
            if user["name"] == username:
                return user
    raise IndexError(f"invalid username {username}")


CITIES = load_cities()
USERS = load(CITIES, "user")
BUSINESSES = load(CITIES, "business")
REVIEWS = load(CITIES, "review")
TIPS = load(CITIES, "tip")
CHECKINS = load(CITIES, "checkin")

In [2]:
#making the dataframe to easily extract information for the test-dataframe
def recommend(user_id=None, business_id=None, city=None, n=10):
    """
    Returns n recommendations as a list of dicts.
    Optionally takes in a user_id, business_id and/or city.
    A recommendation is a dictionary in the form of:
        {
            business_id:str
            stars:str
            name:str
            city:str
            adress:str
        }
    """
    
    #make dataframe
    df_BUSINESS = pd.DataFrame()
    
    #make lists to append all values to
    all_ids = []
    all_names = []
    all_cities = []
    all_stars = []
    all_review_count = []
    all_is_open = []
    all_attributes = []
    all_categories = [] 
    all_latitude = []
    all_longitude = []
    all_attributes = []
    all_categories = []
    
    #search in the data and add values to the lists
    
    for city in BUSINESSES:
        for features in BUSINESSES[city]:
            all_ids.append(features['business_id'])
            all_names.append(features['name'])
            all_cities.append(features['city'])
            all_stars.append(features['stars'])
            all_review_count.append(features['review_count'])
            all_is_open.append(features['is_open'])
            all_latitude.append(features['latitude'])
            all_longitude.append(features['longitude'])
            all_categories.append(features['categories'])
            bag = []
            
            #check if the business has attributes and add a list of attributes
            if features['attributes'] != None:
                for element in features['attributes']:
                    if element:
                        bag.append(element)
            all_attributes.append(bag)

    #make columns in the dataframe 
    df_BUSINESS['busId'] = all_ids
    df_BUSINESS['name'] = all_names
    df_BUSINESS['city'] = all_cities
    df_BUSINESS['stars'] = all_stars
    df_BUSINESS['review_count'] = all_review_count
    df_BUSINESS['is_open'] = all_is_open
    df_BUSINESS['latitude'] = all_latitude
    df_BUSINESS['longitude'] = all_longitude
    df_BUSINESS['attributes'] = all_attributes
    df_BUSINESS['categories'] = all_categories
    
    if not city:
        city = random.choice(CITIES)
    return df_BUSINESS
df_BUSINESS = recommend(user_id=None, business_id=None, city=None, n=10)
df_BUSINESS.head()

Unnamed: 0,busId,name,city,stars,review_count,is_open,latitude,longitude,attributes,categories
0,Y6iyemLX_oylRpnr38vgMA,Vita Bella Fine Day Spa,Glendale,5.0,8,0,33.654815,-112.188568,"[RestaurantsPriceRange2, ByAppointmentOnly, Bu...","Nail Salons, Beauty & Spas, Day Spas"
1,0_ohldeFwysbglrTLSGM4Q,The Lounge Barber Shop,Glendale,3.5,6,0,33.623871,-112.186448,"[ByAppointmentOnly, RestaurantsPriceRange2, Bu...","Beauty & Spas, Barbers"
2,iA6IwYzo6qDmY7dZfXdzyg,"Wyatt Brad, DMD",Glendale,5.0,5,0,33.567806,-112.182662,[ByAppointmentOnly],"Health & Medical, Dentists, General Dentistry"
3,OOnIQMkjg1VpAcPRFLnM0g,Stein Mart,Glendale,4.0,7,1,33.64039,-112.187987,"[BikeParking, BusinessParking, BusinessAccepts...","Shopping, Fashion, Home & Garden, Department S..."
4,MvKj4_etnStgYRnXOwFSPw,Glendale Care Center,Glendale,1.0,3,1,33.563827,-112.161232,[BusinessAcceptsCreditCards],"Community Service/Non-Profit, Local Services"


In [3]:
#making the dataframe to easily extract information for the test-dataframe
def user_df():
    #create lists that represent columns
    all_bus_ids = []
    all_user_ids = []
    all_stars = []
    #add values from the data to lists
    for city in REVIEWS:
        for features in REVIEWS[city]: 
            if features['user_id'] not in all_user_ids:
                all_user_ids.append(features['user_id'])
                all_bus_ids.append(features['business_id'])
                all_stars.append(features['stars'])

            
    #create dataframe
    df_REVIEWS = pd.DataFrame()
    
    #add all the listvalues to the dataframe
    df_REVIEWS['userId'] = all_user_ids
    df_REVIEWS['stars'] = all_stars
    df_REVIEWS['busId'] = all_bus_ids
    
    return df_REVIEWS

df_USERS = user_df()
df_USERS.head()
#len(df_USERS)

Unnamed: 0,userId,stars,busId
0,wNyiw6GfVfn5Kphqmux1gw,1.0,5OZlLXjU0FXUbrw8Scja6g
1,r31sCain9mZlv6E9eZUv0g,5.0,N06GSdupPPte-hP0YRFtdA
2,eLzaX8tArcVTI9qOrZxHJw,4.0,Lm61pehc-oyqR6DBAAQNCg
3,S6boRo4sQ1onCOpxhWkobw,1.0,KGqvUQQA4c9GJpi5-znsew
4,7Gl4ah_uRVmXPro_J2PFRg,4.0,nZAJ2a2-Oyxt6GzX97i2pg


In [4]:
#making the dataframe to easily extract information for the test-dataframe
def review_df():
    #create lists that represent columns
    all_bus_ids = []
    all_user_ids = []
    all_stars = []
    all_reviews = []
    #add values from the data to lists
    for city in REVIEWS:
        for features in REVIEWS[city]:
            all_bus_ids.append(features['business_id'])
            all_stars.append(features['stars'])
            all_user_ids.append(features['user_id'])
            all_reviews.append(features['text'])
            
    #create dataframe
    df_REVIEWS = pd.DataFrame()
    
    #add all the listvalues to the dataframe
    df_REVIEWS['userId'] = all_user_ids
    df_REVIEWS['stars'] = all_stars
    df_REVIEWS['busId'] = all_bus_ids
    df_REVIEWS['review'] = all_reviews
    
    return df_REVIEWS
df_REVIEWS = review_df()
df_REVIEWS.head()
#len(df_REVIEWS)

Unnamed: 0,userId,stars,busId,review
0,wNyiw6GfVfn5Kphqmux1gw,1.0,5OZlLXjU0FXUbrw8Scja6g,They keep there appointments on time and are p...
1,r31sCain9mZlv6E9eZUv0g,5.0,N06GSdupPPte-hP0YRFtdA,I LOVE this place! I have found the staff to b...
2,eLzaX8tArcVTI9qOrZxHJw,4.0,Lm61pehc-oyqR6DBAAQNCg,Good fresh and home cooked food. Loved breakfa...
3,S6boRo4sQ1onCOpxhWkobw,1.0,KGqvUQQA4c9GJpi5-znsew,So I've been coming here for years and have ne...
4,7Gl4ah_uRVmXPro_J2PFRg,4.0,nZAJ2a2-Oyxt6GzX97i2pg,"Great wings , and the breakfast is pretty good..."


In [5]:
#this function creates a dataframe for users that reviewed more than 50 businesses
def reviews_test(reviews):
    
    all_user_ids = []
    all_active_users = []
    active_reviews_df = pd.DataFrame()
    
    #add values from the data to lists
    for city in REVIEWS:
        for features in REVIEWS[city]:
            all_user_ids.append(features['user_id'])
    
    #check the amount of reviews each user has placed
    review_amount = Counter(all_user_ids)
    
    #putting all users that reviewed more than 50 businesses in one dataframe and extract the other information from it
    for user in review_amount:
        if review_amount[user] > 50:
            all_active_users.append(user)
 
    for x in all_active_users:
        active_users = df_REVIEWS[(df_REVIEWS['userId'] == x)]
        active_reviews_df = active_reviews_df.append(active_users)
    
    return active_reviews_df

df_TESTREVIEWS = reviews_test(df_REVIEWS)
df_TESTREVIEWS.head()

Unnamed: 0,userId,stars,busId,review
16,9hMqNBBpTl3lBy6qCAyu-A,4.0,C5H-eZfnxBkYN40xcNbDPQ,I have only been here a few times now but it i...
307,9hMqNBBpTl3lBy6qCAyu-A,4.0,He9Z_YfS6fucxkDTCVPv1Q,Cheap tools! I love this place since they carr...
1370,9hMqNBBpTl3lBy6qCAyu-A,4.0,eHhBGqQbDpCcSpz7SZPTbA,"I love NYPD, the pizza is always awesome they ..."
1795,9hMqNBBpTl3lBy6qCAyu-A,4.0,BSQgdjMm5pYSWuGyr-8pIA,The number 12 is the Beach Club turkey sandwic...
2068,9hMqNBBpTl3lBy6qCAyu-A,5.0,dQQ8Qb0K5U97qkcLCZkkWQ,"I am a big fan of Fuddruckers, I think the the..."


In [6]:
def test_business():

    all_active_bus = []
    all_business_ids = [] 
    active_bus_df = pd.DataFrame()
    
    #add values from the data to lists
    for city in REVIEWS:
        for features in REVIEWS[city]:
            all_business_ids.append(features['business_id'])
           
    #check the amount of reviews each business has received from people in the specific town
    review_amount = Counter(all_business_ids)

    
    #putting all businesses that received more than 300 reviews in one dataframe and extract the other information from it
    for bus in review_amount:
        if review_amount[bus] > 300:
            all_active_bus.append(bus)
    for x in all_active_bus:
        active_bus = df_BUSINESS[(df_BUSINESS['busId'] == x)]
        active_bus_df = active_bus_df.append(active_bus)

    
    return active_bus_df

df_BUSTEST = test_business()
df_BUSTEST.head()

Unnamed: 0,busId,name,city,stars,review_count,is_open,latitude,longitude,attributes,categories
353,Lm61pehc-oyqR6DBAAQNCg,Kiss the Cook Restaurant,Glendale,4.0,338,1,33.538165,-112.165516,"[RestaurantsAttire, RestaurantsReservations, B...","Restaurants, Breakfast & Brunch"
225,-Dnh48f029YNugtMKkkI-Q,La Piazza al Forno,Glendale,4.0,513,1,33.538396,-112.183975,"[BusinessAcceptsCreditCards, RestaurantsPriceR...","Pizza, Italian, Restaurants"
286,thlAnPN1ApoNxSnok_fcvA,Black Bear Diner,Glendale,4.0,305,1,33.638271,-112.190578,"[BusinessParking, Caters, RestaurantsDelivery,...","Breakfast & Brunch, American (New), Diners, Re..."
77,IyVdd_IqwUtzQDTxw2W9qw,Saddle Ranch Chop House,Glendale,2.5,574,1,33.532952,-112.260762,"[RestaurantsGoodForGroups, HasTV, BestNights, ...","American (Traditional), Nightlife, Steakhouses..."
363,gA9hCYY7MYl9oZ3aym5dvw,Tokyo Lobby Sushi and Grill,Glendale,4.0,342,1,33.638374,-112.183773,"[RestaurantsAttire, BusinessAcceptsCreditCards...","Restaurants, Sushi Bars, Japanese"


In [7]:
def test_users():
    
    active_users = set()
    all_active_users = pd.DataFrame()
    #filtering all the duplicates out of the dataframe 
    for x in df_TESTREVIEWS['userId']:
        active_users.add(x)
        
    #converting set to list to add the list to the dataframe
    active_users = list(active_users)
    all_active_users['userId'] = active_users
    
    return all_active_users

df_TESTUSERS = test_users()
df_TESTUSERS

Unnamed: 0,userId
0,BTxxGYdw8CNV7UZGc9YU1Q
1,el3TmKFEFzZOcNbCw2FNlQ
2,Q4Qfu-3vYtL1LRm2X1b0Gg
3,hdP51SHHpIzOVywwxqs5fA
4,iDlkZO2iILS8Jwfdy7DP9A
5,kbfpED-6FURIsNlsAXqc2g
6,XEHZoTDWjw3w-gcQyYIe4g
7,ffPY_bHX8vLebHu8LBEqfg
8,PdgpUK6fHArEXsPcKFHXOg
9,ZQsculZs_WMVFnBGQee1EQ


In [8]:
def attribute_similarity(matrix, id1, id2):
    similar = 0
    bag = []
    
    #search for all features with the given id
    feature1 = matrix[(matrix['busId'] == id1)]['attributes'].item()
    feature2 = matrix[(matrix['busId'] == id2)]['attributes'].item()
    #append all the items to a bag of features of item 1
    for item1 in feature1:
        bag.append(item1)
        
    #append all the items to a bag of features of item 2
    for item2 in feature2:
        bag.append(item2)
        
    #counting all the words and see if the words in the bags are simalair
    count_bag = Counter(bag)
    total_words = len(bag)
    for element in count_bag:
        if count_bag[element] > 1:
            similar += count_bag[element]
    if total_words == 0:
        return total_words
    
    #return the percentage of similair attributes
    return similar/total_words

In [9]:
def categories_similarity(matrix, id1, id2):
    similar = 0
    bag = []
        
    #search for all features with the given id
    feature1 = matrix[(matrix['busId'] == id1)]['categories'].item()
    feature2 = matrix[(matrix['busId'] == id2)]['categories'].item()
    
    if feature1 == None or feature2 == None:
        return 0
        
    #append all the items to a bag of features of item 1
    for item1 in feature1:
        bag.append(item1)
        
    #append all the items to a bag of features of item 2
    for item2 in feature2:
        bag.append(item2)
        
    #counting all the words and see if the words in the bags are simalair
    count_bag = Counter(bag)
    total_words = len(bag)
    for element in count_bag:
        if count_bag[element] > 1:
            similar += count_bag[element]
    if total_words == 0:
        return total_words
        
    #return the percentage of similair categories
    return similar/total_words


In [10]:
def sim_matrix(matrix):
    #create an empty dataframe
    similarity_matrix = pd.DataFrame(matrix, index = matrix['busId'], columns = matrix['busId'])
    #get the index of the matrix
    business_ids = matrix['busId']
    
    #iterate over all the business ids and add the similarity in the matrix
    for business in business_ids:
        for business2 in business_ids:
            similarity_matrix.loc[business][business2] = ((attribute_similarity(matrix, business, business2)* 0.5) + categories_similarity(matrix, business, business2)) 
            if business2 == business:
                similarity_matrix.loc[business][business2] = 0

    return similarity_matrix

sim_matrix = sim_matrix(df_BUSTEST)
sim_matrix.head()

busId,Lm61pehc-oyqR6DBAAQNCg,-Dnh48f029YNugtMKkkI-Q,thlAnPN1ApoNxSnok_fcvA,IyVdd_IqwUtzQDTxw2W9qw,gA9hCYY7MYl9oZ3aym5dvw,eSENXDnnNSo-MLLkWJQ8Vg,6KqLBLkHw78t33qObWJxpQ,fOnpoMXToa5Qiyg95E6ENg,fS9a8AGrcwWPf_9vfn5wPQ,woP-omtF0PobX4JxXpZa7Q,...,vOyZkFAknjVg-Fwye3TKaQ,GPVHfoDjSOivqgIDjIOA-Q,KyUFj09bfowyddCuOTP4mg,ojGCRzEym2W3tSTo69fhxA,G4X5w5vRQNEFUmrJo6JakA,yydyoWCY3LJYlPwVLKbJ3Q,qdHFUvFhZjvNie3cZiJb9w,gcNC4k7TZJVX_1YHdIkDNQ,qXoVYptJRQg6Jt7NLvGv-A,YM4nM68YTzru8Hj_c5SvHA
busId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Lm61pehc-oyqR6DBAAQNCg,0.0,1.335753,1.406457,1.396429,1.361486,1.403201,1.306366,1.390742,1.385569,1.473684,...,1.352472,1.323077,1.325536,1.391176,1.266304,1.28431,1.364894,1.390596,1.348862,1.277209
-Dnh48f029YNugtMKkkI-Q,1.335753,0.0,1.360641,1.357407,1.353153,1.37611,1.31339,1.330931,1.381725,1.362069,...,1.352632,1.372425,1.4,1.362755,1.246377,1.288811,1.341667,1.356052,1.352851,1.284255
thlAnPN1ApoNxSnok_fcvA,1.406457,1.360641,0.0,1.432249,1.370784,1.389465,1.348495,1.466619,1.420221,1.406457,...,1.415992,1.371375,1.356567,1.437421,1.343028,1.34225,1.410265,1.409563,1.367788,1.379411
IyVdd_IqwUtzQDTxw2W9qw,1.396429,1.357407,1.432249,0.0,1.391363,1.374207,1.355691,1.426816,1.411224,1.396429,...,1.409732,1.378155,1.363462,1.408835,1.386775,1.341682,1.400794,1.412758,1.372909,1.422078
gA9hCYY7MYl9oZ3aym5dvw,1.361486,1.353153,1.370784,1.391363,0.0,1.366444,1.323684,1.34375,1.418919,1.361486,...,1.328071,1.32443,1.361486,1.346154,1.255556,1.261594,1.326122,1.473333,1.322999,1.287531


In [11]:
def split_data(data,d = 0.75):
    """ split data in a training and test set 
       `d` is the fraction of data in the training set"""
    np.random.seed(seed=5)
    mask_test = np.random.rand(data.shape[0]) < d
    return data[mask_test], data[~mask_test]
training_set, test_set = split_data(df_TESTREVIEWS)
#training_set
test_set.head()

Unnamed: 0,userId,stars,busId,review
307,9hMqNBBpTl3lBy6qCAyu-A,4.0,He9Z_YfS6fucxkDTCVPv1Q,Cheap tools! I love this place since they carr...
1795,9hMqNBBpTl3lBy6qCAyu-A,4.0,BSQgdjMm5pYSWuGyr-8pIA,The number 12 is the Beach Club turkey sandwic...
6538,9hMqNBBpTl3lBy6qCAyu-A,4.0,EPxGfzUFrn4IHBctlfKF9g,"I love this gym, I do wish it were closer to a..."
11607,9hMqNBBpTl3lBy6qCAyu-A,5.0,fOnpoMXToa5Qiyg95E6ENg,"This place rocks, great diner with great food...."
20688,9hMqNBBpTl3lBy6qCAyu-A,5.0,Qa_jibFadTx2iUx99yTKsw,Brian with Man Cave Entertainment shares this ...


In [12]:
def utility_matrix(bus_df, user_df, review_df):
    #create an empty dataframe
    utility_matrix = pd.DataFrame(index = bus_df['busId'], columns = user_df['userId'])
    rating_amount = []
    #get the index of the dataframes
    business_ids = bus_df['busId']
    user_ids = user_df['userId']
    
    #iterate over all the business ids and add the values if possible
    for business in business_ids:
        for user in user_ids:
            rating = review_df[(review_df['busId'] == business) & (review_df['userId'] == user)]['stars']
            if len(rating) is 1:
                rating_amount.append(rating)
                utility_matrix.loc[business][user] = rating.item()
    
    print(len(rating_amount))
    return utility_matrix

utility = utility_matrix(df_BUSTEST, df_TESTUSERS, df_TESTREVIEWS)
utility.head()


334


userId,BTxxGYdw8CNV7UZGc9YU1Q,el3TmKFEFzZOcNbCw2FNlQ,Q4Qfu-3vYtL1LRm2X1b0Gg,hdP51SHHpIzOVywwxqs5fA,iDlkZO2iILS8Jwfdy7DP9A,kbfpED-6FURIsNlsAXqc2g,XEHZoTDWjw3w-gcQyYIe4g,ffPY_bHX8vLebHu8LBEqfg,PdgpUK6fHArEXsPcKFHXOg,ZQsculZs_WMVFnBGQee1EQ,...,MMf0LhEk5tGa1LvN7zcDnA,SR9KgvoaxcBsSd4Gds8LnQ,Z128ihQea7BLPh2T9q9sKA,DPitNu466172os6m0Yri1Q,qgtRPFqskbohnS80_t9ORQ,bnn_DmLZEi9n0E4OMuRXbA,4d13xAX2jp2EbGF8I9eZZw,TCPLaprJm4FFei2DkW6K6g,z5TgzjEw8YfEbD4KkgKRxw,2D2bPTst5z5BdaHTdYTA-Q
busId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Lm61pehc-oyqR6DBAAQNCg,,,3.0,,,3.0,,,,,...,,,3.0,1.0,,,,,,
-Dnh48f029YNugtMKkkI-Q,,5.0,4.0,,3.0,,4.0,,,,...,,,3.0,4.0,,,,4.0,,
thlAnPN1ApoNxSnok_fcvA,4.0,,4.0,,,,,4.0,5.0,,...,,,3.0,,,,,3.0,,
IyVdd_IqwUtzQDTxw2W9qw,,,,,5.0,,,4.0,,,...,,,,,,,,,,
gA9hCYY7MYl9oZ3aym5dvw,5.0,5.0,,,,,,,,,...,,,,,,,,,,


In [13]:
def predict_vectors(user_ratings, similarities):
    # select only movies actually rated by user
    relevant_ratings = user_ratings.dropna()
    
    # select corresponding similairties
    similarities_s = similarities[relevant_ratings.index]
    # select neighborhood
    similarities_s = similarities_s[similarities_s > 0.0]
    relevant_ratings = relevant_ratings[similarities_s.index]
    
    # if there's nothing left return a prediction of 0
    norm = similarities_s.sum()
    if(norm == 0):
        return 0
    
    # compute a weighted average (i.e. neighborhood is all) 
    return np.dot(relevant_ratings, similarities_s)/norm


def predict_ids(similarity, utility, userId, itemId):
    # select right series from matrices and compute
    if userId in utility.columns and itemId in similarity.index:
        return predict_vectors(utility.loc[:,userId], similarity[itemId])
    return 0



def predict_ratings(similarity, utility, to_predict):
    """Predicts the predicted rating for the input test data.
    
    Arguments:
    similarity -- a dataFrame that describes the similarity between items
    utility    -- a dataFrame that contains a rating for each user (columns) and each movie (rows). 
                  If a user did not rate an item the value np.nan is assumed. 
    to_predict -- A dataFrame containing at least the columns movieId and userId for which to do the predictions
    """
    #copy input (don't overwrite)
    ratings_test_c = to_predict.copy()
    #apply prediction to each row
    ratings_test_c['predicted rating'] = to_predict.apply(lambda row: predict_ids(similarity, utility, row['userId'], row['busId']), axis=1)
    return ratings_test_c

prediction = predict_ratings(sim_matrix, utility, test_set[['userId', 'busId', 'stars']])
prediction.head()

Unnamed: 0,userId,busId,stars,predicted rating
307,9hMqNBBpTl3lBy6qCAyu-A,He9Z_YfS6fucxkDTCVPv1Q,4.0,0.0
1795,9hMqNBBpTl3lBy6qCAyu-A,BSQgdjMm5pYSWuGyr-8pIA,4.0,0.0
6538,9hMqNBBpTl3lBy6qCAyu-A,EPxGfzUFrn4IHBctlfKF9g,4.0,0.0
11607,9hMqNBBpTl3lBy6qCAyu-A,fOnpoMXToa5Qiyg95E6ENg,5.0,4.197531
20688,9hMqNBBpTl3lBy6qCAyu-A,Qa_jibFadTx2iUx99yTKsw,5.0,0.0


In [42]:
def mse(predicted_ratings):
    """Computes the mean square error between actual ratings and predicted ratings
    
    Arguments:
    predicted_ratings -- a dataFrame containing the columns rating and predicted rating
    """
    predicted_ratings = predicted_ratings[predicted_ratings['predicted rating'] > 0]
    print(predicted_ratings)
    diff = predicted_ratings['stars'] - predicted_ratings['predicted rating']
    return (diff**2).mean()

mse(prediction)

                       userId                   busId  stars  predicted rating
11607  9hMqNBBpTl3lBy6qCAyu-A  fOnpoMXToa5Qiyg95E6ENg    5.0          4.197531
42538  9hMqNBBpTl3lBy6qCAyu-A  9UvteZaWHpBbqrn5lv1uUw    5.0          4.215522
81637  9hMqNBBpTl3lBy6qCAyu-A  RtnTbhWYw4QjKhoTBV37OA    4.0          4.282853
20790  SBnmBbp_2em6cdA7z0-duQ  dIA4aUlrRHthZDH5oqwuvg    2.0          4.193035
25525  SBnmBbp_2em6cdA7z0-duQ  dIA4aUlrRHthZDH5oqwuvg    4.0          4.193035
108    nWh05uXTfV-L7s3iZon4Fg  gA9hCYY7MYl9oZ3aym5dvw    1.0          4.126243
10455  nWh05uXTfV-L7s3iZon4Fg  eSENXDnnNSo-MLLkWJQ8Vg    5.0          3.781609
44966  nWh05uXTfV-L7s3iZon4Fg  9UvteZaWHpBbqrn5lv1uUw    3.0          3.767869
71549  nWh05uXTfV-L7s3iZon4Fg  GPVHfoDjSOivqgIDjIOA-Q    4.0          3.755235
87323  nWh05uXTfV-L7s3iZon4Fg  ojGCRzEym2W3tSTo69fhxA    4.0          3.757891
43463  PdgpUK6fHArEXsPcKFHXOg  URwhWyzVmQLobOXDql43Dg    5.0          3.659996
46808  PdgpUK6fHArEXsPcKFHXOg  vKKSF1_1JNnRMaoNPQb5L

1.1312642769017263

In [41]:
def mse_random(prediction):
    random_list = []
    df_copy = prediction.copy()

    #iterate over the series and add values between 0.5 and 5 with an interval of 0.5  
    for x in df_copy.values:
        random_list.append(np.random.choice(np.arange(0.5 , 5.5 , 0.5)))

    #add listvalues to series
    df_copy['predicted rating'] = random_list
    #calculate the mean squared error 
    mse_random = mse(df_copy) 
    
    return mse_random
mse_random(prediction)

4.405464480874317