In [1]:
# Data mining studies 
# Recommendation systems

# sample slice from a music streaming website rating csv
# Ratings keyed by user name; each value maps artist name -> that user's rating.
# Users have not rated every artist, so the inner dicts have different key sets.
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
         "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},
         "Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}
        }

In [2]:
def manhattan(rating1, rating2):
    """Return the Manhattan (L1) distance between two rating dicts.

    Only items present in both dicts contribute to the total. When the two
    dicts share no items the result is 0.

    Args:
        rating1: mapping of item -> numeric rating.
        rating2: mapping of item -> numeric rating.

    Returns:
        Sum of absolute rating differences over the shared items.
    """
    total = 0
    for item, score in rating1.items():
        if item not in rating2:
            continue  # not co-rated, contributes nothing
        total += abs(score - rating2[item])
    return total

In [6]:
# Manhattan distance between Hailey and Veronica over the artists both have rated.
manhattan(users["Hailey"], users["Veronica"])

2.0

In [7]:
def computeNearestNeighbor(username, users):
    """Rank every other user by Manhattan distance to `username`.

    Args:
        username: key in `users` identifying the reference user.
        users: mapping of user name -> ratings dict.

    Returns:
        List of (distance, user) tuples sorted ascending, so the closest
        user comes first; equal distances are ordered by user name.
    """
    return sorted(
        (manhattan(ratings, users[username]), other)
        for other, ratings in users.items()
        if other != username
    )

In [30]:
# All other users ranked by Manhattan distance to Hailey, closest first.
computeNearestNeighbor("Hailey", users)

[(2.0, 'Veronica'),
 (4.0, 'Chan'),
 (4.0, 'Sam'),
 (4.5, 'Dan'),
 (5.0, 'Angelica'),
 (5.5, 'Bill'),
 (7.5, 'Jordyn')]

In [33]:
import pandas as pd

# Wrap the nested dict in a Series indexed by user name. Each element is a whole
# ratings dict, so the Series has object dtype and describe() only reports
# count/unique/top/freq rather than numeric statistics.
users_pd = pd.Series(users, name="Users")
users_pd.describe()

count                                                     8
unique                                                    8
top       {'Broken Bells': 4.5, 'Deadmau5': 4.0, 'Norah ...
freq                                                      1
Name: Users, dtype: object

In [34]:
def recommend(username, users):
    """Recommend artists to `username` based on the single nearest neighbour.

    The nearest neighbour is the first entry returned by
    computeNearestNeighbor. Every artist that neighbour has rated and
    `username` has not is recommended, carrying the neighbour's rating.

    Args:
        username: key in `users` identifying who to recommend for.
        users: mapping of user name -> {artist: rating}.

    Returns:
        List of (artist, rating) tuples sorted by rating, highest first.
        Empty when there is no other user to compare against.
    """
    neighbors = computeNearestNeighbor(username, users)
    if not neighbors:
        # No other users: nothing to base a recommendation on
        # (previously this raised IndexError on neighbors[0]).
        return []
    nearest = neighbors[0][1]
    userRatings = users[username]
    recommendations = [
        (artist, rating)
        for artist, rating in users[nearest].items()
        if artist not in userRatings
    ]
    recommendations.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
    return recommendations

In [49]:
# Artists Hailey's nearest neighbour has rated that Hailey has not, best rated first.
recommend('Hailey', users)

[('Phoenix', 4.0), ('Blues Traveler', 3.0), ('Slightly Stoopid', 2.5)]

In [50]:
#use if data is dense + magnitude of attribute values is important
def Minkowski(rating1, rating2, r):
    """Return the Minkowski distance of order `r` between two rating dicts.

    Only items present in both dicts are compared. r=1 gives the Manhattan
    distance and r=2 the Euclidean distance.

    Args:
        rating1: mapping of item -> numeric rating.
        rating2: mapping of item -> numeric rating.
        r: order of the distance.

    Returns:
        The r-th root of the summed r-th powers of the absolute differences,
        or 0 when the dicts have no items in common.
    """
    shared = [item for item in rating1 if item in rating2]
    if not shared:
        return 0

    total = 0
    for item in shared:
        total += abs(rating1[item] - rating2[item]) ** r
    return total ** (1/r)

In [51]:
def alteredCNN(username, users, r):
    """Return the sorted Minkowski distances from `username` to every other user.

    Unlike computeNearestNeighbor, the result holds only the distances; the
    user each distance belongs to is not included.

    Args:
        username: key in `users` identifying the reference user.
        users: mapping of user name -> ratings dict.
        r: order passed through to Minkowski.

    Returns:
        List of distances in ascending order.
    """
    return sorted(
        Minkowski(ratings, users[username], r)
        for other, ratings in users.items()
        if other != username
    )


In [54]:
# Only the last expression in a cell is displayed, so the Minkowski(...) result
# on the first line is computed and discarded; the output is alteredCNN's list.
Minkowski(users["Veronica"], users["Chan"], 2)
alteredCNN("Veronica", users, 2)

[1.4142135623730951,
 1.6583123951777,
 2.449489742783178,
 2.449489742783178,
 2.449489742783178,
 3.3541019662496847,
 3.905124837953327]

In [58]:
# How similar are the two users' ratings?
# Use when the data is subject to grade inflation (users rate on different scales).
def pearsonCorrelation(rating1, rating2):
    """Return the Pearson correlation of two users over their co-rated items.

    Uses the single-pass computational formula, accumulating the sums of
    x, y, x*y, x^2 and y^2 over the items present in both dicts.

    Args:
        rating1: mapping of item -> numeric rating.
        rating2: mapping of item -> numeric rating.

    Returns:
        The correlation coefficient, or 0 when it is undefined: no items in
        common, or either user's co-rated scores are all identical (which
        includes the case of a single shared item).
    """
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x_sqrd = 0
    sum_y_sqrd = 0
    count = 0
    for key in rating1:
        if key in rating2:
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x_sqrd += x ** 2
            sum_y_sqrd += y ** 2
            count += 1
    if count == 0:
        return 0

    numerator = sum_xy - (sum_x * sum_y) / count
    # Clamp at zero: rounding can leave these a hair below 0, and a negative
    # base raised to 0.5 yields a complex number in Python 3.
    spread_x = max(sum_x_sqrd - sum_x ** 2 / count, 0)
    spread_y = max(sum_y_sqrd - sum_y ** 2 / count, 0)
    denominator = spread_x ** 0.5 * spread_y ** 0.5
    if denominator == 0:
        # Zero variance on either side: correlation is undefined
        # (previously this raised ZeroDivisionError).
        return 0
    return numerator / denominator
            
            

In [60]:
# Only the last expression in a cell is displayed, so the Angelica/Bill value is
# computed and discarded; the output shown is the Angelica/Hailey correlation.
pearsonCorrelation(users["Angelica"],users["Bill"])
pearsonCorrelation(users["Angelica"],users["Hailey"])

0.42008402520840293

In [71]:
#ignores 0-0 matchings
#use if data is sparse
def cosineSimiliarity(rating1, rating2):
    """Return the cosine similarity of two users over their co-rated items.

    Both the dot product and the two vector lengths are accumulated only
    over items present in both dicts. This differs from the textbook
    definition, where each length covers all of that user's ratings.

    Args:
        rating1: mapping of item -> numeric rating.
        rating2: mapping of item -> numeric rating.

    Returns:
        dot / (|x| * |y|) over the shared items, or 0 when that is undefined:
        no items in common, or one side's shared ratings are all zero.
    """
    dot = 0
    x_sqrd = 0
    y_sqrd = 0
    for key in rating1:
        if key in rating2:
            x = rating1[key]
            y = rating2[key]
            dot += x * y
            x_sqrd += x ** 2
            y_sqrd += y ** 2
    denominator = x_sqrd ** 0.5 * y_sqrd ** 0.5
    if denominator == 0:
        # Nothing to compare (previously this raised ZeroDivisionError).
        return 0
    return dot / denominator
            

In [72]:
# Cosine similarity between Angelica and Veronica over the artists both have rated.
cosineSimiliarity(users["Angelica"],users["Veronica"])

0.9790636038900089