In [52]:
from __future__ import division, print_function
from collections import Counter
import pandas as pd
import math


In [3]:
user_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

# Overview
print(len(user_interests))
c = 0
for inter in user_interests:
    for ele in inter:
        c += 1
print(c)
    

15
67


In [11]:
# Recommend popular items

popular_interests = Counter(interest
                            for user_interest in user_interests
                            for interest in user_interest).most_common()
# top 10 most popular items
popular_interests[:10]

def most_popular(user_interests, max_res=5):
    suggestions = [(item, frequency) 
                   for item, frequency in popular_interests
                   if item not in user_interests]
    return suggestions[:max_res]

In [13]:
# recommend for user 3
# user 3 interests: ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"]
most_popular(user_interests[2])

[('R', 4), ('Big Data', 3), ('HBase', 3), ('Java', 3), ('statistics', 3)]

In [32]:
# helper method dot product
def dot(v, w):
    """gets the dot product of two vectors which is the sum of their componentwise products
    Args:
        v: vector
        w: vector
    retrun:
        sum of their componentwise products: v_1*w_1 + ... + v_n*w_n
    """
    return sum(v_i*w_i for v_i, w_i in zip(v,w))

## user based collaborative filtering


In [37]:
# cosine similarity
# similar users will mean users interests vector most nearly point in the same direction
# range of [0: with no identical interest, 1 all interests are same]

# it measures the angle btw v and w
# v[i] would be 1 if user specified ith interest, otherwise 0
def cosine_similarity(v, w):
    return dot(v, w) / math.sqrt(dot(v, v) * dot(w, w))

In [38]:
# step 1: collect all interests and assign indices to them
# use set comprehension to find the unique interest - put in a list - and sort
users_interest = [] # list of lists
unique_interests = sorted(list({
    interest
    for user_interest in users_interest
    for interest in user_interest
}))

In [49]:
# step 2: produce 0s and 1s interest vector for each user
def make_user_interest_vector(user_interests):
    """given a list of interests, produce a vector whose ith element
    is 1 if unique_interests[i] is in the list, 0 otherwise"""
    return [1 if interest in user_interests else 0 for interest in unique_interests]
        

In [50]:
# step 3: create a matrix of user interests simply by mapping this func
# to the list of user_interests
# user_interst_matrix[i][j] equals 1 if user i specified interest j, 0 otherwise

user_interst_matrix = map(make_user_interest_vector, user_interests)

In [53]:
# step 4: compute pairwise similarities btw all of the users
user_similaritites = [[cosine_similarity(interest_vector_i, interest_vector_j)
    for interest_vector_j in user_interst_matrix]
    for interest_vector_i in user_interst_matrix]

ZeroDivisionError: float division by zero

In [61]:
user_similarities = []
def most_similar_user_to(user_id):
    # find other users with non-zero similarity
    pairs = [(other_user_id, similarity) 
             for other_user_id, similarity in enumerate(user_similarities[user_id])
             if other_user_id != user_id and similarity > 0]
    
    return sorted(pairs, key= lambda x:x[1], reverse=True)

In [66]:
import operator

def user_based_suggestions(user_id, include_current_interests=False):
    # sum up item based on user similarities 
    suggestions = defaultdict(float) # collections.defaultdict - 
    for other_user_id, similarity in most_similar_user_to(user_id):
        for interest in users_interest[other_uer_id]:
            suggestions[interest] += similarity
    
    # convert similarities to a sorted list
    suggestions = sorted(suggestions.items(), key=operator.itemgetter(1), reverse=True ) # dict is not sortable
    # exclude already-have interests
    if include_current_interests:
        return suggestions
    # return suggestions (sorted items)
    else:
        # list of tuples
        return [(suggestion, weight) 
                for suggestion, weight in suggestions
                if suggestion not in users_interest[user_id]] # only if an item not an interest of this user

## Item based Collaborative Filtering

In [73]:
# generate suggestions for each user by aggregating interests that
# are similar to his current interests

# rows: interest, cols: user
interest_user_matrix = [[user_interest_vector[j]
                         for user_interest_vector in user_interest_matrix]
                         for j, _ in enumerate(unique_interests)]
# row j of interest_user_matrix is column j of user_interest_matrix
# 1 for each user with that interst, 0 for user without that interest

# if precisely the same users are interested in two topics, their similaritites will be one
# if no two users are interested in both topics, their similarity are 0

interest_similarities = [[cosine_similarity(user_vector_i, user_vector_j)
                          for user_vector_i in interest_user_matrix]
                          for user_vector_j in interest_user_matrix]

In [77]:
def most_similar_interests_to(interest_id):
    similarities = interest_similarities[interest_id]
    
    pairs = [(unique_interests[other_interest_id], similarity)
             for other_interest_id, similarity in enumerate(similarities)
             if other_interest_id != interest_id and similarity > 0]
    
    return sorted(pairs, key=operator.itemgetter(1), reverse=True)

In [None]:
def item_based_suggestions(user_id, include_current_interest=False):
    # add up the similar interests
    suggestions = defaultdict(float)
    user_interest_vector = user_interest_matrix[user_id]
    for interest_id, if_interested in enumerate(user_interest_vector):
        if if_interested == 1:
            similar_interests = most_similar_interests_to(interest_id)
            for interest, similarity in similar_interests:
                suggestions[interest] += similarity
                
    # sort interests by similarity - sort the dict by similarity, return list
    suggestions = sorted(suggestions.items(), key=operator.itemgetter(1), reverse=True)
    
    # preclude already-have interests
    if include_current_interest:
        return suggestions
    # return suggestions
    else:
        return [(suggestion, weight) 
                for suggestion, weight in suggestions
                if suggestion not in users_interest[user_id]]