In [3]:
import pandas as pd

In [4]:
training = pd.read_csv('data/training.csv')

In [5]:
users = pd.read_csv('data/users.dat', delimiter='::', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip'])

  """Entry point for launching an IPython kernel.


In [6]:

#testing = pd.read_csv('data/fake_testing.csv')
#movies = pd.read_csv('data/movies.dat', delimiter='::', names=['MovieId', 'Title', 'Genres'])
movies = pd.read_csv('data/movies.csv')

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
movies['genres'] = movies['genres'].str.replace('|', " ")

In [10]:
movies.set_index('title', inplace = True)

In [11]:
movies.head()

Unnamed: 0_level_0,movieId,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),1,Adventure Animation Children Comedy Fantasy
Jumanji (1995),2,Adventure Children Fantasy
Grumpier Old Men (1995),3,Comedy Romance
Waiting to Exhale (1995),4,Comedy Drama Romance
Father of the Bride Part II (1995),5,Comedy


In [12]:
movies.head()

Unnamed: 0_level_0,movieId,genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),1,Adventure Animation Children Comedy Fantasy
Jumanji (1995),2,Adventure Children Fantasy
Grumpier Old Men (1995),3,Comedy Romance
Waiting to Exhale (1995),4,Comedy Drama Romance
Father of the Bride Part II (1995),5,Comedy


In [201]:
val=movies.groupby('genres').count()


In [202]:
import matplotlib.pyplot as plt

In [2]:
#fig,ax=plt.subplots()
#ax.hist(val)

In [13]:
movies.shape

(62423, 2)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

class ItemRecommender():
    '''
    Content based item recommender
    '''
    def __init__(self, similarity_measure=None):
        self.similarity_matrix = None
        self.item_names = None

        if similarity_measure == None:
            self.similarity_measure = cosine_similarity
        else:
            self.similarity_measure = similarity_measure

    
    def fit(self, X, index=None):
        '''
        Takes a numpy array of the item attributes and creates the similarity matrix
        INPUT -
            X: NUMPY ARRAY - Rows are items, columns are feature values / or DF
            index: LIST - List of the item names/titles in order of the numpy arrray
        
        OUTPUT - None
        Notes:  You might want to keep titles and X as attributes to refer to them later
        Create the a similarity matrix of item to item similarity
        '''

        # While keeping this as a sparse matrix would be best the cosign sim
        # function returns a array so there is no reason.
        
        if isinstance(X, pd.DataFrame):
            self.item_counts = X
            self.item_names = X.index
            self.similarity_df = pd.DataFrame(self.similarity_measure(X.values, X.values),
                 index = self.item_names)
        else:
            self.item_counts = X
            self.similarity_df = pd.DataFrame(self.similarity_measure(X, X),
                 index = index)
            self.item_names = self.similarity_df.index

        
    def get_recommendations(self, item, n=5):
        '''
        Returns the top n items related to the item passed in
        INPUT:
            item    - STRING - Name of item in the original DataFrame 
            n       - INT    - Number of top related items to return 
        OUTPUT:
            items - List of the top n related item names
        For a given item find the n most similar items to it (this can be done using the similarity matrix created in the fit method)
        '''
        return self.item_names[self.similarity_df.loc[item].values.argsort()[-(n+1):-1]].values[::-1]


    def get_user_profile(self, items):
        '''
        Takes a list of items and returns a user profile. A vector representing the likes of the user.
        INPUT: 
            items  -   LIST - list of movie names user likes / has seen
        OUTPUT: 
            user_profile - NP ARRAY - array representing the likes of the user 
                    The columns of this will match the columns of the trained on matrix
    
        Using the list of items liked by the user create a profile which will be a 1 x number of features array.  This should be the addition of the values for all liked item features (you can choose how to normalize if you think it is needed)
        '''
        user_profile = np.zeros(self.item_counts.shape[1])
        for item in items:
            user_profile += self.item_counts.loc[item].values

        return user_profile


    def get_user_recommendation(self, items, n=5):
        '''
        Takes a list of movies user liked and returns the top n items for that user
        INPUT 
            items  -   LIST - list of movie names user likes / has seen
            n -  INT - number of items to return
        OUTPUT 
            items - LIST - n recommended items
    
        Make use of the get_user_profile method to create a user profile that will be used to get the similarity to all items and recommend the top n.
        '''
        num_items = len(items)
        user_profile = self.get_user_profile(items)

        user_sim =  self.similarity_measure(self.item_counts, user_profile.reshape(1,-1))

        return self.item_names[user_sim[:,0].argsort()[-(num_items+n):-num_items]].values[::-1]


In [15]:
count = CountVectorizer()
count_matrix = count.fit_transform(movies['genres'])

In [16]:
indices = pd.Series(movies.index)

In [17]:
rec = ItemRecommender()
count_df = pd.DataFrame(count_matrix.todense(), index=indices.values)

In [18]:
count_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
Toy Story (1995),0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Jumanji (1995),0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
count_df.shape

(62423, 24)

In [20]:
rec.fit(count_df)

In [21]:
print(rec.get_recommendations('Pompatus of Love, The (1996)'))

['My Father Is on the Tree (1969)' 'Midvinterduell (1983)'
 'Dalecarlians (2004)'
 'Marie-Jo and Her 2 Lovers (Marie-Jo et ses 2 amours) (2002)'
 'The Thursday (1964)']


In [22]:
profile = rec.get_user_profile(['Toy Story (1995)','Jumanji (1995)'])

In [23]:
profile

array([0., 2., 1., 2., 1., 0., 0., 0., 2., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])

In [26]:
profile[1]

2.0

In [27]:
count.get_feature_names()[1]

'adventure'

In [30]:
print(rec.get_user_recommendation(['Toy Story (1995)','Jumanji (1995)']))

['Penguin Highway (2018)' 'Antz (1998)' 'The Good Dinosaur (2015)'
 'Asterix and the Vikings (Astérix et les Vikings) (2006)'
 'DuckTales: The Movie - Treasure of the Lost Lamp (1990)']


In [113]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [114]:
cv = CountVectorizer()

In [115]:
count_matrix = cv.fit_transform(movies["Genres"])

In [116]:
cosine_sim = cosine_similarity(count_matrix) 
movie_user_likes = "Avatar"

In [119]:
def get_title_from_index(index):
    return movies[movies.MovieId == index]["Title"]
def get_index_from_title(movie_user_likes):
    return movies[movies['Title'] ==movie_user_likes]["MovieId"]

In [118]:
## Step 6: Get index of this movie from its title
movie_index = get_index_from_title(movie_user_likes)
similar_movies =  list(enumerate(cosine_sim[movie_index]))

## Step 7: Get a list of similar movies in descending order of similarity score
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)

## Step 8: Print titles of first 50 movies
i=0
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>50:
        break

AttributeError: 'DataFrame' object has no attribute 'Title'