In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the movie catalogue and the user ratings from CSV files in the
# current working directory.
movies_df, ratings_df = map(pd.read_csv, ('movies.csv', 'ratings.csv'))

In [3]:
# Preview the first rows of the movies table (movieId, title, genres).
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Pull the parenthesised four-digit release year, e.g. "(1995)", out of each
# title. The pattern is a raw string so the regex backslashes are not treated
# as (invalid) Python string escapes.
movies_df['year'] = movies_df.title.str.extract(r'(\(\d\d\d\d\))', expand=False)
movies_df['year']

0       (1995)
1       (1995)
2       (1995)
3       (1995)
4       (1995)
         ...  
9737    (2017)
9738    (2017)
9739    (2017)
9740    (2018)
9741    (1991)
Name: year, Length: 9742, dtype: object

In [5]:
# Keep only the four digits of the year, dropping the surrounding parentheses.
# Raw string: the backslashes belong to the regex, not to Python.
movies_df['year'] = movies_df.year.str.extract(r'(\d\d\d\d)', expand=False)
movies_df['year']

0       1995
1       1995
2       1995
3       1995
4       1995
        ... 
9737    2017
9738    2017
9739    2017
9740    2018
9741    1991
Name: year, Length: 9742, dtype: object

In [6]:
# Remove the "(YYYY)" suffix from every title.
# regex=True is passed explicitly: newer pandas versions treat the pattern as
# a literal string by default, which would leave the titles unchanged.
movies_df['title'] = movies_df.title.str.replace(r'(\(\d\d\d\d\))', '', regex=True)
movies_df['title']

  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')


0                                Toy Story 
1                                  Jumanji 
2                         Grumpier Old Men 
3                        Waiting to Exhale 
4              Father of the Bride Part II 
                       ...                 
9737    Black Butler: Book of the Atlantic 
9738                 No Game No Life: Zero 
9739                                 Flint 
9740          Bungo Stray Dogs: Dead Apple 
9741          Andrew Dice Clay: Dice Rules 
Name: title, Length: 9742, dtype: object

In [7]:
# Trim the whitespace left behind after removing the year. The vectorised
# string accessor also passes missing titles through instead of raising.
movies_df['title'] = movies_df['title'].str.strip()

In [8]:
# Check the cleaned table: the year now lives in its own column.
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [9]:
# Genres are not used by the collaborative approach, so drop the column.
# `columns=` replaces the positional axis argument, which newer pandas
# versions no longer accept.
movies_df = movies_df.drop(columns='genres')
movies_df.head()

  movies_df = movies_df.drop('genres', 1)


Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [10]:
# Inspect column dtypes and non-null counts of the ratings table.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [11]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [12]:
# The rating timestamp is not used below, so drop the column.
# `columns=` replaces the positional axis argument, which newer pandas
# versions no longer accept.
ratings_df = ratings_df.drop(columns='timestamp')
ratings_df.head()

  ratings_df = ratings_df.drop('timestamp', 1)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [13]:
def CollaborativeRecommend(userInput):
    """Find the users who rated the same movies as the active user.

    Parameters
    ----------
    userInput : list of dict
        One ``{'title': ..., 'rating': ...}`` record per movie rated by the
        active user. Titles are matched exactly against ``movies_df['title']``.

    Returns
    -------
    userSubsetGroup : list of (userId, DataFrame)
        One entry per user in ``ratings_df`` who rated at least one of the
        input movies, holding that user's ratings for those movies. Entries
        are ordered by number of shared movies, largest first.
    inputMovies : DataFrame
        The input ratings joined with ``movies_df`` (columns of ``movies_df``
        without ``year``, plus ``rating``).

    Notes
    -----
    Reads the module-level ``movies_df`` and ``ratings_df`` frames.
    """
    inputMovies = pd.DataFrame(userInput)
    inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
    # Natural join on the shared column(s) -- 'title' for the documented input.
    inputMovies = pd.merge(inputId, inputMovies)
    # Keyword form: the positional axis argument is rejected by newer pandas.
    inputMovies = inputMovies.drop(columns='year')
    userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
    # Group by the scalar label (not a one-element list) so that every group
    # key is a plain userId rather than a 1-tuple on newer pandas versions.
    userSubsetGroup = sorted(
        userSubset.groupby('userId'),
        key=lambda x: len(x[1]),
        reverse=True,
    )
    return userSubsetGroup, inputMovies

In [14]:
def findSimilarity(userSubsetGroup,inputMovies):
    """Score movies by the similarity-weighted ratings of like-minded users.

    Parameters
    ----------
    userSubsetGroup : list of (userId, DataFrame)
        Candidate users with their ratings of the input movies, as produced
        by ``CollaborativeRecommend``. Only the first 100 entries are used.
    inputMovies : DataFrame
        The active user's ratings; needs ``movieId`` and ``rating`` columns.

    Returns
    -------
    DataFrame
        Indexed by ``movieId`` with the columns
        ``'weighted average recommendation score'`` and ``'movieId'``.
        Empty (with those two columns) when there are no candidate users.

    Notes
    -----
    Reads the module-level ``ratings_df`` frame. The score of a movie is
    sum(similarity * rating) / sum(similarity) over the 50 most similar
    users who rated it, so it is NaN/inf when those similarities sum to zero.
    """
    userSubsetGroup = userSubsetGroup[0:100]
    # The active user's ratings do not change between candidates: sort once.
    inputMovies = inputMovies.sort_values(by='movieId')
    pearsonCorrelationDict = {}
    for name, group in userSubsetGroup:
        # groupby over a one-element list yields 1-tuple keys on newer pandas;
        # unwrap them so the userId merge below sees plain ids.
        if isinstance(name, tuple) and len(name) == 1:
            name = name[0]
        group = group.sort_values(by='movieId')
        nRatings = len(group)
        # Ratings of the active user for the movies this candidate also rated;
        # both lists are ordered by movieId so they line up pairwise.
        temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
        tempRatingList = temp_df['rating'].tolist()
        tempGroupList = group['rating'].tolist()
        # Pearson correlation from the sums of squares and cross products.
        Sxx = sum(i**2 for i in tempRatingList) - pow(sum(tempRatingList),2)/float(nRatings)
        Syy = sum(i**2 for i in tempGroupList) - pow(sum(tempGroupList),2)/float(nRatings)
        Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
        if Sxx != 0 and Syy != 0:
            pearsonCorrelationDict[name] = Sxy/np.sqrt(Sxx*Syy)
        else:
            # No variance on one side: the correlation is undefined, use 0.
            pearsonCorrelationDict[name] = 0

    if not pearsonCorrelationDict:
        # No candidate users at all: nothing can be scored.
        return pd.DataFrame(columns=['weighted average recommendation score', 'movieId'])

    pearsonDF = pd.DataFrame({
        'similarityIndex': list(pearsonCorrelationDict.values()),
        'userId': list(pearsonCorrelationDict.keys()),
    })
    topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
    # Attach every rating made by the most similar users.
    topUsersRating = topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
    topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
    # Aggregate only the two columns that feed the weighted average.
    tempTopUsersRating = topUsersRating.groupby('movieId')[['similarityIndex','weightedRating']].sum()
    tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
    recommendation_df = pd.DataFrame()
    recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
    recommendation_df['movieId'] = tempTopUsersRating.index
    return recommendation_df

In [15]:
def recommendMovies(recommendation_df):
    """Return the ``movies_df`` rows of the five best-scored movies.

    ``recommendation_df`` needs the columns
    ``'weighted average recommendation score'`` and ``'movieId'``. The
    returned rows keep the order they have in the module-level ``movies_df``,
    not the score order.
    """
    ranked = recommendation_df.sort_values(
        by='weighted average recommendation score',
        ascending=False,
    )
    top_movie_ids = ranked.head(5)['movieId'].tolist()
    is_recommended = movies_df['movieId'].isin(top_movie_ids)
    return movies_df.loc[is_recommended]

In [16]:
# Ratings of the active user, one record per movie; titles must match the
# cleaned titles in movies_df exactly.
userInput = [
    {'title': title, 'rating': rating}
    for title, rating in (
        ('Breakfast Club, The', 1),
        ('Toy Story', 1.5),
        ('Jumanji', 5),
        ("Pulp Fiction", 5),
        ('Akira', 1.5),
    )
]

# Pipeline: find overlapping users -> score movies -> look up the top titles.
userSubsetGroup, inputMovies = CollaborativeRecommend(userInput)
recommendation_df = findSimilarity(userSubsetGroup, inputMovies)
recommendations = recommendMovies(recommendation_df)
recommendations

  inputMovies = inputMovies.drop('year', 1)


Unnamed: 0,movieId,title,year
2424,3224,Woman in the Dunes (Suna no onna),1964
2597,3473,Jonah Who Will Be 25 in the Year 2000 (Jonas q...,1976
3348,4535,"Man from Snowy River, The",1982
4095,5867,Thief,1981
6110,42730,Glory Road,2006


In [17]:
import pickle

In [18]:
# Persist the recommendation scores as a plain dict. The context manager
# guarantees that the file is flushed and closed even if pickling fails.
with open('collaborative_dict.pkl', 'wb') as pickle_file:
    pickle.dump(recommendation_df.to_dict(), pickle_file)

In [19]:
# IPython automagic (%pwd): shows the directory the relative file paths above
# are resolved against.
pwd

'C:\\Users\\Parangat\\Collaborative Based Approach\\collaborative based'