## Load all the Python Library

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
import warnings
warnings.filterwarnings('ignore')

### Load the Data.
##### I have copied the data into the Local Drive, before load into the variable via pandas

In [3]:
# Reading ratings file
ratings = pd.read_csv('../data/ratings.dat', sep='::', encoding='latin-1', names=['userId','movieId','rating','timestamp'])

# Reading movies file
movies = pd.read_csv('../data/movies.dat', sep='::', encoding='latin-1', names=['movieId','title','genres'])

### Size of the Datasets Loaded (Printing)

In [17]:
print ("Size of the ratings dataset: {}".format(ratings.shape))
ratings.head(5)

Size of the ratings dataset: (1000209, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [18]:
print ("Size of the movies dataset: {}".format(movies.shape))
movies.head(5)

Size of the movies dataset: (3883, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### Merge the datasets of movies and rtatings dataframe

In [19]:
df_movies_ratings=pd.merge(movies, ratings,how="inner",on="movieId")
print ("Size of the movies dataset: {}".format(df_movies_ratings.shape))
df_movies_ratings.head(5)

Size of the movies dataset: (1000209, 6)


Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474


# SYSTEM - I

## SCHEME - I (High Rated)

Propose the User Top 10 Movies (High Rated) against the input Genres 


The Top 10 High Rated Movies are based on the "Rating". So once the "genres" has been input, we will search for all the movies against the respective "genres". We will then calculate average all the rating (mentioned by all the users) against the movie id, then sort it. We will show the Top 10 Movies which have the highest average rating.

### Input the Genres (From the User)

In [58]:
input_genres = 'Comedy'
df_movies_ratings_genres = df_movies_ratings[df_movies_ratings['genres'] == 'Comedy']
df_movies_ratings_genres_grouped = df_movies_ratings_genres.groupby(['movieId','title']).mean().reset_index()

### Extract Movies and Average the Rating 

Extract all the Movies, UserIds, Ratings against the Input Genres

Then stored the Average the "Rating" against each movideId (& Title) 

In [61]:
df_movies_ratings_genres = df_movies_ratings[df_movies_ratings['genres'] == 'Comedy']
df_movies_ratings_genres_grouped = df_movies_ratings_genres.groupby(['movieId','title']).mean().reset_index()
df_movies_ratings_genres_grouped_Top_10 = df_movies_ratings_genres_grouped.sort_values(['rating'],ascending=False)[1:10]
df_movies_ratings_genres_grouped_Top_10

Unnamed: 0,movieId,title,userId,rating,timestamp
384,3233,Smashing Time (1967),3733.0,5.0,966424600.0
354,3022,"General, The (1927)",3124.402913,4.368932,970797700.0
140,1136,Monty Python and the Holy Grail (1974),3081.123202,4.33521,972326400.0
348,2937,"Palm Beach Story, The (1942)",3135.519231,4.288462,972401600.0
122,905,It Happened One Night (1934),3083.417112,4.280749,971160600.0
81,598,Window to Paris (1994),3024.5,4.25,970834200.0
129,1002,Ed's Next Move (1996),3208.875,4.25,970706000.0
127,951,His Girl Friday (1940),3001.770781,4.24937,971008500.0
420,3462,Modern Times (1936),3250.036066,4.236066,970586400.0


### Show the Top 10 Movies

In [66]:
df_movies_ratings_genres_grouped_Top_10['title']

384                      Smashing Time (1967)
354                       General, The (1927)
140    Monty Python and the Holy Grail (1974)
348              Palm Beach Story, The (1942)
122              It Happened One Night (1934)
81                     Window to Paris (1994)
129                     Ed's Next Move (1996)
127                    His Girl Friday (1940)
420                       Modern Times (1936)
Name: title, dtype: object

### Based on the Input Genres, search the Movie and Sort based on the Rating and Pulled the Top 10 Movies

In [None]:
df_movies_ratings_genres = df_movies_ratings[df_movies_ratings['genres'] == 'Comedy']
df_movies_ratings_genres.sort_values(by="rating",ascending=False)[1:10]

In [None]:
### Top 10 Highly Rated Movies Based on the 

In [None]:
df_movies_ratings_genres = df_movies_ratings[df_movies_ratings['genres'] == 'Comedy']
df_movies_ratings_genres.sort_values(by="rating",ascending=False)[1:10]

In [21]:
df_movies_ratings['genres'].value_counts()

Comedy                       116883
Drama                        111423
Comedy|Romance                42712
Comedy|Drama                  42245
Drama|Romance                 29170
                              ...  
Drama|Romance|Western            29
Children's|Fantasy               27
Comedy|Film-Noir|Thriller         5
Film-Noir|Horror                  2
Fantasy                           1
Name: genres, Length: 301, dtype: int64

In [172]:
ratings_matrix_items = df_movies_ratings.pivot_table(index=['movieId'],columns=['userId'],values='rating').reset_index(drop=True)
ratings_matrix_items.fillna( 0, inplace = True )
ratings_matrix_items.shape

(3706, 6040)

In [173]:
ratings_matrix_items

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [175]:
movie_similarity = 1 - pairwise_distances( ratings_matrix_items.to_numpy(), metric="cosine" )
np.fill_diagonal( movie_similarity, 0 ) #Filling diagonals with 0s for future use when sorting is done
ratings_matrix_items = pd.DataFrame( movie_similarity )
ratings_matrix_items

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,0.000000,0.390349,0.267943,0.178789,0.256569,0.347373,0.301490,0.125709,0.106620,0.377459,...,0.099502,0.020966,0.084105,0.081826,0.045949,0.309676,0.186633,0.093479,0.042829,0.182691
1,0.390349,0.000000,0.240946,0.155457,0.249970,0.244827,0.262772,0.196521,0.158469,0.386200,...,0.061819,0.015209,0.075310,0.095573,0.074271,0.213650,0.140781,0.087013,0.026063,0.122185
2,0.267943,0.240946,0.000000,0.192788,0.308290,0.187020,0.292230,0.092122,0.128378,0.245601,...,0.038492,0.065507,0.049512,0.087377,0.050985,0.190575,0.104837,0.062258,0.010073,0.097786
3,0.178789,0.155457,0.192788,0.000000,0.271990,0.125170,0.220024,0.049554,0.060334,0.133707,...,0.055486,0.053300,0.002227,0.025278,0.025204,0.118902,0.096318,0.022588,0.024769,0.095154
4,0.256569,0.249970,0.308290,0.271990,0.000000,0.148114,0.305107,0.095512,0.138392,0.237681,...,0.026632,0.083898,0.046399,0.047542,0.016156,0.174554,0.092403,0.051633,0.010750,0.112835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3701,0.309676,0.213650,0.190575,0.118902,0.174554,0.236447,0.191689,0.090387,0.092347,0.237227,...,0.183859,0.053539,0.109062,0.210272,0.078341,0.000000,0.329339,0.168234,0.122279,0.363838
3702,0.186633,0.140781,0.104837,0.096318,0.092403,0.201419,0.117660,0.080523,0.099554,0.136374,...,0.244371,0.098568,0.070933,0.160150,0.107063,0.329339,0.000000,0.302649,0.199337,0.347805
3703,0.093479,0.087013,0.062258,0.022588,0.051633,0.115331,0.059262,0.084976,0.004956,0.097170,...,0.126068,0.211891,0.057350,0.124186,0.095905,0.168234,0.302649,0.000000,0.202809,0.234638
3704,0.042829,0.026063,0.010073,0.024769,0.010750,0.029136,0.036102,0.072141,0.000000,0.018359,...,0.170983,0.132019,0.086057,0.104873,0.015847,0.122279,0.199337,0.202809,0.000000,0.192972


In [176]:
def item_similarity(movieName): 
    """
    recomendates similar movies
   :param data: name of the movie 
   """
    try:
        #user_inp=input('Enter the reference movie title based on which recommendations are to be made: ')
        user_inp=movieName
        inp=df_movies[df_movies['title']==user_inp].index.tolist()
        inp=inp[0]

        df_movies['similarity'] = ratings_matrix_items.iloc[inp]
        df_movies.columns = ['movie_id', 'title', 'release_date','similarity']
    except:
        print("Sorry, the movie is not in the database!")

In [296]:
def recommendedMoviesAsperItemSimilarity(user_id):
    """
     Recommending movie which user hasn't watched as per Item Similarity
    :param user_id: user_id to whom movie needs to be recommended
    :return: movieIds to user 
    """
    user_movie= df_movies_ratings[(df_movies_ratings.userId==user_id) & df_movies_ratings.rating.isin([5,4.5])][['title']]
    user_movie=user_movie.iloc[0,0]
    item_similarity(user_movie)
    print ("Coming...")
    
    sorted_movies_as_per_userChoice=df_movies.sort_values( ["similarity"], ascending = False )
    
    sorted_movies_as_per_userChoice=sorted_movies_as_per_userChoice[sorted_movies_as_per_userChoice['similarity'] >=0.45]['movie_id']
    
    recommended_movies=list()
    
    df_recommended_item=pd.DataFrame()
    user2Movies= df_ratings[df_ratings['userId']== user_id]['movieId']
    
    for movieId in sorted_movies_as_per_userChoice:
        if movieId not in user2Movies:
            print ("Coming 2..")
            df_new= df_ratings[(df_ratings.movieId==movieId)]
            df_recommended_item=pd.concat([df_recommended_item,df_new])
            print (df_recommended_item.shape)
        print ("Coming 3..")
        #return (df_recommended_item)
    best10=df_recommended_item.sort_values(["rating"], ascending = False )[1:10] 
    return best10['movieId']

In [181]:
def movieIdToTitle(listMovieIDs):
    """
     Converting movieId to titles
    :param user_id: List of movies
    :return: movie titles
    """
    movie_titles= list()
    for id in listMovieIDs:
        movie_titles.append(df_movies[df_movies['movie_id']==id]['title'])
    return movie_titles

In [273]:
user_movie= df_movies_ratings[(df_movies_ratings.userId==user_id) & df_movies_ratings.rating.isin([5,4.5])][['title']]
#user_movie=user_movie.iloc[0,0]
#item_similarity(user_movie)

In [276]:
user_movie=user_movie.iloc[0,0]

In [277]:
item_similarity(user_movie)

In [278]:
sorted_movies_as_per_userChoice=df_movies.sort_values( ["similarity"], ascending = False )

In [280]:
sorted_movies_as_per_userChoice=sorted_movies_as_per_userChoice[sorted_movies_as_per_userChoice['similarity'] >=0.45]['movie_id']

In [282]:
recommended_movies=list()

In [281]:
sorted_movies_as_per_userChoice

2898    2967
1173    1190
574      578
2162    2231
1178    1196
        ... 
573      577
627      632
963      975
1613    1659
1186    1204
Name: movie_id, Length: 87, dtype: int64

In [283]:
df_recommended_item=pd.DataFrame()

In [284]:
user2Movies= df_ratings[df_ratings['userId']== user_id]['movieId']

In [289]:
for movieId in sorted_movies_as_per_userChoice:
    print ("Ok 1:{}",format(movieId))
    if movieId not in user2Movies:
        print ("Ok 2:{}",format(movieId))
        df_new= df_ratings[(df_ratings.movieId==movieId)]
        df_recommended_item=pd.concat([df_recommended_item,df_new])

Ok 1:{} 2967
Ok 1:{} 1190
Ok 2:{} 1190
Ok 1:{} 578
Ok 2:{} 578
Ok 1:{} 2231
Ok 2:{} 2231
Ok 1:{} 1196
Ok 2:{} 1196
Ok 1:{} 34
Ok 2:{} 34
Ok 1:{} 1122
Ok 2:{} 1122
Ok 1:{} 1477
Ok 2:{} 1477
Ok 1:{} 350
Ok 2:{} 350
Ok 1:{} 2443
Ok 2:{} 2443
Ok 1:{} 358
Ok 2:{} 358
Ok 1:{} 1123
Ok 2:{} 1123
Ok 1:{} 256
Ok 2:{} 256
Ok 1:{} 2272
Ok 2:{} 2272
Ok 1:{} 1124
Ok 2:{} 1124
Ok 1:{} 585
Ok 2:{} 585
Ok 1:{} 1806
Ok 2:{} 1806
Ok 1:{} 312
Ok 2:{} 312
Ok 1:{} 447
Ok 2:{} 447
Ok 1:{} 1136
Ok 2:{} 1136
Ok 1:{} 2661
Ok 2:{} 2661
Ok 1:{} 2580
Ok 2:{} 2580
Ok 1:{} 1917
Ok 2:{} 1917
Ok 1:{} 1038
Ok 2:{} 1038
Ok 1:{} 579
Ok 2:{} 579
Ok 1:{} 3098
Ok 1:{} 2779
Ok 2:{} 2779
Ok 1:{} 583
Ok 2:{} 583
Ok 1:{} 470
Ok 2:{} 470
Ok 1:{} 2626
Ok 2:{} 2626
Ok 1:{} 1075
Ok 2:{} 1075
Ok 1:{} 290
Ok 2:{} 290
Ok 1:{} 2197
Ok 2:{} 2197
Ok 1:{} 2720
Ok 2:{} 2720
Ok 1:{} 1422
Ok 2:{} 1422
Ok 1:{} 39
Ok 2:{} 39
Ok 1:{} 1543
Ok 2:{} 1543
Ok 1:{} 1664
Ok 2:{} 1664
Ok 1:{} 597
Ok 2:{} 597
Ok 1:{} 869
Ok 2:{} 869
Ok 1:{} 2062
Ok 2:{}

In [279]:
sorted_movies_as_per_userChoice

Unnamed: 0,movie_id,title,release_date,similarity
2898,2967,"Bad Seed, The (1956)",Drama|Thriller,0.633104
1173,1190,Tie Me Up! Tie Me Down! (1990),Drama,0.610826
574,578,"Hour of the Pig, The (1993)",Drama|Mystery,0.605849
2162,2231,Rounders (1998),Crime|Drama,0.579382
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War,0.570125
...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,
3879,3949,Requiem for a Dream (2000),Drama,
3880,3950,Tigerland (2000),Drama,
3881,3951,Two Family House (2000),Drama,


In [271]:
df_recommended_item

In [297]:
df_recommended_item.sort_values(["rating"], ascending = False )[1:10]

KeyError: 'rating'

In [294]:
user_id

23

In [298]:
df_recommended_item = recommendedMoviesAsperItemSimilarity(user_id)

Coming...
Coming 3..
Coming 2..
(132, 4)
Coming 3..
Coming 2..
(134, 4)
Coming 3..
Coming 2..
(479, 4)
Coming 3..
Coming 2..
(3469, 4)
Coming 3..
Coming 2..
(5220, 4)
Coming 3..
Coming 2..
(5220, 4)
Coming 3..
Coming 2..
(5247, 4)
Coming 3..
Coming 2..
(5762, 4)
Coming 3..
Coming 2..
(5888, 4)
Coming 3..
Coming 2..
(6029, 4)
Coming 3..
Coming 2..
(6062, 4)
Coming 3..
Coming 2..
(6280, 4)
Coming 3..
Coming 2..
(6392, 4)
Coming 3..
Coming 2..
(6797, 4)
Coming 3..
Coming 2..
(7215, 4)
Coming 3..
Coming 2..
(7383, 4)
Coming 3..
Coming 2..
(7495, 4)
Coming 3..
Coming 2..
(7526, 4)
Coming 3..
Coming 2..
(9125, 4)
Coming 3..
Coming 2..
(9219, 4)
Coming 3..
Coming 2..
(9942, 4)
Coming 3..
Coming 2..
(11052, 4)
Coming 3..
Coming 2..
(11063, 4)
Coming 3..
Coming 2..
(11064, 4)
Coming 3..
Coming 3..
Coming 2..
(11409, 4)
Coming 3..
Coming 2..
(11437, 4)
Coming 3..
Coming 2..
(11475, 4)
Coming 3..
Coming 2..
(11510, 4)
Coming 3..
Coming 2..
(11510, 4)
Coming 3..
Coming 2..
(11678, 4)
Coming 3..
Co

In [257]:
df_recommended_item.sort_values(["rating"], ascending = False )[1:10] 

Unnamed: 0,userId,movieId,rating,timestamp
915565,5535,1217,5,959579261
925147,5592,1217,5,959268781
932612,5630,1217,5,980553265
933670,5636,1217,5,959052049
936370,5649,1217,5,958865612
944923,5702,1217,5,958578730
951202,5747,1217,5,958356864
956390,5770,1217,5,958172236
962784,5805,1217,5,958107586


In [300]:
user_id=23
df_recommended_item = print("Recommended movies,:\n",movieIdToTitle(recommendedMoviesAsperItemSimilarity(user_id)))

Coming...
Coming 3..
Coming 2..
(132, 4)
Coming 3..
Coming 2..
(134, 4)
Coming 3..
Coming 2..
(479, 4)
Coming 3..
Coming 2..
(3469, 4)
Coming 3..
Coming 2..
(5220, 4)
Coming 3..
Coming 2..
(5220, 4)
Coming 3..
Coming 2..
(5247, 4)
Coming 3..
Coming 2..
(5762, 4)
Coming 3..
Coming 2..
(5888, 4)
Coming 3..
Coming 2..
(6029, 4)
Coming 3..
Coming 2..
(6062, 4)
Coming 3..
Coming 2..
(6280, 4)
Coming 3..
Coming 2..
(6392, 4)
Coming 3..
Coming 2..
(6797, 4)
Coming 3..
Coming 2..
(7215, 4)
Coming 3..
Coming 2..
(7383, 4)
Coming 3..
Coming 2..
(7495, 4)
Coming 3..
Coming 2..
(7526, 4)
Coming 3..
Coming 2..
(9125, 4)
Coming 3..
Coming 2..
(9219, 4)
Coming 3..
Coming 2..
(9942, 4)
Coming 3..
Coming 2..
(11052, 4)
Coming 3..
Coming 2..
(11063, 4)
Coming 3..
Coming 2..
(11064, 4)
Coming 3..
Coming 3..
Coming 2..
(11409, 4)
Coming 3..
Coming 2..
(11437, 4)
Coming 3..
Coming 2..
(11475, 4)
Coming 3..
Coming 2..
(11510, 4)
Coming 3..
Coming 2..
(11510, 4)
Coming 3..
Coming 2..
(11678, 4)
Coming 3..
Co

In [248]:
df_recommended_item

In [198]:
user_movie= df_movies_ratings[(df_movies_ratings.userId==user_id) & df_movies_ratings.rating.isin([5,4.5])][['title']]

In [199]:
user_movie=user_movie.iloc[0,0]

In [200]:
sorted_movies_as_per_userChoice=df_movies.sort_values( ["similarity"], ascending = False )

In [201]:
sorted_movies_as_per_userChoice=sorted_movies_as_per_userChoice[sorted_movies_as_per_userChoice['similarity'] >=0.45]['movie_id']


In [204]:
recommended_movies=list()

In [203]:
sorted_movies_as_per_userChoice

2898    2967
1173    1190
574      578
2162    2231
1178    1196
        ... 
573      577
627      632
963      975
1613    1659
1186    1204
Name: movie_id, Length: 87, dtype: int64

In [205]:
df_recommended_item=pd.DataFrame()

In [206]:
user2Movies= df_ratings[df_ratings['userId']== user_id]['movieId']

In [208]:
for movieId in sorted_movies_as_per_userChoice:
    if movieId not in user2Movies:
        df_new= df_ratings[(df_ratings.movieId==movieId)]
        df_recommended_item=pd.concat([df_recommended_item,df_new])

In [209]:
df_recommended_item

Unnamed: 0,userId,movieId,rating,timestamp
7418,53,1190,5,977986365
18499,146,1190,3,979854470
19752,149,1190,5,981498843
24757,175,1190,4,977116246
30178,203,1190,2,976929239
...,...,...,...,...
994795,6007,1204,4,956790809
996336,6016,1204,5,956778313
997847,6026,1204,4,956726822
998221,6032,1204,5,956718542


In [211]:
best10=df_recommended_item.sort_values(["rating"], ascending = False )[1:10] 