In [1]:
"""
1. Remove or otherwise handle movies with few ratings. A movie with fewer than a certain threshold number of ratings will likely fail to develop appropriate features.
2. Restructure the existing data set from a list of user ID, movie ID, and rating records into a matrix containing the rating from each user for each movie.
3. Apply mean normalization to all of the ratings by movie.
4. Apply a collaborative filtering algorithm with gradient descent, which should be able to resolve features for both the movies and the users.
5. Use the features to form recommendations by either predicting a user’s rating or determining similarity between movies.
"""

'\n1. Remove or otherwise handle movies with few ratings. A movie with fewer than a certain threshold number of ratings will likely fail to develop appropriate features.\n2. Restructure the existing data set from a list of user ID, movie ID, and rating records into a matrix containing the rating from each user for each movie.\n3. Apply mean normalization to all of the ratings by movie.\n4. Apply a collaborative filtering algorithm with gradient descent, which should be able to resolve features for both the movies and the users.\n5. Use the features to form recommendations by either predicting a user’s rating or determining similarity between movies.\n'

In [11]:
import pandas as pd
import numpy

In [3]:
# Load the raw ratings; the output of the filtering cell below shows the
# columns userId, movieId, rating and timestamp.
# pd.read_csv replaces DataFrame.from_csv, which was removed in pandas 1.0.
# With its defaults it keeps a plain integer row index, matching the old
# `index_col=None` call.
ratings = pd.read_csv('ratings.csv')
# movies = pd.read_csv('movies.csv')

In [4]:
# Sanity check: the tail of the descending value counts shows the users who
# submitted the fewest ratings.
ratings['userId'].value_counts().tail()

645    20
58     20
497    20
313    20
350    20
Name: userId, dtype: int64

In [5]:
# Sanity check: the tail of the descending value counts shows the movies that
# received the fewest ratings.
ratings['movieId'].value_counts().tail()

8612     1
61950    1
2593     1
8740     1
2049     1
Name: movieId, dtype: int64

In [6]:
# Step 1: Remove movies that have fewer ratings than the threshold.
MIN_RATINGS_PER_MOVIE = 10

# Attach each row's per-movie rating count. .assign() returns a new frame, so
# the raw `ratings` frame is left untouched (a plain `r = ratings` would only
# alias it, and adding a column would then modify `ratings` as well).
r = ratings.assign(
    movie_freq=ratings.groupby('movieId')['movieId'].transform('count')
)

# Keep only the rows whose movie meets the threshold; the original row index
# is preserved.
r = r[r.movie_freq >= MIN_RATINGS_PER_MOVIE]

# Preview a few rows instead of rendering the whole frame.
r.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,movie_freq
0,1,16,4.0,1217897793,84
1,1,24,1.5,1217895807,38
2,1,32,4.0,1217896246,207
3,1,47,4.0,1217896556,196
4,1,50,4.0,1217896523,228
5,1,110,4.0,1217896150,248
6,1,150,3.0,1217895940,197
7,1,161,4.0,1217897864,107
8,1,165,3.0,1217897135,150
9,1,204,0.5,1217895786,34


In [7]:
# Step 2: Restructure into a matrix with one row per movie and one column per
# user; cells hold the rating, and NaN marks a (movie, user) pair with no rating.
# Keyword arguments are required: DataFrame.pivot no longer accepts positional
# index/columns/values in pandas >= 2.0.
# pivot raises ValueError if a (movieId, userId) pair appears more than once.
s = r.pivot(index='movieId', columns='userId', values='rating')

# Preview a few rows instead of rendering the whole matrix.
s.head(10)

userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,5,,,4,,,5,,,...,,,4,5,3,,,,,3.0
2,,,,,,,,,,,...,,,,5,,,,,,3.0
3,,2,,,,,,4,3,,...,,,3,,,,,,,2.0
4,,,,,,,,,,,...,,,,,,,,,,
5,,3,3,,,,,3,,,...,,,3,,,,,,,2.5
6,,,,,,,,,4,,...,2.5,,4,5,,,,,,5.0
7,,,3,,,,,,,,...,3.5,,4,,,,,,,3.0
9,,,,,,,,,,,...,,,3,,,,,,,
10,,,,,,,4,,3,,...,,,,,,,,,,3.5
11,,,4,,,,,,,,...,,,,,,,,,,3.0


In [8]:
# Alternate Step 2: build the same movie-by-user matrix by indexing on
# (movieId, userId) and unstacking the userId level into columns.
# The result has two-level columns ('rating', userId) and overwrites `s`.
s = (
    r[['userId', 'movieId', 'rating']]
    .set_index(['movieId', 'userId'])
    .unstack('userId')
)
s

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,5,,,4,,,5,,,...,,,4,5,3,,,,,3.0
2,,,,,,,,,,,...,,,,5,,,,,,3.0
3,,2,,,,,,4,3,,...,,,3,,,,,,,2.0
4,,,,,,,,,,,...,,,,,,,,,,
5,,3,3,,,,,3,,,...,,,3,,,,,,,2.5
6,,,,,,,,,4,,...,2.5,,4,5,,,,,,5.0
7,,,3,,,,,,,,...,3.5,,4,,,,,,,3.0
9,,,,,,,,,,,...,,,3,,,,,,,
10,,,,,,,4,,3,,...,,,,,,,,,,3.5
11,,,4,,,,,,,,...,,,,,,,,,,3.0


In [27]:
# Step 3: Mean normalization, per movie.
# `mu` is each movie's mean rating taken across the user columns.
mu = s.mean(axis='columns')
# Subtract each movie's mean from its row, then replace missing ratings with 0,
# so unrated cells end up equal to a rating exactly at the movie's mean.
t = s.sub(mu, axis='index').fillna(0)
t

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,0.000000,1.092672,0.000000,0.000000,0.092672,0,0.000000,1.092672,0.000000,0,...,0.000000,0,0.092672,1.092672,-0.907328,0,0.000000,0.000000,0.000000,-0.907328
2,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,0,...,0.000000,0,0.000000,1.646739,0.000000,0,0.000000,0.000000,0.000000,-0.353261
3,0.000000,-1.189655,0.000000,0.000000,0.000000,0,0.000000,0.810345,-0.189655,0,...,0.000000,0,-0.189655,0.000000,0.000000,0,0.000000,0.000000,0.000000,-1.189655
4,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,0,...,0.000000,0,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,0.000000
5,0.000000,-0.250000,-0.250000,0.000000,0.000000,0,0.000000,-0.250000,0.000000,0,...,0.000000,0,-0.250000,0.000000,0.000000,0,0.000000,0.000000,0.000000,-0.750000
6,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.000000,0.000000,-0.073913,0,...,-1.573913,0,-0.073913,0.926087,0.000000,0,0.000000,0.000000,0.000000,0.926087
7,0.000000,0.000000,-0.381818,0.000000,0.000000,0,0.000000,0.000000,0.000000,0,...,0.118182,0,0.618182,0.000000,0.000000,0,0.000000,0.000000,0.000000,-0.381818
9,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,0,...,0.000000,0,0.130435,0.000000,0.000000,0,0.000000,0.000000,0.000000,0.000000
10,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.400000,0.000000,-0.600000,0,...,0.000000,0,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,-0.100000
11,0.000000,0.000000,0.270588,0.000000,0.000000,0,0.000000,0.000000,0.000000,0,...,0.000000,0,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,-0.729412
