In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras

In [2]:
data_path = os.getcwd()
dfmovie = pd.read_csv(os.path.join(data_path,"movies.csv"))
dfratings_train = pd.read_csv(os.path.join(data_path,"rating_train.csv"))
dfratings_test = pd.read_csv(os.path.join(data_path,"rating_test.csv"))
dfusers = pd.read_csv(os.path.join(data_path,"users.csv"))

In [3]:
dfmovie

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
dfratings_train

Unnamed: 0,UserID,MovieID,timestamps,Rating
0,1,1836,978300172,5
1,1,1097,978301953,4
2,1,2028,978301619,5
3,1,527,978824195,5
4,1,2918,978302124,4
...,...,...,...,...
800188,6040,2300,963272000,5
800189,6040,2241,956716541,1
800190,6040,2641,956716343,2
800191,6040,800,960972032,4


In [5]:
dfusers

Unnamed: 0,UserID,Gender,Age,OccupationID,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [6]:
dfmovieratings_users = dfratings_train.merge(dfusers,on="UserID")
dfmovieratings_users = dfmovieratings_users.merge(dfmovie, on="MovieID")
dfmovieratings_users

Unnamed: 0,UserID,MovieID,timestamps,Rating,Gender,Age,OccupationID,Zip-code,Title,Genres
0,1,1836,978300172,5,F,1,10,48067,"Last Days of Disco, The (1998)",Drama
1,8,1836,978231592,4,M,25,12,11413,"Last Days of Disco, The (1998)",Drama
2,53,1836,977987956,5,M,25,0,96931,"Last Days of Disco, The (1998)",Drama
3,187,1836,977078536,3,F,45,1,94061,"Last Days of Disco, The (1998)",Drama
4,352,1836,976333646,4,M,18,4,60115,"Last Days of Disco, The (1998)",Drama
...,...,...,...,...,...,...,...,...,...,...
800188,5717,2258,958509389,4,M,25,0,03766,Master Ninja I (1984),Action
800189,5754,2543,958272316,4,F,18,1,60640,Six Ways to Sunday (1997),Comedy
800190,5780,2845,958153068,1,M,18,17,92886,White Boys (1999),Drama
800191,5854,2895,957744257,3,M,45,7,33135,Napoleon and Samantha (1972),Adventure


In [7]:
dfmovieratings_users.drop('Title',axis = 1, inplace = True)

In [8]:
dfmovieratings_users.drop(['OccupationID','Zip-code','timestamps'],axis = 1, inplace=True)
dfmovieratings_users

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Genres
0,1,1836,5,F,1,Drama
1,8,1836,4,M,25,Drama
2,53,1836,5,M,25,Drama
3,187,1836,3,F,45,Drama
4,352,1836,4,M,18,Drama
...,...,...,...,...,...,...
800188,5717,2258,4,M,25,Action
800189,5754,2543,4,F,18,Comedy
800190,5780,2845,1,M,18,Drama
800191,5854,2895,3,M,45,Adventure


In [9]:
user_num = np.unique(dfmovieratings_users["UserID"]).shape[0]
genre_num = np.unique(dfmovieratings_users["Genres"]).shape[0]
gender_num = 2
embedding_num = 8

In [10]:
genre_dict = {genre:i for genre,i in zip(np.unique(dfmovieratings_users["Genres"]),range(genre_num))}
genre_dict

{'Action': 0,
 'Action|Adventure': 1,
 'Action|Adventure|Animation': 2,
 "Action|Adventure|Animation|Children's|Fantasy": 3,
 'Action|Adventure|Animation|Horror|Sci-Fi': 4,
 "Action|Adventure|Children's": 5,
 "Action|Adventure|Children's|Comedy": 6,
 "Action|Adventure|Children's|Fantasy": 7,
 "Action|Adventure|Children's|Sci-Fi": 8,
 'Action|Adventure|Comedy': 9,
 'Action|Adventure|Comedy|Crime': 10,
 'Action|Adventure|Comedy|Horror': 11,
 'Action|Adventure|Comedy|Horror|Sci-Fi': 12,
 'Action|Adventure|Comedy|Romance': 13,
 'Action|Adventure|Comedy|Sci-Fi': 14,
 'Action|Adventure|Comedy|War': 15,
 'Action|Adventure|Crime': 16,
 'Action|Adventure|Crime|Drama': 17,
 'Action|Adventure|Crime|Thriller': 18,
 'Action|Adventure|Drama': 19,
 'Action|Adventure|Drama|Romance': 20,
 'Action|Adventure|Drama|Sci-Fi|War': 21,
 'Action|Adventure|Drama|Thriller': 22,
 'Action|Adventure|Fantasy': 23,
 'Action|Adventure|Fantasy|Sci-Fi': 24,
 'Action|Adventure|Horror': 25,
 'Action|Adventure|Horror|Thril

In [11]:
gender_dict = {"M":1,"F":0}

In [12]:
age_dict = {1:"Under 18",18:"18-24",25:"25-34",35:"35-44",45:"45-49",50:"50-55",56:"56+"}

In [13]:
def map_genres(x):
    try:
        x["Genres"] = genre_dict[x["Genres"]]
        return x
    except:
        x["Genres"] = np.nan

def map_gender(x):
    try:
        x["Gender"] = gender_dict[x["Gender"]]
        return x
    except:
        x["Gender"] = "NotSet"

In [14]:
dfmovieratings_users = dfmovieratings_users.apply(lambda x:map_genres(x),axis = 1)
dfmovieratings_users = dfmovieratings_users.apply(lambda x:map_gender(x),axis = 1)

dfmovieratings_users.head()


Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Genres
0,1,1836,5,0,1,239
1,8,1836,4,1,25,239
2,53,1836,5,1,25,239
3,187,1836,3,0,45,239
4,352,1836,4,1,18,239


In [15]:
dftrain, dfvalid = train_test_split(dfmovieratings_users, test_size=0.2)

In [16]:


user_id_input = keras.layers.Input(shape = [1])
user_embed = keras.layers.Embedding(user_num + 1,embedding_num)(user_id_input)
user_vector = keras.layers.Reshape((embedding_num,))(user_embed)
# user_flattened = keras.layers.Flatten()(user_vector)


genre_input = keras.layers.Input(shape = [1])
genre_embed = keras.layers.Embedding(genre_num + 1,embedding_num)(genre_input)
genre_vector = keras.layers.Reshape((embedding_num,))(genre_embed)
# genre_flattened = keras


gender_input = keras.layers.Input(shape = [1])
gender_embed = keras.layers.Embedding(gender_num + 1,embedding_num)(gender_input)
gender_vector = keras.layers.Reshape((embedding_num,))(gender_embed)

age_input = keras.layers.Input(shape = [1])
age_embed = keras.layers.Embedding(7 + 1,embedding_num)(age_input)
age_vector = keras.layers.Reshape((embedding_num,))(age_embed)

# age_input = keras.layers.Input(shape = [1])

concat = keras.layers.concatenate([user_vector,genre_vector,gender_vector,age_input])
flattened_all = keras.layers.Flatten()(concat)

onebeforelast = keras.layers.Dense(units = 32,activation = "relu")(flattened_all)

out = keras.layers.Dense(units = 1)(onebeforelast)
model = keras.Model([user_id_input,genre_input,gender_input,age_input],out)
model.compile(optimizer='adam',loss='mean_squared_error')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 8)         48328       input_1[0][0]                    
______________________________________________________________________________________________

In [17]:
history = model.fit([dftrain["UserID"],dftrain["Genres"],dftrain["Gender"],dftrain["Age"]],dftrain["Rating"], epochs = 10,validation_data=([dfvalid["UserID"],dfvalid["Genres"],dfvalid["Gender"],dfvalid["Age"]],dfvalid["Rating"]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


Making Predictions

In [19]:
dfratings_test = dfratings_test.merge(dfmovie,on="MovieID",how="inner")

In [20]:
dfratings_test = dfratings_test.merge(dfusers,on="UserID",how = "inner")

In [21]:
dfratings_test.drop(["Title","OccupationID","Zip-code","timestamps"],inplace=True,axis = 1)
dfratings_test

Unnamed: 0,UserID,MovieID,Genres,Gender,Age
0,1,914,Musical|Romance,F,1
1,1,2018,Animation|Children's,F,1
2,1,2797,Comedy|Fantasy,F,1
3,1,1270,Comedy|Sci-Fi,F,1
4,1,1545,Drama,F,1
...,...,...,...,...,...
200011,2908,188,Horror,M,18
200012,1452,2065,Comedy|Drama|Romance,F,56
200013,1452,3893,Comedy|Thriller,F,56
200014,1452,371,Comedy|Drama,F,56


In [22]:
dfratings_test.dropna(inplace=True)

In [23]:
dfratings_test = dfratings_test.apply(lambda x:map_genres(x),axis = 1)

In [24]:
dfratings_test.head()

Unnamed: 0,UserID,MovieID,Genres,Gender,Age
0,1.0,914.0,280.0,F,1.0
1,1.0,2018.0,144.0,F,1.0
2,1.0,2797.0,192.0,F,1.0
3,1.0,1270.0,211.0,F,1.0
4,1.0,1545.0,239.0,F,1.0


In [25]:
dfratings_test = dfratings_test.apply(lambda x:map_gender(x),axis = 1)
dfratings_test.head()

Unnamed: 0,UserID,MovieID,Genres,Gender,Age
0,1.0,914.0,280.0,0.0,1.0
1,1.0,2018.0,144.0,0.0,1.0
2,1.0,2797.0,192.0,0.0,1.0
3,1.0,1270.0,211.0,0.0,1.0
4,1.0,1545.0,239.0,0.0,1.0


In [26]:
dfratings_test.loc[(dfratings_test["Gender"] != 0) &(dfratings_test["Gender"] != 1)]

Unnamed: 0,UserID,MovieID,Genres,Gender,Age
46054,,,,,
137225,,,,,
180362,,,,,


In [27]:
dfratings_test.dropna(inplace = True)

In [28]:
prediction = model.predict([dfratings_test["UserID"],dfratings_test["Genres"],dfratings_test["Gender"],dfratings_test["Age"]])

In [29]:
final_prediction = pd.concat((dfratings_test,pd.DataFrame(data = prediction)),axis = 1)
final_prediction.columns = ['UserID',	'MovieID'	,'Genres',	'Gender'	,'Age'	,'Rating']
# .to_csv("Q5_ratings.csv")
final_prediction

Unnamed: 0,UserID,MovieID,Genres,Gender,Age,Rating
0,1.0,914.0,280.0,0.0,1.0,4.569126
1,1.0,2018.0,144.0,0.0,1.0,4.078046
2,1.0,2797.0,192.0,0.0,1.0,3.868294
3,1.0,1270.0,211.0,0.0,1.0,3.878521
4,1.0,1545.0,239.0,0.0,1.0,4.200209
...,...,...,...,...,...,...
200011,2908.0,188.0,272.0,1.0,18.0,3.916013
200012,1452.0,2065.0,187.0,0.0,56.0,3.806314
200013,1452.0,3893.0,213.0,0.0,56.0,
200014,1452.0,371.0,185.0,0.0,56.0,


In [30]:
final_prediction[['UserID','MovieID',"Rating"]].to_csv('Q5_output.csv')

Task 2

I have used Item-Item Collaborative filtering

In [31]:
#note that collaborating filtering doesnt use othr features
dfratings_movie_train = dfratings_train.merge(dfmovie[["MovieID","Title"]],on="MovieID",how="inner").sort_values(by = "UserID").reset_index(drop=True)[["UserID","MovieID","Rating","Title"]]
dfratings_movie_train

Unnamed: 0,UserID,MovieID,Rating,Title
0,1,1836,5,"Last Days of Disco, The (1998)"
1,1,1287,5,Ben-Hur (1959)
2,1,661,3,James and the Giant Peach (1996)
3,1,3114,4,Toy Story 2 (1999)
4,1,3105,5,Awakenings (1990)
...,...,...,...,...
800188,6040,541,4,Blade Runner (1982)
800189,6040,2248,3,Say Anything... (1989)
800190,6040,2456,1,"Fly II, The (1989)"
800191,6040,3552,2,Caddyshack (1980)


In [32]:
#pivot the above table
#A very sparse matrix is the result
#If we were doing user based collaborative filtering, we would find similarity between users
#If we were doing item based collaborative filtering, we would find similarity between items
df_ratings_train_pivot = dfratings_movie_train.pivot_table(index = ["UserID"],columns = ["MovieID"],values="Rating")
df_ratings_train_pivot

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [33]:
#calculate the similarity/correlation between each item(movie)
#Intersection between any movie and another movie is the correlation between each movie
#we dont want to use only spurious information. Instead we want to correlate when there are 
#at least a minimum specified number of instances that have rated any 2 given movies.
# Hence I have used min_periods of 50
corrMatrix = df_ratings_train_pivot.corr(method = 'pearson',min_periods = 50)
corrMatrix.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.184111,0.174303,0.262667,0.159896,0.058462,0.148289,,,0.097885,...,,,,,,0.223139,0.155944,,,0.21484
2,0.184111,1.0,0.116584,,0.471901,0.051095,0.262461,,,0.268339,...,,,,,,0.146091,0.065245,,,0.166463
3,0.174303,0.116584,1.0,0.307334,0.411205,0.164042,0.15981,,,0.217726,...,,,,,,0.355267,,,,
4,0.262667,,0.307334,1.0,0.356038,,-0.026171,,,,...,,,,,,,,,,
5,0.159896,0.471901,0.411205,0.356038,1.0,0.118255,0.376373,,,0.275331,...,,,,,,0.496429,,,,


In [40]:
users = np.unique(dfratings_test["UserID"])
user_list = list()
movie_list = list()
ratings_list = list()
for user in users:
    movie_ids_to_predict = np.unique(dfratings_test.loc[dfratings_test["UserID"] == user]["MovieID"])
    #all ratings given by the user
    userratings = df_ratings_train_pivot.loc[user,:].dropna()
    similarmovies = pd.Series()
    for i in userratings.index:
            #here we get all the movies that have a correlation with the movie in concern 
            correlationswithmovie = corrMatrix[i].dropna()
            #correlation between each movie in concern and other movies is multiplied with the users ranking for
            #the movie in concern
            #essentially, we are scaling each movie that has a correlation with a movie that user has rated, by the 
            #rating that the user has given to the movies he rated
            #movies that were rated higher by user will result in movies of high correlation with that 
            #respective movie to also be rated higher
            #movies that were rated low will cause other movies to be rated low due to the correlation
            ratingswithmovie = correlationswithmovie.map(lambda x:x*userratings[i])
            similarmovies = similarmovies.append(ratingswithmovie)
    similarmovies.sort_values(inplace = True,ascending = False)
    similarmovies = similarmovies.groupby(similarmovies.index).mean().sort_values(ascending = False)
    # movies_user_hasnt_rated = similarmovies.drop(userratings.index)
    for movie in movie_ids_to_predict.tolist():
        user_list.append(user)
        movie_list.append(movie)
        try:
            rating = similarmovies[movie]
            ratings_list.append(rating)
        except:
            ratings_list.append("Not Available")

    result = pd.DataFrame({'UserId':user_list,'MovieID':movie_list,'Rating':ratings_list})

  similarmovies = pd.Series()


KeyboardInterrupt: 