# Problem Statement
We have a movie lens database and our objective is to apply various kinds of recommendation techniques from scratch and find out similarities between the users, most popular movies, and personalized recommendations for the targeted user based on user based collaborative filtering.

# Setup and Data Exploration

In [1]:
# Importing the required libraries.
import pandas as pd
from sklearn.model_selection import train_test_split
from math import pow, sqrt

In [2]:
# Reading users dataset into a pandas dataframe object.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('/Users/Srilakshmi/Downloads/users.dat', sep='::', names=u_cols,
 encoding='latin-1')

  after removing the cwd from sys.path.


In [3]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
# Reading ratings dataset into a pandas dataframe object.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('/Users/Srilakshmi/Downloads/ratings.dat', sep='::', names=r_cols,
 encoding='latin-1')

  after removing the cwd from sys.path.


In [9]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [10]:
# Reading movies dataset into a pandas dataframe object.
m_cols = ['movie_id', 'movie_title', 'genre']
movies = pd.read_csv('/Users/Srilakshmi/Downloads/movies.dat', sep='::', names=m_cols, encoding='latin-1')

  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
movies.head()

Unnamed: 0,movie_id,movie_title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


As seen in the above dataframe, the genre column has data with pipe separators which cannot be processed for recommendations as such. Hence, we need to genrate columns for every genre type such that if the movie belongs to that genre its value will be 1 otheriwse 0.(Sort of one hot encoding)

In [12]:
# Getting series of lists by applying split operation.
movies.genre = movies.genre.str.split('|')

# Getting distinct genre types for generating columns of genre type.
genre_columns = list(set([j for i in movies['genre'].tolist() for j in i]))

# Iterating over every list to create and fill values into columns.
for j in genre_columns:
    movies[j] = 0
for i in range(movies.shape[0]):
    for j in genre_columns:
        if(j in movies['genre'].iloc[i]):
            movies.loc[i,j] = 1

In [13]:
movies.head()

Unnamed: 0,movie_id,movie_title,genre,Romance,Film-Noir,Western,Crime,Comedy,Thriller,Sci-Fi,...,Horror,Action,Musical,Drama,War,Adventure,Documentary,Animation,Fantasy,Children's
0,1,Toy Story (1995),"[Animation, Children's, Comedy]",0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]",0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama]",0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Also, we need to separate the year part of the 'movie_title' columns for better interpretability and processing. Hence, a columns named 'release_year' will be created using the below code.

In [14]:
# Separting movie title and year part using split function
split_values = movies['movie_title'].str.split("(", n = 1, expand = True) 

# setting 'movie_title' values to title part and creating 'release_year' column.
movies.movie_title = split_values[0]
movies['release_year'] = split_values[1]

# Cleaning the release_year series and dropping 'genre' columns as it has already been one hot encoded.
movies['release_year'] = movies.release_year.str.replace(')','')
movies.drop('genre',axis=1,inplace=True)

Let's visualize all the dataframes after all the preprocessing we did.

In [15]:
movies.head()

Unnamed: 0,movie_id,movie_title,Romance,Film-Noir,Western,Crime,Comedy,Thriller,Sci-Fi,Mystery,...,Action,Musical,Drama,War,Adventure,Documentary,Animation,Fantasy,Children's,release_year
0,1,Toy Story,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,1995
1,2,Jumanji,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,1995
2,3,Grumpier Old Men,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,4,Waiting to Exhale,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1995
4,5,Father of the Bride Part II,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [16]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [17]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [18]:
ratings.shape

(1000209, 4)

### Writing generally used getter functions in the implementation
Here, we have written down a few getters so that we do not need to write down them again adn again and it also increases readability and reusability of the code.

In [19]:
#Function to get the rating given by a user to a movie.
def get_rating_(userid,movieid):
    return (ratings.loc[(ratings.user_id==userid) & (ratings.movie_id == movieid),'rating'].iloc[0])

# Function to get the list of all movie ids the specified user has rated.
def get_movieids_(userid):
    return (ratings.loc[(ratings.user_id==userid),'movie_id'].tolist())

# Function to get the movie titles against the movie id.
def get_movie_title_(movieid):
    return (movies.loc[(movies.movie_id == movieid),'movie_title'].iloc[0])

## Similarity Scores
***
In this implementation the similarity between the two users have been calculated on the basis of the distance between the two users (i.e. Euclidean distances) and by calculating Pearson Correlation between the two users.

We have written two functions.

In [20]:
def distance_similarity_score(user1,user2):
    '''
    user1 & user2 : user ids of two users between which similarity score is to be calculated.
    '''
    both_watch_count = 0
    for element in ratings.loc[ratings.user_id==user1,'movie_id'].tolist():
        if element in ratings.loc[ratings.user_id==user2,'movie_id'].tolist():
            both_watch_count += 1
    if both_watch_count == 0 :
        return 0
    distance = []
    for element in ratings.loc[ratings.user_id==user1,'movie_id'].tolist():
        if element in ratings.loc[ratings.user_id==user2,'movie_id'].tolist():
            rating1 = get_rating_(user1,element)
            rating2 = get_rating_(user2,element)
            distance.append(pow(rating1 - rating2, 2))
    total_distance = sum(distance)
    return 1/(1+sqrt(total_distance))

In [21]:
distance_similarity_score(1,310)

0.14459058185587106

Calculating Similarity Scores based on the distances have an inherent problem. We do not have a threshold to decide how much more distance between two users is to be considered for calculating whether the users are close enough or far enough. On the other side, this problem is resolved by pearson correlation method as it always returns a value between -1 & 1 which clearly provides us with the boundaries for closeness as we prefer.

In [22]:
def pearson_correlation_score(user1,user2):
    '''
    user1 & user2 : user ids of two users between which similarity score is to be calculated.
    '''
    both_watch_count = []
    for element in ratings.loc[ratings.user_id==user1,'movie_id'].tolist():
        if element in ratings.loc[ratings.user_id==user2,'movie_id'].tolist():
            both_watch_count.append(element)
    if len(both_watch_count) == 0 :
        return 0
    rating_sum_1 = sum([get_rating_(user1,element) for element in both_watch_count])
    rating_sum_2 = sum([get_rating_(user2,element) for element in both_watch_count])
    rating_squared_sum_1 = sum([pow(get_rating_(user1,element),2) for element in both_watch_count])
    rating_squared_sum_2 = sum([pow(get_rating_(user2,element),2) for element in both_watch_count])
    product_sum_rating = sum([get_rating_(user1,element) * get_rating_(user2,element) for element in both_watch_count])
    
    numerator = product_sum_rating - ((rating_sum_1 * rating_sum_2) / len(both_watch_count))
    denominator = sqrt((rating_squared_sum_1 - pow(rating_sum_1,2) / len(both_watch_count)) * (rating_squared_sum_2 - pow(rating_sum_2,2) / len(both_watch_count)))
    if denominator == 0:
        return 0
    return numerator/denominator

In [23]:
pearson_correlation_score(1,310)

0.1453526052506179

### Most Similar Users

The objective is to find out **Most Similar Users** to the targeted user. Here we have two metrics to find the score i.e. distance and correlation. 

In [24]:
def most_similar_users_(user1,number_of_users,metric='pearson'):
    '''
    user1 : Targeted User
    number_of_users : number of most similar users you want to user1.
    metric : metric to be used to calculate inter-user similarity score. ('pearson' or else)
    '''
    # Getting distinct user ids.
    user_ids = ratings.user_id.unique().tolist()
    
    # Getting similarity score between targeted and every other suer in the list(or subset of the list).
    if(metric == 'pearson'):
        similarity_score = [(pearson_correlation_score(user1,nth_user),nth_user) for nth_user in user_ids[:100] if nth_user != user1]
    else:
        similarity_score = [(distance_similarity_score(user1,nth_user),nth_user) for nth_user in user_ids[:100] if nth_user != user1]
    
    # Sorting in descending order.
    similarity_score.sort()
    similarity_score.reverse()
    
    # Returning the top most 'number_of_users' similar users. 
    return similarity_score[:number_of_users]



## Getting Movie Recommendations for Targeted User
***
The concept is very simple. First, we need to iterate over only those movies not watched(or rated) by the targeted user and the subsetting items based on the users highly correlated with targeted user. Here, we have used a weighted similarity approach where we have taken product of rating and score into account to make sure that the highly similar users affect the recommendations more than those less similar. Then, we have sorted the list on the basis of score along with movie ids and returned the movie titles against those movie ids.



In [25]:
def get_recommendation_(userid):
    user_ids = ratings.user_id.unique().tolist()
    total = {}
    similariy_sum = {}
    
    # Iterating over subset of user ids.
    for user in user_ids[:100]:
        
        # not comparing the user to itself (obviously!)
        if user == userid:
            continue
        
        # Getting similarity score between the users.
        score = pearson_correlation_score(userid,user)
        
        # not considering users having zero or less similarity score.
        if score <= 0:
            continue
        
        # Getting weighted similarity score and sum of similarities between both the users.
        for movieid in get_movieids_(user):
            # Only considering not watched/rated movies
            if movieid not in get_movieids_(userid) or get_rating_(userid,movieid) == 0:
                total[movieid] = 0
                total[movieid] += get_rating_(user,movieid) * score
                similariy_sum[movieid] = 0
                similariy_sum[movieid] += score
    
    # Normalizing ratings
    ranking = [(tot/similariy_sum[movieid],movieid) for movieid,tot in total.items()]
    ranking.sort()
    ranking.reverse()
    
    # Getting movie titles against the movie ids.
    recommendations = [get_movie_title_(movieid) for score,movieid in ranking]
    return recommendations[:10]

**NOTE**: We have applied the above three techniques only to specific subset of the dataset as the dataset is too big and iterating over every row multiple times will increase runtime manifolds.

### Implementations

In [27]:
print(get_recommendation_(320))

['Contender, The ', 'Requiem for a Dream ', 'Bamboozled ', 'Invisible Man, The ', 'Creature From the Black Lagoon, The ', 'Hellraiser ', 'Almost Famous ', 'Way of the Gun, The ', 'Shane ', 'Naked Gun 2 1/2: The Smell of Fear, The ']
