In [1]:
import pandas as pd
import sqlite3
import numpy as np
from tqdm import tqdm

In [2]:
#importing data stored in the database to pandas dataframe

database_name = 'RecommendationDatabase.db'
conn = sqlite3.connect(database_name)

# Define SQL query to select all data from the table
sql_query_ratings = "SELECT * FROM Ratings;"
sql_query_movies = "SELECT * FROM Movies"

# Read data from SQLite database into a pandas DataFrame
reviews = pd.read_sql_query(sql_query_ratings, conn)
movies = pd.read_sql_query(sql_query_movies, conn)


conn.close()

#### Creating User-Item Matrix

In [3]:
user_items = reviews[['user_id', 'movie_id', 'ratings']]
user_items.head()

Unnamed: 0,user_id,movie_id,ratings
0,1,1074638,7
1,1,1853728,8
2,2,104257,8
3,2,1259521,8
4,2,1991245,7


In [4]:
user_by_movie = user_items.groupby(['user_id', 'movie_id'])['ratings'].max().unstack()
user_by_movie

movie_id,0002844,0004936,0007264,0008133,0009968,0011717,0012349,0013086,0013427,0013442,...,2860716,2865258,2866028,2867096,2872256,2902646,2917728,2926790,2930428,2937482
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10451,,,,,,,,,,,...,,,,,,,,,,
10452,,,,,,,,,,,...,,,,,,,,,,
10453,,,,,,,,,,,...,,,,,,,,,,
10454,,,,,,,,,,,...,,,,,,,,,,


In [5]:
#Creating an array of movies that each user has seen
def movies_watched(user_id):
   
    movies = user_by_movie.loc[user_id][user_by_movie.loc[user_id].isnull() == False].index.values

    return movies

#creating a dictionary where the key is the user_id and the values are array of the movies(movie_id) that the user has seen.
def create_user_movie_dict():
  
    n_users = user_by_movie.shape[0]
    movies_seen = dict()

    for user1 in range(1, n_users+1):
        
        # assign list of movies to each user key
        movies_seen[user1] = movies_watched(user1)
    
    return movies_seen

movies_seen = create_user_movie_dict()

In [6]:
movies_seen

{1: array(['1074638', '1853728'], dtype=object),
 2: array(['0104257', '1259521', '1991245'], dtype=object),
 3: array(['1300854'], dtype=object),
 4: array(['0385002', '1220198', '1462900', '1512685', '1631707', '1986994',
        '1999995'], dtype=object),
 5: array(['1389096', '2125608'], dtype=object),
 6: array(['0039834', '0074958', '0099277', '0102800', '0117774', '0119167',
        '0251736', '0395584', '0489244', '0937231', '0938330', '1219289',
        '1379182', '1425253', '1470827', '1571222', '1592525', '1703199',
        '1711018', '1714208', '2231554'], dtype=object),
 7: array(['1853728'], dtype=object),
 8: array(['1320082', '1483013', '1583421', '1623205'], dtype=object),
 9: array(['0245429', '0405159', '1045658', '1300854', '1951261'],
       dtype=object),
 10: array(['0079417', '0405094'], dtype=object),
 11: array(['0103064'], dtype=object),
 12: array(['1343092'], dtype=object),
 13: array(['1343092', '1623205'], dtype=object),
 14: array(['0042192', '0058461', 

In [7]:
#removing users who have seen less than 2 movies.

def create_movies_to_analyze(movies_seen, lower_bound=2):
   
    movies_to_analyze = dict()

    for user, movies in movies_seen.items():
        if len(movies) > lower_bound:
            movies_to_analyze[user] = movies
    return movies_to_analyze

movies_to_analyze = create_movies_to_analyze(movies_seen)

movies_to_analyze

{2: array(['0104257', '1259521', '1991245'], dtype=object),
 4: array(['0385002', '1220198', '1462900', '1512685', '1631707', '1986994',
        '1999995'], dtype=object),
 6: array(['0039834', '0074958', '0099277', '0102800', '0117774', '0119167',
        '0251736', '0395584', '0489244', '0937231', '0938330', '1219289',
        '1379182', '1425253', '1470827', '1571222', '1592525', '1703199',
        '1711018', '1714208', '2231554'], dtype=object),
 8: array(['1320082', '1483013', '1583421', '1623205'], dtype=object),
 9: array(['0245429', '0405159', '1045658', '1300854', '1951261'],
       dtype=object),
 14: array(['0042192', '0058461', '0059578', '0083907', '0092991', '0106308',
        '0117407', '1172570', '1288558', '1300854', '1440292', '1483013',
        '1620604', '1682180', '2053463', '2085910', '2101441'],
       dtype=object),
 16: array(['0022279', '0031235', '0037558', '0048424', '0110413', '0113277',
        '0116242', '0119488', '0163978', '1010048', '1024648', '106758

In [8]:
#Finding the correlation between the matching ratings between the two users

def compute_correlation(user1, user2):
    
    # Pull movies for each user
    movies1 = movies_to_analyze[user1]
    movies2 = movies_to_analyze[user2]
    
    
    # Find Similar Movies
    sim_movs = np.intersect1d(movies1, movies2, assume_unique=True)
    
    # Calculate correlation between the users
    df = user_by_movie.loc[(user1, user2), sim_movs]
    corr = df.transpose().corr().iloc[0,1]
    
    return corr #return the correlation



In [15]:
compute_correlation(9, 3544)

-1.0

The euclidean distance, aka, similarity b/w users is calculated by sq. root of difference of movie rating ofsimilar movies they have.

In [22]:
#Calculating Eculidean Distance

def compute_euclidean_dist(user1, user2):

    # Pull movies for each user
    movies1 = movies_to_analyze[user1]
    movies2 = movies_to_analyze[user2]

    # Find Similar Movies
    sim_movs = np.intersect1d(movies1, movies2, assume_unique=True)

    
    # Calculate euclidean distance between the users
    df = user_by_movie.loc[(user1, user2), sim_movs]

    dist = np.linalg.norm(df.loc[user1] - df.loc[user2])  
    
    return dist 

In [23]:
compute_euclidean_dist(9, 3544)

1.7320508075688772

This is how the above operation is completed.

movies1 = ['0245429' '0405159' '1045658' '1300854' '1951261']

movies2 = ['0463985' '1045658' '1247667' '1300854' '1905041' '1951261']

sim_movs = ['1045658' '1300854' '1951261']

df = (below)

movie_id  1045658  1300854  1951261

user_id                            
9             7.0      8.0      7.0
3544          8.0      7.0      8.0

dist = 1.7320508075688772 (which is sq. root of 3)

In [28]:
dist_list = []
users = list(movies_to_analyze.keys())


for user1 in tqdm(users, desc="Progress", ncols=100):
    for user2 in users:
        if user1 != user2:
            dist = compute_euclidean_dist(user1, user2)
            dist_list.append((user1, user2, dist))
df_dists = pd.DataFrame(dist_list, columns=['user1', 'user2', 'eucl_dist'])

Progress: 100%|███████████████████████████████████████████████| 4131/4131 [5:06:06<00:00,  4.45s/it]


In [30]:
df_dists

Unnamed: 0,User1,User2,Distance
0,2,4,0.0
1,2,6,0.0
2,2,8,0.0
3,2,9,0.0
4,2,14,0.0
...,...,...,...
17061025,10455,10447,0.0
17061026,10455,10448,0.0
17061027,10455,10450,5.0
17061028,10455,10453,0.0


In [11]:
df_dists.to_csv("users_euclidean_distance.csv")

NameError: name 'df_dists' is not defined

In [39]:
df_dists['Distance'].nunique()

226

In [25]:
df_dists = pd.read_csv('users_euclidean_distance.csv')
df_dists.drop('Unnamed: 0', axis = 1, inplace = True)
df_dists

Unnamed: 0,user1,user2,eucl_dist
0,2,4,0.0
1,2,6,0.0
2,2,8,0.0
3,2,9,0.0
4,2,14,0.0
...,...,...,...
17061025,10455,10447,0.0
17061026,10455,10448,0.0
17061027,10455,10450,5.0
17061028,10455,10453,0.0


In [98]:
#creating funtion to find the nearest users to any given user
def find_closest_neighbors(user):
    
    closest_users = df_dists[df_dists['user1']==user].sort_values(by='eucl_dist').iloc[1:]['user2']
    closest_neighbors = np.array(closest_users)
    
    return closest_neighbors

find_closest_neighbors(2)

array([ 6841,  6843,  6847, ..., 10435,  2567,  5661], dtype=int64)

In [57]:
#creating funtion to fetch the movies ID that the user  watched and rated it higher than 7.
def movies_liked(user_id, min_rating=7):

    movies_liked = np.array(user_items.query('user_id == @user_id and ratings > (@min_rating -1)')['movie_id'])
    
    return movies_liked

movies_liked(2)

array(['0104257', '1259521', '1991245'], dtype=object)

In [58]:
# Finding out movie name using id

def movie_names(movie_ids):
    
    movie_lst = list(movies[movies['movie_id'].isin(movie_ids)]['movie'])
   
    return movie_lst

movie_names(['0104257'])

['A Few Good Men (1992)']

###  movies liked by user 2

In [60]:
movie_names(movies_liked(2))

['A Few Good Men (1992)',
 'The Cabin in the Woods (2011)',
 'Chernobyl Diaries (2012)']

In [89]:
def make_recommendations(user, num_recs=10):
    
   
    # movies_seen by user (we don't want to recommend these)
    movies_seen = movies_watched(user)
    closest_neighbors = find_closest_neighbors(user)
    

    recs = np.array([])
    
    # Go through the neighbors and identify movies they like the user hasn't seen
    for neighbor in closest_neighbors:
        neighbs_likes = movies_liked(neighbor)
        

        new_recs = np.setdiff1d(neighbs_likes, movies_seen, assume_unique=True)
        
    
        recs = np.unique(np.concatenate([new_recs, recs], axis=0))
        
        # If we have enough recommendations exit the loop
        if len(recs) > num_recs-1:
            break
    recs = recs[:num_recs] #getting the top num_recs movies
    # Pull movie titles using movie ids
    recommendations = movie_names(recs)
    
    return recommendations

In [90]:
#movie recommendaiton for user 2
make_recommendations(2)

['Apollo 13 (1995)',
 'Shrek (2001)',
 'The Incredibles (2004)',
 'Life of Pi (2012)',
 'Eagle Eye (2008)',
 'Law Abiding Citizen (2009)',
 'Evil Dead (2013)',
 'Kari-gurashi no Arietti (2010)',
 'Yüregine sor (2010)',
 'The Perks of Being a Wallflower (2012)']

In [91]:
#creating recommendation for all of the users.

def all_recommendations(num_recs=10):
   #num_recs represents the maximum number of recommendation for a user
    
    # All the users we need to make recommendations for
    users = np.unique(df_dists['user1'])
    n_users = len(users)
    
    #Store all recommendations in this dictionary
    all_recs = dict()
    
    # Make the recommendations for each user
    for user in tqdm(users, desc="Progress", ncols=100):
        all_recs[user] = make_recommendations(user, num_recs)
    
    return all_recs



In [92]:
all_recs = all_recommendations(10)

Progress: 100%|█████████████████████████████████████████████████| 4131/4131 [01:34<00:00, 43.89it/s]


In [93]:
for user, movies_recm in all_recs.items():
    print('User {} should be recommended these movies: '.format(user))
    for i, movie in enumerate(movies_recm):
        print('{}. {}'.format(i+1, movie))
    print()

User 2 should be recommended these movies: 
1. Apollo 13 (1995)
2. Shrek (2001)
3. The Incredibles (2004)
4. Life of Pi (2012)
5. Eagle Eye (2008)
6. Law Abiding Citizen (2009)
7. Evil Dead (2013)
8. Kari-gurashi no Arietti (2010)
9. Yüregine sor (2010)
10. The Perks of Being a Wallflower (2012)

User 4 should be recommended these movies: 
1. Swiss Family Robinson (1960)
2. Caddyshack (1980)
3. Vacation (1983)
4. European Vacation (1985)
5. 3 Men and a Baby (1987)
6. Christmas Vacation (1989)
7. Goodfellas (1990)
8. Happy Gilmore (1996)
9. Vegas Vacation (1997)
10. Live Free or Die Hard (2007)

User 6 should be recommended these movies: 
1. Eyes Wide Shut (1999)
2. Rushmore (1998)
3. 42 (2013)
4. Iron Man 3 (2013)
5. Star Trek Into Darkness (2013)
6. Oblivion (2013)
7. Oz the Great and Powerful (2013)
8. Killer Joe (2011)
9. The Place Beyond the Pines (2012)
10. The Call (2013)

User 8 should be recommended these movies: 
1. Gone with the Wind (1939)
2. The Godfather (1972)
3. The Godf