In [17]:
import pandas as pd
import sqlite3
import numpy as np

In [2]:
#importing data stored in the database to pandas dataframe

database_name = 'RecommendationDatabase.db'
conn = sqlite3.connect(database_name)

# Define SQL query to select all data from the table
sql_query_ratings = "SELECT * FROM Ratings;"
sql_query_movies = "SELECT * FROM Movies"

# Read data from SQLite database into a pandas DataFrame
reviews = pd.read_sql_query(sql_query_ratings, conn)
movies = pd.read_sql_query(sql_query_movies, conn)


conn.close()

#### Creating User-Item Matrix

In [5]:
user_items = reviews[['user_id', 'movie_id', 'ratings']]
user_items.head()

Unnamed: 0,user_id,movie_id,ratings
0,1,1074638,7
1,1,1853728,8
2,2,104257,8
3,2,1259521,8
4,2,1991245,7


In [37]:
user_by_movie = user_items.groupby(['user_id', 'movie_id'])['ratings'].max().unstack()
user_by_movie

movie_id,0002844,0004936,0007264,0008133,0009968,0011717,0012349,0013086,0013427,0013442,...,2860716,2865258,2866028,2867096,2872256,2902646,2917728,2926790,2930428,2937482
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10451,,,,,,,,,,,...,,,,,,,,,,
10452,,,,,,,,,,,...,,,,,,,,,,
10453,,,,,,,,,,,...,,,,,,,,,,
10454,,,,,,,,,,,...,,,,,,,,,,


In [11]:
#Creating an array of movies that each user has seen
def movies_watched(user_id):
   
    movies = user_by_movie.loc[user_id][user_by_movie.loc[user_id].isnull() == False].index.values

    return movies

#creating a dictionary where the key is the user_id and the values are array of the movies(movie_id) that the user has seen.
def create_user_movie_dict():
  
    n_users = user_by_movie.shape[0]
    movies_seen = dict()

    for user1 in range(1, n_users+1):
        
        # assign list of movies to each user key
        movies_seen[user1] = movies_watched(user1)
    
    return movies_seen

movies_seen = create_user_movie_dict()

In [12]:
movies_seen

{1: array(['1074638', '1853728'], dtype=object),
 2: array(['0104257', '1259521', '1991245'], dtype=object),
 3: array(['1300854'], dtype=object),
 4: array(['0385002', '1220198', '1462900', '1512685', '1631707', '1986994',
        '1999995'], dtype=object),
 5: array(['1389096', '2125608'], dtype=object),
 6: array(['0039834', '0074958', '0099277', '0102800', '0117774', '0119167',
        '0251736', '0395584', '0489244', '0937231', '0938330', '1219289',
        '1379182', '1425253', '1470827', '1571222', '1592525', '1703199',
        '1711018', '1714208', '2231554'], dtype=object),
 7: array(['1853728'], dtype=object),
 8: array(['1320082', '1483013', '1583421', '1623205'], dtype=object),
 9: array(['0245429', '0405159', '1045658', '1300854', '1951261'],
       dtype=object),
 10: array(['0079417', '0405094'], dtype=object),
 11: array(['0103064'], dtype=object),
 12: array(['1343092'], dtype=object),
 13: array(['1343092', '1623205'], dtype=object),
 14: array(['0042192', '0058461', 

In [42]:
#removing users who have seen less than 2 movies.

def create_movies_to_analyze(movies_seen, lower_bound=2):
   
    movies_to_analyze = dict()

    for user, movies in movies_seen.items():
        if len(movies) > lower_bound:
            movies_to_analyze[user] = movies
    return movies_to_analyze

movies_to_analyze = create_movies_to_analyze(movies_seen)

movies_to_analyze

{2: array(['0104257', '1259521', '1991245'], dtype=object),
 4: array(['0385002', '1220198', '1462900', '1512685', '1631707', '1986994',
        '1999995'], dtype=object),
 6: array(['0039834', '0074958', '0099277', '0102800', '0117774', '0119167',
        '0251736', '0395584', '0489244', '0937231', '0938330', '1219289',
        '1379182', '1425253', '1470827', '1571222', '1592525', '1703199',
        '1711018', '1714208', '2231554'], dtype=object),
 8: array(['1320082', '1483013', '1583421', '1623205'], dtype=object),
 9: array(['0245429', '0405159', '1045658', '1300854', '1951261'],
       dtype=object),
 14: array(['0042192', '0058461', '0059578', '0083907', '0092991', '0106308',
        '0117407', '1172570', '1288558', '1300854', '1440292', '1483013',
        '1620604', '1682180', '2053463', '2085910', '2101441'],
       dtype=object),
 16: array(['0022279', '0031235', '0037558', '0048424', '0110413', '0113277',
        '0116242', '0119488', '0163978', '1010048', '1024648', '106758

In [15]:
#Finding the correlation between the matching ratings between the two users

def compute_correlation(user1, user2):
    
    # Pull movies for each user
    movies1 = movies_to_analyze[user1]
    movies2 = movies_to_analyze[user2]
    
    
    # Find Similar Movies
    sim_movs = np.intersect1d(movies1, movies2, assume_unique=True)
    
    # Calculate correlation between the users
    df = user_by_movie.loc[(user1, user2), sim_movs]
    corr = df.transpose().corr().iloc[0,1]
    
    return corr #return the correlation



In [23]:
#Calculating Eculidean Distance

def compute_euclidean_dist(user1, user2):

    # Pull movies for each user
    movies1 = movies_to_analyze[user1]
    movies2 = movies_to_analyze[user2]
    
    # Find Similar Movies
    sim_movs = np.intersect1d(movies1, movies2, assume_unique=True)
    
    # Calculate euclidean distance between the users
    df = user_by_movie.loc[(user1, user2), sim_movs]
    dist = np.linalg.norm(df.loc[user1] - df.loc[user2])
    
    return dist 

In [40]:
#creating dataframe to store euclidean distance between users

df_dists = pd.DataFrame(columns = ['user1', 'user2', 'eucl_dist'])
df_dists

Unnamed: 0,user1,user2,eucl_dist


In [None]:
users = list(movies_to_analyze.keys())

# Create a DataFrame to store distances
df_dists = pd.DataFrame(columns = ['user1', 'user2', 'eucl_dist'])

# Compute distances and fill the DataFrame
for user1 in users:
    for user2 in users:
        if user1 != user2:
            dist = compute_euclidean_dist(user1, user2)
            df_dists.at[user1, user2] = dist

# Show the distances DataFrame
print(df_dists)

  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1, user2] = dist
  df_dists.at[user1,

In [None]:
df_dists.to_csv("users_euclidean_distance.csv")