In [1]:
!pip install pandas numpy scikit-learn datasketch

Defaulting to user installation because normal site-packages is not writeable
Collecting datasketch
  Downloading datasketch-1.6.4-py3-none-any.whl.metadata (5.8 kB)
Downloading datasketch-1.6.4-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.3/88.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasketch
Successfully installed datasketch-1.6.4


In [10]:
import pandas as pd
from datasketch import MinHash, MinHashLSH
import numpy as np
import random
from tqdm import tqdm

In [2]:
# Load the small dataset
ratings = pd.read_csv('ratings-large.csv')

# Display the first few rows of the dataframe
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [3]:
ratings=ratings.drop('timestamp',axis=1)

In [4]:
ratings.shape

(33832162, 3)

In [6]:
def preprocess_data(ratings):
    user_movies = ratings.groupby('userId')['movieId'].apply(set).to_dict()
    return user_movies

def create_minhashes(user_movies, num_perm=30):
    minhashes = {}
    for user, movies in tqdm.tqdm(user_movies.items()):
        m = MinHash(num_perm=num_perm)
        for movie in movies:
            m.update(str(movie).encode('utf8'))
        minhashes[user] = m
    return minhashes

user_movies = preprocess_data(ratings)
minhashes = create_minhashes(user_movies)


  0%|          | 0/330975 [00:00<?, ?it/s][A
  0%|          | 95/330975 [00:00<05:57, 924.76it/s][A
  0%|          | 188/330975 [00:00<06:06, 901.59it/s][A
  0%|          | 280/330975 [00:00<06:04, 907.36it/s][A
  0%|          | 383/330975 [00:00<05:46, 953.85it/s][A
  0%|          | 479/330975 [00:00<06:09, 895.43it/s][A
  0%|          | 570/330975 [00:00<06:57, 790.85it/s][A
  0%|          | 690/330975 [00:00<06:03, 908.81it/s][A
  0%|          | 798/330975 [00:00<05:45, 955.58it/s][A
  0%|          | 897/330975 [00:00<05:52, 935.74it/s][A
  0%|          | 993/330975 [00:01<06:02, 911.24it/s][A
  0%|          | 1086/330975 [00:01<06:07, 896.80it/s][A
  0%|          | 1177/330975 [00:01<06:12, 885.98it/s][A
  0%|          | 1276/330975 [00:01<06:00, 915.09it/s][A
  0%|          | 1371/330975 [00:01<05:56, 925.20it/s][A
  0%|          | 1464/330975 [00:01<05:59, 916.75it/s][A
  0%|          | 1565/330975 [00:01<05:48, 943.98it/s][A
  1%|          | 1674/330975 [00:01<

In [11]:
def find_similar_users(minhashes, num_perm=30, threshold=0.9, num_bands=6):
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm, params=(num_bands, num_perm // num_bands))
    
    for user, mi in tqdm(minhashes.items()):
        lsh.insert(user, mi)
    
    similar_users = set()
    for user, mi in tqdm(minhashes.items()):
        result = lsh.query(mi)
        for other in result:
            if user != other:
                pair = tuple(sorted((user, other)))
                similar_users.add(pair)
    
    return list(similar_users)

similar_users = find_similar_users(minhashes)

# Debugging: Check length of similar_users
print(f"Total similar pairs found: {len(similar_users)}")

top_100_pairs = sorted(similar_users, key=lambda x: len(set(user_movies[x[0]]).intersection(user_movies[x[1]])), reverse=True)[:100]

# Output top 100 pairs
top_100_pairs

100%|██████████| 330975/330975 [00:10<00:00, 31852.43it/s]
100%|██████████| 330975/330975 [00:27<00:00, 11914.53it/s]


Total similar pairs found: 22215721


[(211359, 240754),
 (111917, 207216),
 (116432, 247373),
 (73700, 270601),
 (48766, 213593),
 (33457, 48766),
 (78255, 185341),
 (134353, 233891),
 (48766, 247086),
 (211238, 236260),
 (50012, 247373),
 (155990, 204495),
 (174815, 247373),
 (50012, 174815),
 (50012, 116432),
 (77647, 294432),
 (116432, 174815),
 (77647, 174815),
 (77647, 86967),
 (86967, 294432),
 (174815, 294432),
 (25084, 294432),
 (25084, 86967),
 (25084, 77647),
 (77647, 247373),
 (247373, 294432),
 (50012, 77647),
 (50012, 294432),
 (86967, 174815),
 (233891, 249280),
 (214831, 314721),
 (25084, 174815),
 (86967, 247373),
 (50012, 86967),
 (25084, 247373),
 (48766, 113052),
 (25084, 50012),
 (77647, 116432),
 (211238, 214831),
 (116432, 294432),
 (86967, 116432),
 (25084, 116432),
 (236260, 267187),
 (83880, 261244),
 (160061, 247086),
 (86967, 261244),
 (83880, 86967),
 (73700, 76618),
 (261244, 294432),
 (77647, 261244),
 (25084, 261244),
 (77647, 83880),
 (83880, 294432),
 (25084, 83880),
 (247373, 261244),
 (1

In [16]:
similar_users[:10]

[(99547, 215589),
 (120522, 275602),
 (188652, 254917),
 (35456, 194776),
 (268237, 330557),
 (93708, 179204),
 (47551, 291404),
 (4279, 124476),
 (13476, 186922),
 (155624, 275413)]

In [14]:
def calculate_average_correlation(pairs, ratings):
    correlations = []
    for user1, user2 in tqdm(pairs):
        user1_ratings = ratings[ratings['userId'] == user1][['movieId', 'rating']].set_index('movieId')
        user2_ratings = ratings[ratings['userId'] == user2][['movieId', 'rating']].set_index('movieId')
        common_movies = user1_ratings.index.intersection(user2_ratings.index)
        
        if len(common_movies) > 0:
            correlation = user1_ratings.loc[common_movies]['rating'].corr(user2_ratings.loc[common_movies]['rating'])
            if not np.isnan(correlation):
                correlations.append(correlation)
    
    return np.mean(correlations)

# Calculate correlation for top 100 pairs
average_correlation_top_100 = calculate_average_correlation(top_100_pairs, ratings)
print(f"Average Correlation for Top 100 Pairs: {average_correlation_top_100}")

# Pick 100 random pairs of users
all_users = list(user_movies.keys())
random_pairs = [(random.choice(all_users), random.choice(all_users)) for _ in range(100)]
average_correlation_random_100 = calculate_average_correlation(random_pairs, ratings)
print(f"Average Correlation for Random 100 Pairs: {average_correlation_random_100}")


100%|██████████| 100/100 [00:07<00:00, 13.37it/s]


Average Correlation for Top 100 Pairs: 0.7348021838603093


  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.

Average Correlation for Random 100 Pairs: 0.032287987393481585



