In [10]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# 1. Data loading: turn Google Drive share links into direct-download URLs
def get_download_link(share_link):
    """Convert a Google Drive share link into a direct-download URL.

    The file id is taken from the first of these that applies:

    1. a ``/d/<file_id>`` path segment, with or without a trailing
       ``/view?...`` part (e.g. ``.../file/d/<file_id>/view?usp=sharing``);
    2. an ``id=<file_id>`` query parameter (e.g. ``.../open?id=<file_id>``);
    3. otherwise, the second-to-last ``/``-separated segment of the link
       (the original heuristic, kept as a fallback).

    Parameters
    ----------
    share_link : str
        Share link of the file.

    Returns
    -------
    str
        ``https://drive.google.com/uc?export=download&id=<file_id>``.

    Raises
    ------
    IndexError
        If no pattern matches and the link has fewer than two
        ``/``-separated segments.
    """
    import re  # local import so the function does not depend on the import cell

    match = (re.search(r'/d/([^/?#]+)', share_link)
             or re.search(r'[?&]id=([^&#]+)', share_link))
    # Positional fallback only when neither known pattern is present.
    file_id = match.group(1) if match else share_link.split('/')[-2]
    return f'https://drive.google.com/uc?export=download&id={file_id}'

# Load the datasets with encoding specification
try:
    # Candidate encodings, tried in order. Both CSVs are read with the same
    # encoding; a UnicodeDecodeError on either file moves on to the next one.
    # NOTE: 'latin1' maps every byte to a character and therefore never raises
    # UnicodeDecodeError, so the entries after it ('iso-8859-1' is an alias of
    # latin1) are effectively never reached.
    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']

    for encoding in encodings:
        try:
            ratings = pd.read_csv(get_download_link("https://drive.google.com/file/d/1tsyX0IZhr59VfyZM-_MbtimD2TpReU-v/view?usp=sharing"), 
                                encoding=encoding)
            advertisements = pd.read_csv(get_download_link("https://drive.google.com/file/d/1JpkyVWg_K8gG7ZaH8GDXf1GQqgr3XinY/view?usp=sharing"), 
                                      encoding=encoding)
            print(f"Successfully loaded with {encoding} encoding")
            break
        except UnicodeDecodeError:
            continue
    else:
        # Every encoding failed. Stop here with an explicit message instead of
        # falling through to a NameError on `ratings` (fresh kernel) or, worse,
        # silently reusing frames left over from an earlier run.
        raise ValueError(f"Could not decode the datasets with any of: {encodings}")

    # Basic data exploration
    n_ratings = len(ratings)
    n_advertisements = len(ratings['ad_id'].unique())
    n_users = len(ratings['userId'].unique())

    print(f"Number of ratings: {n_ratings}")
    print(f"Number of unique ad_id's: {n_advertisements}")
    print(f"Number of unique users: {n_users}")
    
    # Create the user-item matrix
    def create_matrix(df):
        """Build the sparse advertisement × user rating matrix.

        Parameters
        ----------
        df : pandas.DataFrame
            Must provide the columns ``userId``, ``ad_id`` and ``rating``.

        Returns
        -------
        X : scipy.sparse.csr_matrix
            Shape ``(n_ads, n_users)``; entry ``[a, u]`` holds the rating that
            user column ``u`` gave to advertisement row ``a``.
        user_mapper : dict
            userId -> column index.
        advertisement_mapper : dict
            ad_id -> row index.
        user_inv_mapper : dict
            column index -> userId.
        advertisement_inv_mapper : dict
            row index -> ad_id.
        """
        n_users = df["userId"].nunique(dropna=False)
        n_ads = df["ad_id"].nunique(dropna=False)

        # Sorted distinct ids; an id's position here is its matrix index.
        sorted_user_ids = np.unique(df["userId"])
        sorted_ad_ids = np.unique(df["ad_id"])

        # id -> index
        user_mapper = {uid: pos for uid, pos in zip(sorted_user_ids, range(n_users))}
        advertisement_mapper = {aid: pos for aid, pos in zip(sorted_ad_ids, range(n_ads))}

        # index -> id
        user_inv_mapper = {pos: uid for pos, uid in zip(range(n_users), sorted_user_ids)}
        advertisement_inv_mapper = {pos: aid for pos, aid in zip(range(n_ads), sorted_ad_ids)}

        # Coordinates of every rating: row = advertisement, column = user.
        col_positions = [user_mapper[uid] for uid in df["userId"]]
        row_positions = [advertisement_mapper[aid] for aid in df["ad_id"]]

        X = csr_matrix(
            (df["rating"], (row_positions, col_positions)),
            shape=(n_ads, n_users),
        )

        return X, user_mapper, advertisement_mapper, user_inv_mapper, advertisement_inv_mapper

    # Build the sparse advertisement × user rating matrix from the loaded ratings,
    # together with the id <-> index lookup tables in both directions.
    X, user_mapper, advertisement_mapper, user_inv_mapper, advertisement_inv_mapper = create_matrix(ratings)

    def find_similar_advertisements(advertisement_id, X, k, metric='cosine', show_distance=False):
        """Return the ids of the ``k`` advertisements nearest to ``advertisement_id``.

        A brute-force ``NearestNeighbors`` model is fitted on the rows of ``X``
        on every call and queried with the row of ``advertisement_id``.
        ``advertisement_mapper`` / ``advertisement_inv_mapper`` are read from
        the enclosing scope, so ``X`` must be the matrix they were built for.

        Parameters
        ----------
        advertisement_id
            Id of the query advertisement (a key of ``advertisement_mapper``).
        X
            Rating matrix with one row per advertisement.
        k : int
            Number of neighbours wanted. Fewer are returned when ``X`` has
            fewer than ``k + 1`` rows.
        metric : str, default 'cosine'
            Distance metric passed to ``NearestNeighbors``.
        show_distance : bool, default False
            When true, also return the distance of each neighbour.

        Returns
        -------
        list, or (list, numpy.ndarray) when ``show_distance`` is true
            Neighbour ids ordered from nearest to farthest, never containing
            ``advertisement_id`` itself. On any error a message is printed and
            an empty result of the same shape is returned (``[]`` or
            ``([], empty array)``), so callers can always unpack it.
        """
        try:
            advertisement_ind = advertisement_mapper[advertisement_id]
            advertisement_vec = X[advertisement_ind].reshape(1, -1)

            # One extra neighbour is requested because the query row is itself
            # part of X; capped at the row count so small matrices still work.
            n_neighbors = min(k + 1, X.shape[0])
            kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
            kNN.fit(X)
            distances, indices = kNN.kneighbors(advertisement_vec)
            distances, indices = distances.flatten(), indices.flatten()

            # Remove the query by index, not by position: when other rows tie
            # with it at distance 0 it is not guaranteed to come back first.
            keep = indices != advertisement_ind
            neighbour_ids = [advertisement_inv_mapper[i] for i in indices[keep][:k]]

            if show_distance:
                return neighbour_ids, distances[keep][:k]
            return neighbour_ids

        except Exception as e:
            # Best-effort by design: report and hand back an empty result.
            print(f"Error finding similar advertisements: {str(e)}")
            return ([], np.array([])) if show_distance else []

    # Lookup table: ad_id -> advertisement title
    advertisement_titles = {
        ad: title
        for ad, title in zip(advertisements['ad_id'], advertisements['title'])
    }

    # Smoke-test the recommender on one advertisement
    advertisement_id = 10
    similar_ids = find_similar_advertisements(advertisement_id, X, k=10)

    if advertisement_id not in advertisement_titles:
        print(f"Advertisement ID {advertisement_id} not found in the dataset")
    else:
        advertisement_title = advertisement_titles[advertisement_id]
        print(f"\nSince you watched {advertisement_title}")
        for i in similar_ids:
            # Neighbours without a known title are skipped.
            if i not in advertisement_titles:
                continue
            print(advertisement_titles[i])

except Exception as e:
    print(f"An error occurred: {str(e)}")

Successfully loaded with latin1 encoding
Number of ratings: 100836
Number of unique ad_id's: 9724
Number of unique users: 610

Since you watched ASSORTED COLOUR BIRD ORNAMENT
CREAM HEART CARD HOLDER
ENGLISH ROSE HOT WATER BOTTLE
SET 20 NAPKINS FAIRY CAKES DESIGN 
BREAD BIN DINER STYLE PINK
REX CASH+CARRY JUMBO SHOPPER
HAND WARMER RED POLKA DOT
MINI PAINT SET VINTAGE 
3 TIER CAKE TIN RED AND CREAM
PACK OF 12 PINK POLKADOT TISSUES
FELTCRAFT DOLL EMILY
