In [13]:
%pip install scikit-surprise
import re
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans, accuracy
from surprise.model_selection import train_test_split
import json
import numpy as np


In [7]:
def parse_json_file(json_file_path):

      rating_events = []

      # Open and load JSON file
      with open(json_file_path, 'r') as f:
          data = json.load(f)  # Load JSON data

      # Iterate through JSON objects
      for entry in data:
          if entry.get("type") == "rating":
              rating_events.append({
                'timestamp': entry.get("raw").split(',')[0],
                'user_id': entry.get("user_details.user_id"),
                'age': entry.get("user_details.age"),
                'occupation': entry.get("user_details.occupation"),
                'gender': entry.get("user_details.gender"),
                'movie_id': entry.get("movieid"),
                'rating': int(entry.get("rating"))
            })

      # Convert list to DataFrame
      return pd.DataFrame(rating_events)


In [7]:
# 1. Parse the Log File
file_path = '/kaggle/input/rating/deduped_rating_events.json'
df_2 = parse_json_file(file_path)
df_2 = df_2[['user_id', 'movie_id', 'rating']]

reader = Reader(rating_scale=(1, 5))
trainset = Dataset.load_from_df(df_2, reader).build_full_trainset()

In [9]:
#2. Split the Data
#trainset, testset = train_test_split(data, test_size=0.2)

In [71]:
import random

def get_top_300_movies(df):
    """Get top 300 movies based on average rating."""
    top_movies = df.groupby("movie_id")["rating"].mean().nlargest(300).index.tolist()
    return top_movies

def get_top_20_recommendations_for_user(user_id, df, top_movies):
    all_movie_ids = set(df['movie_id'].unique())

    rated_movies = set(df[df['user_id'] == user_id]['movie_id'])

    unrated_movies = all_movie_ids - rated_movies

    # If user has no rated movies, return random 20 from top 300
    if len(rated_movies) == 0:
        return random.sample(top_movies, 20)
    
    predictions = [algo.predict(user_id, movie) for movie in unrated_movies]
    predictions.sort(key=lambda x: x.est, reverse=True)
    
    # Check if all top 20 have the same score
    top_20 = predictions[:20]
    max_score = top_20[0].est if top_20 else None
    same_score_movies = [p for p in predictions if p.est == max_score]
    
    # If top 20 all have the same score, randomly choose 20 from them
    if len(same_score_movies) > 20:
        top_20 = random.sample(same_score_movies, 20)
    
    top_20_movie_ids = [p.iid for p in top_20]
    top_20_movie_ids_output = ",".join(map(str, top_20_movie_ids))

    top_20_with_ratings = [f"{p.iid}: {p.est:.2f}" for p in top_20]
    top_20_with_ratings_output = ",".join(top_20_with_ratings)
    
    return top_20_movie_ids_output, top_20_with_ratings_output

Shared train and test data

In [57]:
top_movies = get_top_300_movies(df_2)  # Precompute top 300 movies

In [None]:
sim_options = {
    'name': 'cosine',
    'user_based': True
}

In [11]:
algo = KNNWithMeans(sim_options=sim_options)

In [14]:
start_time = time.time()
for batch in np.array_split(df_2, 10):  # Split into 10 batches
    trainset = Dataset.load_from_df(batch[['user_id', 'movie_id', 'rating']], reader).build_full_trainset()
    algo.fit(trainset)

end_time = time.time()

  return bound(*args, **kwds)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [None]:
print(f"Total training time: {end_time - start_time:.2f} seconds")

In [52]:
import pickle

with open("/kaggle/working/knn_model.pkl", "wb") as f:
    pickle.dump(algo, f)

In [30]:
file_path_1 = "/kaggle/input/testset/cb_test_data.csv"
df_3 = pd.read_csv(file_path_1)
df_3 = df_3[['user_id', 'movie_id', 'rating']]


reader = Reader(rating_scale=(1, 5))

df_3 = df_3.drop_duplicates()

df_3 = df_3.dropna()


#using shared test data or rating_event.json
testset = Dataset.load_from_df(df_3, reader).build_full_trainset().build_testset()

In [31]:
predictions = algo.test(testset)

# Compute RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")

# Compute MAE
mae = accuracy.mae(predictions)
print(f"MAE: {mae}")

RMSE: 1.0921
RMSE: 1.0921320406343609
MAE:  0.8862
MAE: 0.8861559816822056


In [72]:
user_id = 61139 # The user ID for whom we want the recommendations
top_20_movie_ids, top_20_with_ratings = get_top_20_recommendations_for_user(user_id, df_2, top_movies)

print(f"Top 20 recommended movies for user {user_id}:")
print(top_20_movie_ids)
print(top_20_with_ratings)

Top 20 recommended movies for user 61139:
dial+m+for+murder+1954,a+city+of+sadness+1989,generation+kill+2008,under+the+skin+1997,los+angeles+plays+itself+2003,the+dark+knight+rises+2012,leon+the+professional+1994,la+notte+1961,cousin_+cousine+1975,alice+1990,the+shaolin+temple+1982,she-wolf+of+london+1946,first+love_+last+rites+1998,manakamana+2013,fat+man+and+little+boy+1989,swords+of+blood+1962,indictment+the+mcmartin+trial+1995,his+private+secretary+1933,war+of+the+shaolin+temple+1980,one+man+up+2001
dial+m+for+murder+1954: 5.00,a+city+of+sadness+1989: 4.50,generation+kill+2008: 4.50,under+the+skin+1997: 4.50,los+angeles+plays+itself+2003: 4.50,the+dark+knight+rises+2012: 4.50,leon+the+professional+1994: 4.33,la+notte+1961: 4.33,cousin_+cousine+1975: 4.00,alice+1990: 4.00,the+shaolin+temple+1982: 4.00,she-wolf+of+london+1946: 4.00,first+love_+last+rites+1998: 4.00,manakamana+2013: 4.00,fat+man+and+little+boy+1989: 4.00,swords+of+blood+1962: 4.00,indictment+the+mcmartin+trial+1995: 4

In [65]:
user_id = 41746 # The user ID for whom we want the recommendations
top_20_movie_ids, top_20_with_ratings = get_top_20_recommendations_for_user(user_id, df_2, top_movies)

print(f"Top 20 recommended movies for user {user_id}:")
print(top_20_movie_ids)
print(top_20_with_ratings)

{'princesses+2005', 'el+cantante+2006', 'the+equalizer+2014', 'pulp+fiction+1994', 'platoon+1986'}
Top 20 recommended movies for user 41746:
gifted+hands+the+ben+carson+story+2009,the+queen+2006,the+in-laws+1979,sleeping+beauty+1959,dances+with+wolves+1990,my+sassy+girl+2001,life+of+pi+2012,the+worlds+end+2013,indiana+jones+and+the+last+crusade+1989,gladiator+2000,willy+wonka++the+chocolate+factory+1971,blood+and+black+lace+1964,thor+the+dark+world+2013,armored+car+robbery+1950,cousin_+cousine+1975,alice+1990,the+shaolin+temple+1982,she-wolf+of+london+1946,please+vote+for+me+2007,first+love_+last+rites+1998
gifted+hands+the+ben+carson+story+2009: 4.75,the+queen+2006: 4.67,the+in-laws+1979: 4.50,sleeping+beauty+1959: 4.50,dances+with+wolves+1990: 4.50,my+sassy+girl+2001: 4.50,life+of+pi+2012: 4.50,the+worlds+end+2013: 4.50,indiana+jones+and+the+last+crusade+1989: 4.50,gladiator+2000: 4.33,willy+wonka++the+chocolate+factory+1971: 4.33,blood+and+black+lace+1964: 4.33,thor+the+dark+world+2

In [66]:
user_id = 3959  # The user ID for whom we want the recommendations
top_20_movie_ids, top_20_with_ratings = get_top_20_recommendations_for_user(user_id, df_2, top_movies)

print(f"Top 20 recommended movies for user {user_id}:")
print(top_20_movie_ids)
print(top_20_with_ratings)

{'the+rock+1996', 'just+sex+and+nothing+else+2005', 'willy+wonka++the+chocolate+factory+1971'}
Top 20 recommended movies for user 3959:
wrong+turn+5+bloodlines+2012,benjamin+blmchen+-+seine+schnsten+abenteuer+2006,i+still+know+what+you+did+last+summer+1998,un+chien+andalou+1929,sopyonje+1993,aftermath+1994,untitled+2009,blood+of+the+beasts+1949,fando+and+lis+1968,major+dundee+1965,angela+1995,zombie+high+1987,true+crime+1996,breast+men+1997,the+ugly+dachshund+1966,wuthering+heights+1970,english+vinglish+2012,command+decision+1948,bernie+2012,paradise+hope+2013
wrong+turn+5+bloodlines+2012: 3.73,benjamin+blmchen+-+seine+schnsten+abenteuer+2006: 3.73,i+still+know+what+you+did+last+summer+1998: 3.73,un+chien+andalou+1929: 3.73,sopyonje+1993: 3.73,aftermath+1994: 3.73,untitled+2009: 3.73,blood+of+the+beasts+1949: 3.73,fando+and+lis+1968: 3.73,major+dundee+1965: 3.73,angela+1995: 3.73,zombie+high+1987: 3.73,true+crime+1996: 3.73,breast+men+1997: 3.73,the+ugly+dachshund+1966: 3.73,wuthering+

In [67]:
user_id = 19755 # The user ID for whom we want the recommendations
top_20_movie_ids, top_20_with_ratings = get_top_20_recommendations_for_user(user_id, df_2, top_movies)

print(f"Top 20 recommended movies for user {user_id}:")
print(top_20_movie_ids)
print(top_20_with_ratings)

{'invitation+to+the+dance+1956'}
Top 20 recommended movies for user 19755:
anne+of+green+gables+1934,dark+angel+1990,the+girl+in+the+park+2007,drive+2011,wwii+in+hd+2009,the+good_+the+bad+and+the+ugly+1966,the+tie+that+binds+1995,the+bravados+1958,the+mugger+2007,the+country+girl+1954,the+war+is+over+1966,the+story+of+luke+2013,eyes+wide+shut+1999,muddy+river+1981,hot+summer+week+1972,drums+along+the+mohawk+1939,carolina+2003,the+pumpkin+eater+1964,violent+saturday+1955,in+july+2000
anne+of+green+gables+1934: 3.73,dark+angel+1990: 3.73,the+girl+in+the+park+2007: 3.73,drive+2011: 3.73,wwii+in+hd+2009: 3.73,the+good_+the+bad+and+the+ugly+1966: 3.73,the+tie+that+binds+1995: 3.73,the+bravados+1958: 3.73,the+mugger+2007: 3.73,the+country+girl+1954: 3.73,the+war+is+over+1966: 3.73,the+story+of+luke+2013: 3.73,eyes+wide+shut+1999: 3.73,muddy+river+1981: 3.73,hot+summer+week+1972: 3.73,drums+along+the+mohawk+1939: 3.73,carolina+2003: 3.73,the+pumpkin+eater+1964: 3.73,violent+saturday+1955: 3.7