In [1]:
%pip install scikit-surprise
import re
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans, accuracy
from surprise.model_selection import train_test_split
import json



Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m112.6/154.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505177 sha256=551e9e4a6cd92a3105b8254c28e333baef199401280d8c77875d0f712dcf2519
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
def parse_log_file(txt_file_path):

    rating_events = []

    rating_pattern = re.compile(r'GET /rate/([^=]+)=(\d+)')

    with open(txt_file_path, 'r') as f:
        for line in f:
            line = line.strip()
            # Each line is expected to have 3 comma-separated parts
            parts = line.split(',')
            if len(parts) != 3:
                continue
            timestamp, user_id, request = parts
            user_id = user_id.strip()

            # Check for rating event using regex
            rating_match = rating_pattern.search(request)
            if rating_match:
                movie_id = rating_match.group(1)
                rating = int(rating_match.group(2))
                rating_events.append({
                    'timestamp': timestamp,
                    'user_id': user_id,
                    'movie_id': movie_id,
                    'rating': rating
                })
            # Optionally, you can also handle watch events if you want to derive implicit feedback.
            # For now, we only use explicit rating events.

    return pd.DataFrame(rating_events)

In [15]:
def parse_log_file(json_file_path):

      rating_events = []

      # Open and load JSON file
      with open(json_file_path, 'r') as f:
          data = json.load(f)  # Load JSON data

      # Iterate through JSON objects
      for entry in data:
          if entry.get("type") == "rating":  # Ensure it's a rating event
              rating_events.append({
                  'timestamp': entry.get("raw").split(',')[0],  # Extract timestamp from raw field
                  'user_id': entry["user_details"].get("user_id"),
                  'age': entry["user_details"].get("age"),
                  'occupation': entry["user_details"].get("occupation"),
                  'gender': entry["user_details"].get("gender"),
                  'movie_id': entry.get("movieid"),
                  'rating': int(entry.get("rating"))  # Convert rating to integer
              })

      # Convert list to DataFrame
      return pd.DataFrame(rating_events)


In [88]:
# 1. Parse the Log File
log_file_path = '/content/drive/MyDrive/rating_events.json'
df = parse_log_file(log_file_path)

reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(df[['user_id', 'movie_id', 'rating']], reader)


In [89]:
#2. Split the Data
trainset, testset = train_test_split(data, test_size=0.2)

In [90]:
sim_options = {
    'name': 'cosine',
    'user_based': True
}

In [91]:
algo = KNNWithMeans(sim_options=sim_options)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f084a1b0a50>

In [59]:
def get_top_n(predictions, n=20):

    # Create a dictionary of predictions for each user
    top_n = {}

    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Sort the predictions
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [180]:
def get_top_20_recommendations_for_user(user_id, df_predict):
    all_movie_ids = set(df_predict['movie_id'].unique())

    # Get the list of movies that the user has already rated
    rated_movies = set(df_predict[df_predict['user_id'] == user_id]['movie_id'])

    # Get the list of unrated movies for this user
    unrated_movies = all_movie_ids - rated_movies

    # Generate predictions for all unrated movies
    predictions = []
    for movie_id in unrated_movies:
        prediction = algo.predict(user_id, movie_id)
        predictions.append((movie_id, prediction.est))

    # Sort the predictions
    predictions.sort(key=lambda x: x[1], reverse = True)

    top_20 = predictions[:20]

    # Get the top 20 movie IDs
    top_20_movie_ids = [str(movie_id) for movie_id, _ in top_20]
    top_20_movie_ids_output = ",".join(top_20_movie_ids)

    top_20_with_ratings = [f"{movie_id}: {rating:.2f}" for movie_id, rating in top_20]
    top_20_with_ratings_output = ",".join(top_20_with_ratings)

    return top_20_movie_ids_output, top_20_with_ratings_output

Shared train and test data

In [193]:
file_path = '/content/drive/MyDrive/cb_train_data.csv'
df_2 = pd.read_csv(file_path)
df_2 = df_2[['user_id', 'movie_id', 'rating']]

file_path_1 = "/content/drive/MyDrive/cb_test_data.csv"
df_3 = pd.read_csv(file_path_1)
df_3 = df_3[['user_id', 'movie_id', 'rating']]

df = df[['user_id', 'movie_id', 'rating']]

reader = Reader(rating_scale=(1, 5))

df_2 = df_2.drop_duplicates()
df_3 = df_3.drop_duplicates()

df_2 = df_2.dropna()
df_3 = df_3.dropna()

trainset = Dataset.load_from_df(df_2, reader).build_full_trainset()

#using shared test data or rating_event.json
testset_shared = Dataset.load_from_df(df, reader).build_full_trainset().build_testset()
testset_rating = Dataset.load_from_df(df_3, reader).build_full_trainset().build_testset()

In [190]:
algo = KNNWithMeans(sim_options=sim_options)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f084253ec90>

In [191]:
predictions = algo.test(testset_shared)

# Compute RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")

# Compute MAE
mae = accuracy.mae(predictions)
print(f"MAE: {mae}")

RMSE: 1.0936
RMSE: 1.09364249156831
MAE:  0.9065
MAE: 0.9065131498138723


In [192]:
predictions = algo.test(testset_rating)

# Compute RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")

# Compute MAE
mae = accuracy.mae(predictions)
print(f"MAE: {mae}")

RMSE: 0.2324
RMSE: 0.23236940265080647
MAE:  0.0737
MAE: 0.07371159209953466


In [184]:
user_id = 102833 # The user ID for whom we want the recommendations
top_20_movie_ids, top_20_with_ratings = get_top_20_recommendations_for_user(user_id, df_2)

print(f"Top 20 recommended movies for user {user_id}:")
print(top_20_movie_ids)
print(top_20_with_ratings)

Top 20 recommended movies for user 102833:
pleasantville+1998,operation+dumbo+drop+1995,last+action+hero+1993,the+6th+day+2000,all+quiet+on+the+western+front+1930,halloween+h20+1998,bringing+up+baby+1938,american+gangster+2007,the+brady+bunch+movie+1995,over+the+hedge+2006,rain+man+1988,the+running+man+1987,ferris+buellers+day+off+1986,the+legend+of+drunken+master+1994,the+talented+mr.+ripley+1999,the+dark+knight+2008,the+machinist+2004,blade+1998,kick-ass+2010,grumpier+old+men+1995
pleasantville+1998: 4.00,operation+dumbo+drop+1995: 4.00,last+action+hero+1993: 4.00,the+6th+day+2000: 4.00,all+quiet+on+the+western+front+1930: 4.00,halloween+h20+1998: 4.00,bringing+up+baby+1938: 4.00,american+gangster+2007: 4.00,the+brady+bunch+movie+1995: 4.00,over+the+hedge+2006: 4.00,rain+man+1988: 4.00,the+running+man+1987: 4.00,ferris+buellers+day+off+1986: 4.00,the+legend+of+drunken+master+1994: 4.00,the+talented+mr.+ripley+1999: 4.00,the+dark+knight+2008: 4.00,the+machinist+2004: 4.00,blade+1998: 

In [183]:
user_id = 6566 # The user ID for whom we want the recommendations
top_20_movie_ids, top_20_with_ratings = get_top_20_recommendations_for_user(user_id, df_2)

print(f"Top 20 recommended movies for user {user_id}:")
print(top_20_movie_ids)
print(top_20_with_ratings)

Top 20 recommended movies for user 6566:
over+the+hedge+2006,the+blues+brothers+1980,zombieland+2009,the+naked+gun+from+the+files+of+police+squad+1988,inception+2010,serendipity+2001,jack+1996,harold+and+maude+1971,annie+hall+1977,tampopo+1985,the+apartment+1960,pleasantville+1998,operation+dumbo+drop+1995,last+action+hero+1993,the+6th+day+2000,all+quiet+on+the+western+front+1930,halloween+h20+1998,bringing+up+baby+1938,american+gangster+2007,the+brady+bunch+movie+1995
over+the+hedge+2006: 5.00,the+blues+brothers+1980: 4.88,zombieland+2009: 4.76,the+naked+gun+from+the+files+of+police+squad+1988: 4.76,inception+2010: 4.76,serendipity+2001: 4.73,jack+1996: 4.73,harold+and+maude+1971: 4.21,annie+hall+1977: 4.21,tampopo+1985: 4.21,the+apartment+1960: 4.21,pleasantville+1998: 3.88,operation+dumbo+drop+1995: 3.88,last+action+hero+1993: 3.88,the+6th+day+2000: 3.88,all+quiet+on+the+western+front+1930: 3.88,halloween+h20+1998: 3.88,bringing+up+baby+1938: 3.88,american+gangster+2007: 3.88,the+br

In [181]:
user_id = 32206  # The user ID for whom we want the recommendations
top_20_movie_ids, top_20_with_ratings = get_top_20_recommendations_for_user(user_id, df_2)

print(f"Top 20 recommended movies for user {user_id}:")
print(top_20_movie_ids)
print(top_20_with_ratings)

Top 20 recommended movies for user 32206:
the+matrix+1999,enter+the+dragon+1973,pleasantville+1998,operation+dumbo+drop+1995,last+action+hero+1993,the+6th+day+2000,all+quiet+on+the+western+front+1930,halloween+h20+1998,bringing+up+baby+1938,american+gangster+2007,the+brady+bunch+movie+1995,over+the+hedge+2006,rain+man+1988,the+running+man+1987,ferris+buellers+day+off+1986,the+legend+of+drunken+master+1994,the+talented+mr.+ripley+1999,the+dark+knight+2008,the+machinist+2004,blade+1998
the+matrix+1999: 4.31,enter+the+dragon+1973: 4.31,pleasantville+1998: 3.71,operation+dumbo+drop+1995: 3.71,last+action+hero+1993: 3.71,the+6th+day+2000: 3.71,all+quiet+on+the+western+front+1930: 3.71,halloween+h20+1998: 3.71,bringing+up+baby+1938: 3.71,american+gangster+2007: 3.71,the+brady+bunch+movie+1995: 3.71,over+the+hedge+2006: 3.71,rain+man+1988: 3.71,the+running+man+1987: 3.71,ferris+buellers+day+off+1986: 3.71,the+legend+of+drunken+master+1994: 3.71,the+talented+mr.+ripley+1999: 3.71,the+dark+knig

In [187]:
user_id = 19755 # The user ID for whom we want the recommendations
top_20_movie_ids, top_20_with_ratings = get_top_20_recommendations_for_user(user_id, df_2)

print(f"Top 20 recommended movies for user {user_id}:")
print(top_20_movie_ids)
print(top_20_with_ratings)

Top 20 recommended movies for user 19755:
galaxy+quest+1999,harry+potter+and+the+philosophers+stone+2001,blade+runner+1982,harold+and+maude+1971,annie+hall+1977,tampopo+1985,the+apartment+1960,wild+things+1998,pleasantville+1998,operation+dumbo+drop+1995,last+action+hero+1993,the+6th+day+2000,all+quiet+on+the+western+front+1930,halloween+h20+1998,bringing+up+baby+1938,american+gangster+2007,the+brady+bunch+movie+1995,over+the+hedge+2006,rain+man+1988,the+running+man+1987
galaxy+quest+1999: 5.00,harry+potter+and+the+philosophers+stone+2001: 5.00,blade+runner+1982: 4.84,harold+and+maude+1971: 4.44,annie+hall+1977: 4.44,tampopo+1985: 4.44,the+apartment+1960: 4.44,wild+things+1998: 4.24,pleasantville+1998: 4.11,operation+dumbo+drop+1995: 4.11,last+action+hero+1993: 4.11,the+6th+day+2000: 4.11,all+quiet+on+the+western+front+1930: 4.11,halloween+h20+1998: 4.11,bringing+up+baby+1938: 4.11,american+gangster+2007: 4.11,the+brady+bunch+movie+1995: 4.11,over+the+hedge+2006: 4.11,rain+man+1988: 4.