In [2]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357288 sha256=d9512287ec45c353a62554da04e0aa1b9bfde511a4a384a2980a02f273e4c9b1
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succ

In [3]:
!pip install faiss-cpu shap


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
import faiss
import shap
import matplotlib.pyplot as plt

# Load MovieLens data
movies = pd.read_csv('/content/movie.csv')
ratings = pd.read_csv('/content/rating.csv')

# Merge data for ease of use
data = pd.merge(ratings, movies, on='movieId')
data.head()


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [6]:

data = data.dropna()

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

data['userId_encoded'] = user_encoder.fit_transform(data['userId'])
data['movieId_encoded'] = item_encoder.fit_transform(data['movieId'])
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,userId_encoded,movieId_encoded
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy,0,1
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,0,28
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,0,31
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,0,46
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,0,49


In [7]:
data['timestamp'] = pd.to_datetime(data['timestamp'])

data['date'] = data['timestamp'].dt.date
data['time'] = data['timestamp'].dt.time

print(data[['userId', 'movieId', 'rating', 'timestamp', 'date', 'time']].head())


   userId  movieId  rating           timestamp        date      time
0       1        2     3.5 2005-04-02 23:53:47  2005-04-02  23:53:47
1       1       29     3.5 2005-04-02 23:31:16  2005-04-02  23:31:16
2       1       32     3.5 2005-04-02 23:33:39  2005-04-02  23:33:39
3       1       47     3.5 2005-04-02 23:32:07  2005-04-02  23:32:07
4       1       50     3.5 2005-04-02 23:29:40  2005-04-02  23:29:40


In [8]:
data['date'] = pd.to_datetime(data['date'])

data['recency'] = (data['date'].max() - data['date']).dt.days

print(data[['userId', 'movieId', 'recency']].head())


   userId  movieId  recency
0       1        2     3650
1       1       29     3650
2       1       32     3650
3       1       47     3650
4       1       50     3650


In [9]:
data['recency'] = data['recency'] / data['recency'].max()

print(data[['userId', 'movieId', 'recency']].head())


   userId  movieId   recency
0       1        2  0.522399
1       1       29  0.522399
2       1       32  0.522399
3       1       47  0.522399
4       1       50  0.522399


In [10]:
user_freq = data.groupby('userId')['movieId'].count().reset_index().rename(columns={'movieId': 'frequency'})

data = pd.merge(data, user_freq[['userId', 'frequency']], on='userId', how='left')

print(data[['userId', 'movieId', 'recency', 'frequency']].head())


   userId  movieId   recency  frequency
0       1        2  0.522399        175
1       1       29  0.522399        175
2       1       32  0.522399        175
3       1       47  0.522399        175
4       1       50  0.522399        175


In [11]:
data['frequency'] = data['frequency'] / data['frequency'].max()

print(data[['userId', 'movieId', 'recency', 'frequency']].head())


   userId  movieId   recency  frequency
0       1        2  0.522399   0.023287
1       1       29  0.522399   0.023287
2       1       32  0.522399   0.023287
3       1       47  0.522399   0.023287
4       1       50  0.522399   0.023287


In [12]:
data['recency_frequency'] = data['recency'] * data['frequency']

print(data[['userId', 'movieId', 'recency', 'frequency', 'recency_frequency']].head())



   userId  movieId   recency  frequency  recency_frequency
0       1        2  0.522399   0.023287           0.012165
1       1       29  0.522399   0.023287           0.012165
2       1       32  0.522399   0.023287           0.012165
3       1       47  0.522399   0.023287           0.012165
4       1       50  0.522399   0.023287           0.012165


In [13]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

reader = Reader(rating_scale=(0, 5))
data_surprise = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

trainset, testset = train_test_split(data_surprise, test_size=0.2)

svd = SVD()

svd.fit(trainset)

predictions = svd.test(testset)

rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")


RMSE: 0.8126
RMSE: 0.8125537157950813


In [14]:
from surprise import accuracy
accuracy.rmse(predictions)




RMSE: 0.8126


0.8125537157950813

In [15]:
user_id = 1
movie_id = 10
predicted_rating = svd.predict(user_id, movie_id)
print(f"Predicted rating for user {user_id} and movie {movie_id}: {predicted_rating.est}")


Predicted rating for user 1 and movie 10: 3.6495419353561367


In [16]:


def apply_recency_boost(recommendation_scores, user_id):
    # Get the recency of the user
    user_recency = data[data['userId'] == user_id]['recency'].max()
    boosted_scores = recommendation_scores * (1 + user_recency)
    return boosted_scores


user_id = 1
recommendation_scores = np.random.rand(10)
boosted_scores = apply_recency_boost(recommendation_scores, user_id)
print(boosted_scores)


[1.24753212 0.43041783 0.75253164 0.59589659 0.84953173 0.52932359
 0.50888999 1.28887162 0.02306404 0.83925919]


In [19]:
data.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres',
       'userId_encoded', 'movieId_encoded', 'date', 'time', 'recency',
       'frequency', 'recency_frequency'],
      dtype='object')

In [20]:
from sklearn.cluster import KMeans


user_data = data[['userId', 'recency', 'frequency']].dropna()

kmeans = KMeans(n_clusters=3, random_state=42)
user_data['cluster'] = kmeans.fit_predict(user_data[['recency', 'frequency']])

print(user_data.head())


   userId   recency  frequency  cluster
0       1  0.522399   0.023287        0
1       1  0.522399   0.023287        0
2       1  0.522399   0.023287        0
3       1  0.522399   0.023287        0
4       1  0.522399   0.023287        0


In [22]:
import faiss
import numpy as np

vectors = np.random.random((1000, 128)).astype('float32')

index = faiss.IndexFlatL2(128)
index.add(vectors)

query_vector = np.random.random((1, 128)).astype('float32')
D, I = index.search(query_vector, 5)

print(f"Indices of top 5 similar items: {I}")
print(f"Distances of top 5 similar items: {D}")


Indices of top 5 similar items: [[239 684 179 355 883]]
Distances of top 5 similar items: [[14.919945 15.056761 15.424654 15.716432 15.876038]]


In [25]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler



if 'recency' not in data.columns or 'frequency' not in data.columns:
    raise ValueError("Data must include 'recency' and 'frequency' columns")

scaler = StandardScaler()
data[['recency', 'frequency']] = scaler.fit_transform(data[['recency', 'frequency']])

kmeans = KMeans(n_clusters=3, random_state=42)
data['cluster'] = kmeans.fit_predict(data[['recency', 'frequency']])

def recommend_movies(user_id):
    if user_id not in data['userId'].unique():
        print(f"User with ID {user_id} not found.")
        return

    user_cluster = data[data['userId'] == user_id]['cluster'].iloc[0]

    cluster_movies = data[data['cluster'] == user_cluster]

    recommended_movies = cluster_movies.groupby('title').size().reset_index(name='count')
    recommended_movies = recommended_movies.sort_values('count', ascending=False)

    print(f"Recommended movies for user {user_id}:")
    print(recommended_movies[['title', 'count']])

user_id_input = int(input("Enter your userId: "))

recommend_movies(user_id_input)


Enter your userId: 1
Recommended movies for user 1:
                                     title  count
9087                    Matrix, The (1999)   4482
12460     Shawshank Redemption, The (1994)   4213
4996                     Fight Club (1999)   4053
5263                   Forrest Gump (1994)   4046
11266                  Pulp Fiction (1994)   4039
...                                    ...    ...
4549                        Endgame (2009)      1
4554                      Endurance (1999)      1
4564                      Enfer, L' (1994)      1
4503   Emigrants, The (Utvandrarna) (1971)      1
4504              Emil i Lönneberga (1971)      1

[15775 rows x 2 columns]


In [26]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np


if 'recency' not in data.columns or 'frequency' not in data.columns:
    raise ValueError("Data must include 'recency' and 'frequency' columns")

scaler = StandardScaler()
data[['recency', 'frequency']] = scaler.fit_transform(data[['recency', 'frequency']])

kmeans = KMeans(n_clusters=5, random_state=42)
data['cluster'] = kmeans.fit_predict(data[['recency', 'frequency']])

def encode_genres(data):
    genres = data['genres'].str.split('|', expand=True).stack().unique()
    genre_dict = {genre: i for i, genre in enumerate(genres)}
    return genre_dict

genre_dict = encode_genres(data)
def genre_vectorize(row, genre_dict):
    genres = row.split('|')
    genre_vector = np.zeros(len(genre_dict))
    for genre in genres:
        genre_vector[genre_dict[genre]] = 1
    return genre_vector

data['genre_vector'] = data['genres'].apply(genre_vectorize, args=(genre_dict,))

data['combined_features'] = data.apply(lambda row: np.concatenate([row['genre_vector'], [row['recency'], row['frequency']]]), axis=1)

def recommend_movies(user_id):
    if user_id not in data['userId'].unique():
        print(f"User with ID {user_id} not found.")
        return

    user_cluster = data[data['userId'] == user_id]['cluster'].iloc[0]

    cluster_movies = data[data['cluster'] == user_cluster]

    cluster_movies = cluster_movies.sort_values(by=['recency', 'frequency'], ascending=[False, False])

    user_genre_vector = data[data['userId'] == user_id]['genre_vector'].iloc[0]

    def genre_similarity(movie_genre_vector):
        return np.dot(user_genre_vector, movie_genre_vector)

    cluster_movies['genre_similarity'] = cluster_movies['genre_vector'].apply(genre_similarity)

    cluster_movies = cluster_movies.sort_values(by=['genre_similarity', 'recency', 'frequency'], ascending=[False, False, False])

    recommended_movies = cluster_movies[['title', 'recency', 'frequency', 'genre_similarity']].head(5)

    print(f"Recommended movies for user {user_id}:")
    print(recommended_movies[['title', 'recency', 'frequency', 'genre_similarity']])

user_id_input = int(input("Enter your userId: "))

recommend_movies(user_id_input)


Enter your userId: 1
Recommended movies for user 1:
                                   title   recency  frequency  \
2164998    Lord of the Rings, The (1978)  0.599744   0.569430   
2074210                     Shrek (2001)  0.587947   0.484841   
83355              Monsters, Inc. (2001)  0.581512   0.499178   
732140          Wizard of Oz, The (1939)  0.580440   0.557960   
732175   Escape to Witch Mountain (1975)  0.580440   0.557960   

         genre_similarity  
2164998               3.0  
2074210               3.0  
83355                 3.0  
732140                3.0  
732175                3.0  
