In [3]:
!pip install scikit-surprise



In [4]:
!pip install chardet

# Import the required module
import chardet
# Open the movies.dat file for reading
with open('movies.dat', 'rb') as f:
    encoding = chardet.detect(f.read())['encoding']

# Use chardet to detect the encoding of the file

# Print the detected encoding
print(f"Detected encoding: {encoding}")

Detected encoding: ISO-8859-1


In [5]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

# Load data from .dat files
ratings_df = pd.read_csv('ratings.dat', sep='::', engine='python', header=None,
                         names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
movies_df = pd.read_csv('movies.dat', sep='::', engine='python', header=None,
                        names=['MovieID', 'Title', 'Genres'], encoding=encoding)

# Preprocessing: Create genre_ratings DataFrame
genre_ratings = []
for index, row in ratings_df.iterrows():
    movie_genres = movies_df[movies_df['MovieID'] == row['MovieID']]['Genres'].values[0].split('|')
    for genre in movie_genres:
        genre_ratings.append((row['UserID'], genre, row['Rating']))

genre_ratings_df = pd.DataFrame(genre_ratings, columns=['UserID', 'Genre', 'Rating'])

# Save genre ratings to 'genre.dat'
genre_ratings_df.to_csv('genre.dat', index=False)

In [6]:
# Convert genre_ratings_df to Surprise Dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(genre_ratings_df[['UserID', 'Genre', 'Rating']], reader)

# Train-test split
trainset, testset = train_test_split(data, test_size=0.2)


# K-means clustering
# Create a pivot table for genre ratings
pivot_table = pd.pivot_table(genre_ratings_df, values='Rating', index='UserID', columns='Genre', fill_value=0)

# Fit k-means clustering
kmeans = KMeans(n_clusters=5)  # Adjust the number of clusters as needed
kmeans.fit(pivot_table)

def recommend_movies(user_id, n=10):
    # Get the cluster label of the user
    user_cluster_label = kmeans.predict([pivot_table.loc[user_id]])

    # Get users within the same cluster
    cluster_users = kmeans.labels_ == user_cluster_label

    # Filter ratings of users in the same cluster
    cluster_ratings = pivot_table.iloc[cluster_users]

    # Calculate the average rating for each movie
    avg_ratings = cluster_ratings.mean()

    # Sort movies by average rating and recommend the top n movies
    top_movies = avg_ratings.sort_values(ascending=False).head(n)

    # Retrieve movie titles based on Genre
    recommended_movies = movies_df[movies_df['Genres'].str.contains(top_movies.index[0])]['Title']
    return recommended_movies

# Example usage:
user_id = 104
recommended_movies = recommend_movies(user_id)
print("Recommended movies for User id: ",user_id);
print(recommended_movies)





Recommended movies for User id:  104
162                      Devil in a Blue Dress (1995)
317                                     Suture (1993)
537                               Blade Runner (1982)
698                           Mulholland Falls (1996)
736                              Force of Evil (1948)
901                        Maltese Falcon, The (1941)
910     Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
918                                  Notorious (1946)
930                                      Laura (1944)
1054                                 Crossfire (1947)
1055                          Murder, My Sweet (1944)
1136                        He Walked by Night (1948)
1137                                  Raw Deal (1948)
1138                                     T-Men (1947)
1163                             Grifters, The (1990)
1228                             Touch of Evil (1958)
1232                                 Chinatown (1974)
1240                                         



In [7]:
import math

def compute_inverse_sigma(sigma_k):
    inverse_sigma_k = np.zeros_like(sigma_k.T)  # Initialize the inverse matrix with zeros
    for i in range(min(sigma_k.shape)):
        if sigma_k[i, i] != 0:
            inverse_sigma_k[i, i] = 1 / sigma_k[i, i]
    return inverse_sigma_k

# Implement Singular Value Decomposition (SVD)
def custom_svd(A):
    num_users = len(A)
    num_genres = len(A.columns)

    print(num_users,num_genres)

    # Transpose of A times A
    ATA = A.T @ A
    # AAT = A @ A.T

    print('ATA size {}'.format(ATA.shape))

    # Eigenvalues and eigenvectors of ATA
    eigenvalues, eigenvectors = np.linalg.eig(ATA)

    print("eigen values vectors done")

    # Sort eigenvalues in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalues = eigenvalues[sorted_indices]
    sorted_eigenvectors = eigenvectors[:, sorted_indices]

    print("sorted vectors")
    S=[[0 for _ in range(num_genres)] for _ in range(num_users)]
    for i in range(num_genres):
        S[i][i] = math.sqrt(sorted_eigenvalues[i])

    S=np.array(S)

    print('sigma size {}'.format(S.shape))

    print('A {} V {} S {}'.format(A.shape, sorted_eigenvectors.shape, S.shape))

    # U matrix
    U = A @ sorted_eigenvectors @ compute_inverse_sigma(S)
    Vt = sorted_eigenvectors
    print(U)
    return U, S, Vt

# Reduce dimensionality
def reduce_dimensionality(U, sigma, Vt, k):

     # Convert U to a list of lists
    U_list = U.values.tolist()

    U_k = np.array([row[:k] for row in U_list])
    sigma_k = np.array([[sigma[i][j] if i == j else 0.0 for j in range(k)] for i in range(k)])
    Vt_k = np.array([row[:k] for row in Vt])

    return U_k, sigma_k, Vt_k

# Reconstruct the user-item matrix
def reconstruct_matrix(U_k, sigma_k, Vt_k):
    reconstructed_matrix = [[0] * len(Vt_k[0]) for _ in range(len(U_k))]
    reconstructed_matrix=U_k @ sigma_k @ Vt_k.T
    return reconstructed_matrix

In [8]:
user_genre_avg_ratings = genre_ratings_df.groupby(['UserID', 'Genre'])['Rating'].mean().reset_index()

# Pivot the user_genre_avg_ratings to get the user-genre matrix with average ratings
user_genre_matrix = user_genre_avg_ratings.pivot_table(index='UserID', columns='Genre', values='Rating', fill_value=0)

# Display the user-genre matrix with average ratings
print("User-Genre Matrix with Average Ratings:")
print(user_genre_matrix)

User-Genre Matrix with Average Ratings:
Genre     Action  Adventure  Animation  Children's    Comedy     Crime  \
UserID                                                                   
1       4.200000   4.000000   4.111111    4.250000  4.142857  4.000000   
2       3.500000   3.736842   0.000000    0.000000  3.560000  3.583333   
3       3.956522   4.000000   4.000000    4.000000  3.766667  0.000000   
4       4.157895   3.833333   0.000000    4.000000  0.000000  5.000000   
5       2.612903   3.000000   4.000000    3.833333  3.410714  3.285714   
...          ...        ...        ...         ...       ...       ...   
6036    3.000000   2.987952   3.911765    3.444444  3.203065  3.528302   
6037    3.642857   4.000000   4.000000    3.666667  3.576271  3.833333   
6038    3.000000   4.000000   3.666667    3.000000  3.833333  0.000000   
6039    4.000000   4.100000   3.615385    3.529412  3.723077  4.000000   
6040    2.976190   2.818182   3.000000    4.000000  3.274510  3.920000  

In [9]:
U, sigma, Vt = custom_svd(user_genre_matrix)

print('U size {}'.format(U.shape))
print('sigma size {}'.format(sigma.shape))
print('Vt size {}'.format(Vt.shape))

6040 18
ATA size (18, 18)
eigen values vectors done
sorted vectors
sigma size (6040, 18)
A (6040, 18) V (18, 18) S (6040, 18)
            0         1         2         3         4         5         6     \
UserID                                                                         
1      -0.012447 -0.022036 -0.024687 -0.015701  0.004437  0.004703 -0.010629   
2      -0.011368 -0.005980  0.027606  0.023397 -0.013759  0.006680  0.004013   
3      -0.012886 -0.007925 -0.026097  0.015377 -0.009282  0.002040  0.001362   
4      -0.011002 -0.016658 -0.003863  0.018414 -0.029082  0.021962  0.010605   
5      -0.012109  0.017805 -0.001100 -0.006269 -0.001800 -0.023552  0.012652   
...          ...       ...       ...       ...       ...       ...       ...   
6036   -0.013105  0.018286 -0.002774 -0.004088 -0.003579 -0.002271  0.001645   
6037   -0.014788  0.013903 -0.005569 -0.003778 -0.008340  0.006504 -0.001174   
6038   -0.008391 -0.020671 -0.011349 -0.015537  0.002816 -0.002463  0.0251