<a href="https://colab.research.google.com/github/TranThiDieuHien/Cac-thuat-toan-toi-uu/blob/main/MatrixFactorize_Proximal_Gradient_Descent_Lite.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Cài đặt thư viện

In [None]:
import pandas as pd
!pip install tabml -q
import tabml.datasets
from sklearn.model_selection import train_test_split

# Download the dataset once and pull out the three tables used by the rest of
# the notebook.
df_dict = tabml.datasets.download_movielen_1m()
users = df_dict["users"]
movies = df_dict["movies"]
ratings = df_dict["ratings"]

# Hold out 10% of the ratings for validation; the fixed seed keeps the split
# identical between runs.
train_ratings, validation_ratings = train_test_split(ratings, test_size=0.1, random_state=42)

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.9/532.9 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.5/357.5 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.9/381.9 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

# 2. Mô tả tập dữ liệu

In [None]:
# Distinct user ids: every known user vs. those with at least one held-out rating.
all_users = users["UserID"].unique()
users_in_validation = validation_ratings["UserID"].unique()

print(f"There are {len(users_in_validation)} users in validation set.")
print(f"Total number of users: {len(all_users)}")

There are 5970 users in validation set.
Total number of users: 6040


In [None]:
# Map each MovieID to its positional row in `movies` (and therefore in any
# array built row-by-row from `movies`).
movie_index_by_id = {
    movie_id: position for position, movie_id in enumerate(movies["MovieID"])
}

In [None]:
import numpy as np

# The 18 genre labels; their order in this list fixes the column order of
# `movie_features`.
genres = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
genre_index_by_name = dict(zip(genres, range(len(genres))))

# Binary indicator matrix: one row per movie, one column per genre.
# Each movie's "Genres" cell is a "|"-separated list of labels.
movie_features = np.zeros((len(movies), len(genres)))
for row, genre_string in enumerate(movies["Genres"]):
    columns = [genre_index_by_name[label] for label in genre_string.split("|")]
    movie_features[row, columns] = 1

# 3. Xây dựng mô hình đề xuất phim dựa trên Proximal Gradient Descent (PG Method)

In [None]:
import time
import pandas as pd
import tabml.datasets
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error

class PGRegressor:
    """L1-regularised linear regression fitted by proximal gradient descent.

    The model minimises

        (1 / (2 * m)) * ||X w + b - y||^2  +  regularization_term * ||w||_1

    where ``m`` is the number of samples. Each iteration takes a gradient step
    on the squared-error term and then applies soft-thresholding.

    Parameters
    ----------
    learning_rate : float
        Step size of the gradient step.
    regularization_term : float
        Weight of the L1 penalty on the feature coefficients.
    max_iter : int
        Maximum number of proximal gradient iterations.
    tol : float
        Stop once the norm of the generalised gradient
        ``(theta_old - theta_new) / learning_rate`` drops below this value.
        Without a penalty this is exactly the plain gradient norm.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,), or None before ``fit``
    intercept_ : float, or None before ``fit``
    """

    def __init__(self, learning_rate=0.1, regularization_term=0.1, max_iter=1000, tol=1e-3):
        self.learning_rate = learning_rate
        self.regularization_term = regularization_term
        self.max_iter = max_iter
        self.tol = tol
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        """Estimate ``coef_`` and ``intercept_`` from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Converted to a plain float array, so a pandas Series with an
            arbitrary index is accepted.
        """
        features = np.asarray(X, dtype=float)
        targets = np.asarray(y, dtype=float)
        design = np.c_[np.ones(features.shape[0]), features]  # column 0 is the bias

        # The soft-threshold level must be the non-negative constant
        # learning_rate * regularization_term. Scaling it by sign(theta) would
        # inflate negative coefficients and never threshold coefficients at zero.
        threshold = np.full(design.shape[1], self.learning_rate * self.regularization_term)
        threshold[0] = 0.0  # the intercept is not penalised

        theta = np.zeros(design.shape[1])
        for _ in range(self.max_iter):
            gradients = compute_gradients(design, targets, theta)
            updated = proximal_operator(theta - self.learning_rate * gradients, threshold)
            step_norm = np.linalg.norm(updated - theta)
            theta = updated

            # Same test as ||generalised gradient|| < tol, written without a
            # division. The raw gradient of the squared error does not vanish
            # at an L1-penalised optimum, so it cannot be the stopping signal.
            if step_norm < self.tol * self.learning_rate:
                break

        self.intercept_ = theta[0]
        self.coef_ = theta[1:]

    def predict(self, X):
        """Return ``X @ coef_ + intercept_`` for a (n_samples, n_features) array."""
        return X @ self.coef_ + self.intercept_

def compute_gradients(X, y, theta):
    """Return ``(1 / m) * X.T @ (X @ theta - y)`` with ``m = len(y)``.

    This is the gradient with respect to ``theta`` of the halved mean squared
    error ``(1 / (2 * m)) * ||X @ theta - y||^2``.
    """
    sample_count = len(y)
    residuals = X @ theta - y
    return X.T.dot(residuals) * (1 / sample_count)

def proximal_operator(u, proximal_term):
    """Soft-threshold ``u`` element-wise by ``proximal_term``.

    Each magnitude is reduced by ``proximal_term`` and floored at zero, then the
    original sign of ``u`` is restored.
    """
    shrunk_magnitude = np.maximum(0, np.abs(u) - proximal_term)
    direction = np.sign(u)
    return direction * shrunk_magnitude

# Start the wall-clock timer for the whole pipeline; the elapsed time is
# reported after evaluation. Without this assignment `start_time` is undefined
# on a fresh kernel.
start_time = time.time()

# Download the dataset and pull out the three tables.
df_dict = tabml.datasets.download_movielen_1m()
users, movies, ratings = df_dict["users"], df_dict["movies"], df_dict["ratings"]

# Hold out 10% of the ratings for validation (fixed seed for reproducibility).
train_ratings, validation_ratings = train_test_split(
    ratings, test_size=0.1, random_state=42
)

users_in_validation = validation_ratings["UserID"].unique()
all_users = users["UserID"].unique()

print(f"There are {len(users_in_validation)} users in the validation set.")
print(f"Total number of users: {len(all_users)}")

# MovieID -> positional row in `movies` / `movie_features`.
movie_index_by_id = {movie_id: i for i, movie_id in enumerate(movies["MovieID"])}

# Genre labels; list order defines the column order of `movie_features`.
genres = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
genre_index_by_name = {name: i for i, name in enumerate(genres)}

# Binary indicator matrix: one row per movie, one column per genre.
# Each movie's "Genres" cell is a "|"-separated list of labels.
movie_features = np.zeros((len(movies), len(genres)))
for i, movie_genres in enumerate(movies["Genres"]):
    for genre in movie_genres.split("|"):
        genre_index = genre_index_by_name[genre]
        movie_features[i, genre_index] = 1

def train_user_model(user_id):
    """Fit and return a PGRegressor for one user.

    The training rows are the genre feature vectors of the movies that
    `user_id` rated in `train_ratings`; the targets are those ratings.
    """
    rated_by_user = train_ratings.loc[train_ratings["UserID"] == user_id]
    feature_rows = [
        movie_index_by_id[rated_movie] for rated_movie in rated_by_user["MovieID"]
    ]
    regressor = PGRegressor(learning_rate=0.1, regularization_term=0.1, max_iter=1000, tol=1e-3)
    regressor.fit(movie_features[feature_rows], rated_by_user["Rating"])
    return regressor

# Fit one independent regressor per distinct user id, keyed by that id.
user_model_dict = {
    uid: train_user_model(uid) for uid in users["UserID"].unique()
}

def predict(user_id, movie_id):
    """Predict the rating `user_id` would give `movie_id`, clipped to [1, 5].

    Looks up the movie's genre feature row, scores it with the user's fitted
    model and always returns a plain Python float. Mixing a length-1 array
    (in range) with a bare int (clipped) would make the results awkward to
    collect into a single numeric array.
    """
    movie_feature = movie_features[movie_index_by_id[movie_id]].reshape((1, -1))
    raw_prediction = user_model_dict[user_id].predict(movie_feature)
    # The model returns one value per input row; exactly one row was passed.
    return float(np.clip(np.ravel(raw_prediction)[0], 1, 5))

def eval_rmse(ratings):
    """Return the root-mean-square error of `predict` on a ratings table.

    `ratings` must have "UserID", "MovieID" and "Rating" columns. Each
    prediction is reduced to a single float, so it does not matter whether
    `predict` returns a scalar or a length-1 array.
    """
    predictions = np.empty(len(ratings), dtype=float)
    pairs = zip(ratings["UserID"], ratings["MovieID"])
    for row, (rated_user, rated_movie) in enumerate(pairs):
        value = np.asarray(predict(rated_user, rated_movie), dtype=float)
        predictions[row] = value.reshape(-1)[0]
    actual = ratings["Rating"].to_numpy(dtype=float)
    return float(np.sqrt(np.mean((predictions - actual) ** 2)))


print(f"RMSE train: {eval_rmse(train_ratings)}")
print(f"RMSE validation: {eval_rmse(validation_ratings)}")

# `start_time` must have been set with time.time() before the pipeline started.
end_time = time.time()
execution_time = end_time - start_time

print(f"Thời gian chạy của chương trình là: {execution_time} giây.")


There are 5970 users in the validation set.
Total number of users: 6040
RMSE train: 1.4367441863247723
RMSE validation: 1.5659809079196119
Thời gian chạy của chương trình là: 2892.0366683006287 giây.


# 4. Thực hiện kiểm tra mô hình với id người dùng 160 (user_id = 160)

In [None]:
# Print the fitted per-genre coefficient of one user's model, one genre per line.
user_id = 160
genre_weights = zip(genres, user_model_dict[user_id].coef_)
for genre, coef in genre_weights:
    print("{:15s}: {:.3f}".format(genre, coef))

Action         : -0.531
Adventure      : 0.989
Animation      : 0.124
Children's     : -0.219
Comedy         : -0.969
Crime          : 0.438
Documentary    : 0.000
Drama          : 0.026
Fantasy        : -0.343
Film-Noir      : 0.000
Horror         : 0.018
Musical        : 0.124
Mystery        : 0.000
Romance        : 1.001
Sci-Fi         : -0.437
Thriller       : -0.988
War            : -1.993
Western        : 0.000
