In [143]:
from bleach import clean
from scipy.sparse import csr_matrix
import pandas as pd
from scipy.sparse.linalg import svds
import numpy as np

In [144]:
df = pd.read_csv('data/clean_data.csv')
clean_data = df[['ID', 'ProdID', 'Rating']]
clean_data




Unnamed: 0,ID,ProdID,Rating
0,1.705737e+09,2.0,0.0
1,9.500000e+01,76.0,0.0
2,8.000000e+00,8.0,4.5
3,4.000000e+00,3.0,0.0
4,9.900000e+02,3.0,0.0
...,...,...,...
4953,2.771000e+03,208.0,4.5
4954,0.000000e+00,1.0,3.9
4955,3.400000e+01,96.0,0.0
4956,9.000000e+00,7.0,0.0


In [145]:
user_item_matrix = clean_data.pivot_table(index='ID', columns='ProdID', values='Rating', fill_value=0)
user_item_matrix.shape

(1712, 1679)

In [146]:
user_item_matrix.head()

ProdID,0.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,...,1.007940e+42,1.007940e+42,1.008730e+42,1.030521e+42,1.030521e+42,1.030521e+42,1.076430e+42,3.002240e+42,5.002240e+42,5.005509e+42
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,2.84,1.376923,1.666667,1.5875,1.775,0.0,2.1125,1.975,1.0,1.011111,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
1.0,1.72,0.0,2.122222,2.753846,0.7,2.333333,3.325,2.35,0.0,2.422222,...,0.0,4.7,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
2.0,1.285714,1.88,3.0625,1.58,2.663636,1.4,0.766667,2.233333,1.336364,1.9,...,0.0,2.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.6
3.0,1.74,1.125,1.583333,2.675,0.785714,3.314286,1.433333,0.0,0.0,3.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,3.525,0.65,2.4625,1.96,1.55,1.942857,0.45,2.714286,1.866667,3.82,...,0.0,1.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [147]:
U, Sigma, Vt = np.linalg.svd(user_item_matrix, full_matrices=False)
# Xem kích thước của các ma trận
print("U shape:", U.shape)
print("Sigma shape:", Sigma.shape)
print("Vt shape:", Vt.shape)

U shape: (1712, 1679)
Sigma shape: (1679,)
Vt shape: (1679, 1679)


In [148]:
k = 650

U_k = U[:, :k]
Sigma_k = Sigma[:k]
Vt_k = Vt[:k, :]

In [149]:
print("U shape:", U_k.shape)
print("Sigma shape:", Sigma_k.shape)
print("Vt shape:", Vt_k.shape)

U shape: (1712, 650)
Sigma shape: (650,)
Vt shape: (650, 1679)


In [150]:
# Tạo ma trận ước tính với k yếu tố tiềm ẩn
Sigma_k_matrix = np.diag(Sigma_k)
user_item_matrix_approx_k = np.dot(np.dot(U_k, Sigma_k_matrix), Vt_k)

# Xem qua ma trận ước tính với k yếu tố tiềm ẩn
user_item_matrix_approx_k 

array([[ 2.84000000e+00,  1.37692308e+00,  1.66666667e+00, ...,
         0.00000000e+00, -9.71553567e-16,  7.89136551e-16],
       [ 1.72000000e+00, -8.77856815e-15,  2.12222222e+00, ...,
         0.00000000e+00, -1.30624678e-15, -2.73538787e-15],
       [ 1.28571429e+00,  1.88000000e+00,  3.06250000e+00, ...,
         0.00000000e+00, -8.50014503e-16,  4.60000000e+00],
       ...,
       [ 0.00000000e+00,  1.66774205e-32,  2.58020562e-32, ...,
         0.00000000e+00, -4.81867672e-32, -3.88006053e-17],
       [ 6.57243357e-16,  8.63567030e-16,  1.78933338e-15, ...,
         0.00000000e+00, -5.07840298e-16,  7.28643675e-17],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],
      shape=(1712, 1679))

In [151]:
# Clip giá trị trong phạm vi [1, 5]
user_item_matrix_approx_k_clipped = np.clip(user_item_matrix_approx_k, 0, 5)
user_item_matrix_approx_k_clipped

array([[2.84000000e+00, 1.37692308e+00, 1.66666667e+00, ...,
        0.00000000e+00, 0.00000000e+00, 7.89136551e-16],
       [1.72000000e+00, 0.00000000e+00, 2.12222222e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.28571429e+00, 1.88000000e+00, 3.06250000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 4.60000000e+00],
       ...,
       [0.00000000e+00, 1.66774205e-32, 2.58020562e-32, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.57243357e-16, 8.63567030e-16, 1.78933338e-15, ...,
        0.00000000e+00, 0.00000000e+00, 7.28643675e-17],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]],
      shape=(1712, 1679))

In [152]:
user_item_matrix_approx_k_clipped = pd.DataFrame(
    user_item_matrix_approx_k_clipped,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns
)

# Hiển thị DataFrame ước tính
user_item_matrix_approx_k_clipped.head()

ProdID,0.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,...,1.007940e+42,1.007940e+42,1.008730e+42,1.030521e+42,1.030521e+42,1.030521e+42,1.076430e+42,3.002240e+42,5.002240e+42,5.005509e+42
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,2.84,1.376923,1.666667,1.5875,1.775,0.0,2.1125,1.975,1.0,1.011111,...,0.0,2.254165e-15,2.647961e-22,4.236303e-15,0.0,2.364103e-15,5.0,0.0,0.0,7.891366e-16
1.0,1.72,0.0,2.122222,2.753846,0.7,2.333333,3.325,2.35,0.0,2.422222,...,0.0,4.7,1.118279e-22,6.556713e-16,5.0,0.0,0.0,0.0,0.0,0.0
2.0,1.285714,1.88,3.0625,1.58,2.663636,1.4,0.766667,2.233333,1.336364,1.9,...,0.0,2.35,0.0,3.295758e-15,0.0,4.953069e-15,0.0,0.0,0.0,4.6
3.0,1.74,1.125,1.583333,2.675,0.785714,3.314286,1.433333,0.0,0.0,3.54,...,0.0,5.79788e-15,0.0,2.06985e-15,0.0,4.701101e-16,0.0,0.0,0.0,0.0
4.0,3.525,0.65,2.4625,1.96,1.55,1.942857,0.45,2.714286,1.866667,3.82,...,0.0,1.333333,0.0,2.99847e-15,0.0,0.0,0.0,0.0,0.0,2.715249e-15


In [153]:
def recommend_products_svd(user_id, clean_data, user_item_matrix, user_item_matrix_approx_k_clipped, top_n=10):
    
    user_index = clean_data[clean_data['ID'] == user_id].index[0]
    # Bước 2: Lấy các sản phẩm mà người dùng chưa đánh giá
    unrated_products = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] == 0].index
    predicted_ratings = {}
    for product in unrated_products:
        predicted_ratings[product] = user_item_matrix_approx_k_clipped.loc[user_index, product]
    
    # Bước 4: Sắp xếp các sản phẩm theo rating dự đoán giảm dần
    sorted_predicted_ratings = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)
    
    # Bước 5: Trả về danh sách (prod_id, predicted_rating)
    recommendations = sorted_predicted_ratings[:top_n]
    return recommendations


In [154]:
user_id = 95

# Số lượng sản phẩm cần gợi ý
top_n = 10

# Gọi hàm để gợi ý
top_recommendations = recommend_products_svd(
    user_id=user_id,
    clean_data=clean_data,
    user_item_matrix=user_item_matrix,
    user_item_matrix_approx_k_clipped=user_item_matrix_approx_k_clipped,
    top_n=top_n
)

# Hiển thị kết quả
print("Top sản phẩm gợi ý:")
for prod_id, rating in top_recommendations:
    print(f"Product ID: {prod_id}, Predicted Rating: {rating:.2f}")

Top sản phẩm gợi ý:
Product ID: 70.0, Predicted Rating: 5.00
Product ID: 87.0, Predicted Rating: 5.00
Product ID: 407.0, Predicted Rating: 5.00
Product ID: 608.0, Predicted Rating: 5.00
Product ID: 63173.0, Predicted Rating: 5.00
Product ID: 1.0305210044194794e+42, Predicted Rating: 5.00
Product ID: 114.0, Predicted Rating: 4.90
Product ID: 67088.0, Predicted Rating: 4.90
Product ID: 456.0, Predicted Rating: 4.80
Product ID: 507.0, Predicted Rating: 4.80


In [155]:
def calculate_ratio(top_recommendations, lowerbound_rating=3.5):
    # Bước 1: Đếm số lượng sản phẩm có rating > lowerbound_rating
    count_above_lowerbound = sum(1 for _, rating in top_recommendations if rating > lowerbound_rating)
    
    # Bước 2: Tính tỷ lệ
    ratio = count_above_lowerbound / len(top_recommendations) if len(top_recommendations) > 0 else 0
    
    return ratio

In [156]:
ratio = calculate_ratio(top_recommendations)

# Hiển thị kết quả
print(f"Ratio of products useful: {ratio:.2f}")

Ratio of products useful: 1.00


In [157]:
def evaluate_all_users(clean_data, user_item_matrix, user_item_matrix_approx_k_clipped, top_n=10, lowerbound_rating=3):
    # List to store the ratios for all users
    ratios = []
    
    # Loop through all unique user IDs in clean_data['ID']
    for user_id in clean_data['ID'].unique():
        try:
            # Generate top recommendations for the current user
            top_recommendations = recommend_products_svd(
                user_id=user_id,
                clean_data=clean_data,
                user_item_matrix=user_item_matrix,
                user_item_matrix_approx_k_clipped=user_item_matrix_approx_k_clipped,
                top_n=top_n
            )
            
            # Calculate the ratio for the current user's recommendations
            ratio = calculate_ratio(top_recommendations, lowerbound_rating)
            
            # Append the ratio to the list
            ratios.append(ratio)
        
        except KeyError as e:
            print(f"User ID {user_id} caused a KeyError: {e}") # Add None for users with issues
            
        except Exception as e:
            print(f"An error occurred for User ID {user_id}: {e}")
    
    # Filter out None values from the ratios list
    valid_ratios = [r for r in ratios if r is not None]
    
    # Print all ratios
    print("Ratios for each user:", valid_ratios)
    
    # Create a pandas Series for statistical description
    ratios_df = pd.Series(valid_ratios)
    
    # Print descriptive statistics for the ratios
    print("\nDescriptive Statistics:")
    print(ratios_df.describe())
    
    return ratios_df

# Example usage:
ratios_df = evaluate_all_users(clean_data, user_item_matrix, user_item_matrix_approx_k_clipped)

User ID 2324.0 caused a KeyError: np.int64(104)
User ID 80.0 caused a KeyError: np.int64(109)
User ID 43.0 caused a KeyError: np.int64(112)
User ID 4996044151.0 caused a KeyError: np.int64(117)
User ID 73.0 caused a KeyError: np.int64(118)
User ID 490.0 caused a KeyError: np.int64(119)
User ID 306.0 caused a KeyError: np.int64(122)
User ID 99.0 caused a KeyError: np.int64(124)
User ID 826.0 caused a KeyError: np.int64(128)
User ID 30.0 caused a KeyError: np.int64(135)
User ID 25182956.0 caused a KeyError: np.int64(137)
User ID 14.0 caused a KeyError: np.int64(140)
User ID 51768.0 caused a KeyError: np.int64(142)
User ID 92592.0 caused a KeyError: np.int64(145)
User ID 6956489.0 caused a KeyError: np.int64(146)
User ID 202.0 caused a KeyError: np.int64(148)
User ID 167380.0 caused a KeyError: np.int64(149)
User ID 987.0 caused a KeyError: np.int64(151)
User ID 90.0 caused a KeyError: np.int64(155)
User ID 53.0 caused a KeyError: np.int64(163)
User ID 21637.0 caused a KeyError: np.int64(