In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from scipy.sparse import coo_matrix
from spacy.cli.train import train

In [2]:
df = pd.read_csv('data/clean_data.csv')

In [3]:
clean_data = df[['ID', 'ProdID', 'Rating']]
clean_data   

Unnamed: 0,ID,ProdID,Rating
0,1.705737e+09,2.0,0.0
1,9.500000e+01,76.0,0.0
2,8.000000e+00,8.0,4.5
3,4.000000e+00,3.0,0.0
4,9.900000e+02,3.0,0.0
...,...,...,...
4953,2.771000e+03,208.0,4.5
4954,0.000000e+00,1.0,3.9
4955,3.400000e+01,96.0,0.0
4956,9.000000e+00,7.0,0.0


In [4]:
user_item_matrix = clean_data.pivot_table(index='ID', columns='ProdID', values='Rating', fill_value=0)
user_item_matrix.head()

ProdID,0.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,...,1.007940e+42,1.007940e+42,1.008730e+42,1.030521e+42,1.030521e+42,1.030521e+42,1.076430e+42,3.002240e+42,5.002240e+42,5.005509e+42
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,2.84,1.376923,1.666667,1.5875,1.775,0.0,2.1125,1.975,1.0,1.011111,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
1.0,1.72,0.0,2.122222,2.753846,0.7,2.333333,3.325,2.35,0.0,2.422222,...,0.0,4.7,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
2.0,1.285714,1.88,3.0625,1.58,2.663636,1.4,0.766667,2.233333,1.336364,1.9,...,0.0,2.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.6
3.0,1.74,1.125,1.583333,2.675,0.785714,3.314286,1.433333,0.0,0.0,3.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,3.525,0.65,2.4625,1.96,1.55,1.942857,0.45,2.714286,1.866667,3.82,...,0.0,1.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
non_zero_values = (user_item_matrix > 0).sum().sum()  # Tính tổng số giá trị khác 0 (có rating)
total_values = user_item_matrix.size  # Tổng số ô trong ma trận (tất cả các đánh giá của người dùng)

# Tính mật độ thưa
density = non_zero_values / total_values

# In kết quả mật độ thưa
print(f"Density of the user-item matrix: {density:.4f}")

Density of the user-item matrix: 0.0007


In [6]:
user_means = user_item_matrix.mean(axis=1)

# Trừ đi giá trị trung bình của mỗi người dùng từ tất cả các rating của họ
normalized_user_item_matrix = user_item_matrix.subtract(user_means, axis=0)

# Kiểm tra ma trận chuẩn hóa
normalized_user_item_matrix.head()

ProdID,0.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,...,1.007940e+42,1.007940e+42,1.008730e+42,1.030521e+42,1.030521e+42,1.030521e+42,1.076430e+42,3.002240e+42,5.002240e+42,5.005509e+42
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,2.692515,1.229438,1.519181,1.440015,1.627515,-0.147485,1.965015,1.827515,0.852515,0.863626,...,-0.147485,-0.147485,-0.147485,-0.147485,-0.147485,-0.147485,4.852515,-0.147485,-0.147485,-0.147485
1.0,1.591306,-0.128694,1.993529,2.625153,0.571306,2.20464,3.196306,2.221306,-0.128694,2.293529,...,-0.128694,4.571306,-0.128694,-0.128694,4.871306,-0.128694,-0.128694,-0.128694,-0.128694,-0.128694
2.0,1.177421,1.771706,2.954206,1.471706,2.555343,1.291706,0.658373,2.12504,1.22807,1.791706,...,-0.108294,2.241706,-0.108294,-0.108294,-0.108294,-0.108294,-0.108294,-0.108294,-0.108294,4.491706
3.0,1.614401,0.999401,1.457735,2.549401,0.660116,3.188687,1.307735,-0.125599,-0.125599,3.414401,...,-0.125599,-0.125599,-0.125599,-0.125599,-0.125599,-0.125599,-0.125599,-0.125599,-0.125599,-0.125599
4.0,3.398908,0.523908,2.336408,1.833908,1.423908,1.816765,0.323908,2.588194,1.740575,3.693908,...,-0.126092,1.207242,-0.126092,-0.126092,-0.126092,-0.126092,-0.126092,-0.126092,-0.126092,-0.126092


In [7]:
user_item_matrix_np = normalized_user_item_matrix.values

# Tính toán cosine similarity giữa các người dùng
cosine_similarities = cosine_similarity(user_item_matrix_np)

# Chuyển đổi cosine_similarities thành DataFrame để dễ dàng kiểm tra
cosine_similarities_df = pd.DataFrame(cosine_similarities, index=user_item_matrix.index, columns=user_item_matrix.index)

# Hiển thị kết quả
cosine_similarities_df.head()

ID,0.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,9.000000e+00,...,2.798841e+12,3.517758e+12,4.799034e+12,8.186981e+12,8.684094e+12,9.967409e+13,6.094514e+14,3.007694e+17,3.081297e+19,8.148290e+23
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,1.0,0.05678,0.074674,0.080071,0.051679,0.056129,0.062366,0.073664,0.081859,0.040365,...,0.027585,0.049157,0.027585,0.0,-0.004772,-0.004772,0.0,-0.004772,0.027585,0.0
1.0,0.05678,1.0,0.07005,0.114,0.090716,0.080503,0.155968,0.053271,0.149287,0.067203,...,-0.004379,0.06784,-0.004379,0.0,-0.004379,-0.004379,0.0,-0.004379,-0.004379,0.0
2.0,0.074674,0.07005,1.0,0.032574,0.038201,0.059718,0.098394,0.081552,0.072064,0.04778,...,0.046556,0.111993,0.046556,0.0,-0.004105,-0.004105,0.0,-0.004105,0.046556,0.0
3.0,0.080071,0.114,0.032574,1.0,0.092033,0.064611,0.106611,0.085768,0.069912,0.058103,...,-0.004253,0.049365,-0.004253,0.0,-0.004253,-0.004253,0.0,-0.004253,-0.004253,0.0
4.0,0.051679,0.090716,0.038201,0.092033,1.0,0.138002,0.13977,0.085871,0.085568,0.144743,...,0.060284,0.08092,0.060284,0.0,-0.004367,-0.004367,0.0,-0.004367,0.060284,0.0


In [8]:
def predict_ratings(user_id, clean_data, user_item_matrix, cosine_similarities_df, k=10, number_of_products=10):
    # Bước 1: Lấy index của người dùng từ clean_data
    user_index = clean_data[clean_data['ID'] == user_id].index[0]
    
    # Bước 2: Lấy k người dùng gần nhất từ cosine similarity
    similar_users = cosine_similarities_df.iloc[user_index].sort_values(ascending=False).iloc[1:k+1].index.tolist()
    similarity_scores = cosine_similarities_df.iloc[user_index].sort_values(ascending=False).iloc[1:k+1].values
    
    # Bước 3: Lấy các sản phẩm mà user chưa đánh giá
    unrated_products = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] == 0.0].index
    
    predicted_ratings = {}
    
    # Bước 4: Dự đoán rating cho từng sản phẩm chưa đánh giá
    for product in unrated_products:
        numerator = 0
        denominator = 0
        for similar_user, similarity_score in zip(similar_users, similarity_scores):
            # Lấy rating của người dùng tương tự cho sản phẩm hiện tại
            similar_user_rating = user_item_matrix.loc[similar_user, product]
            if similar_user_rating > 0:  # Chỉ xem xét sản phẩm mà người dùng tương tự đã đánh giá
                numerator += similarity_score * similar_user_rating
                denominator += abs(similarity_score)
        
        # Nếu denominator != 0, tính rating dự đoán
        if denominator != 0:
            predicted_ratings[product] = numerator / denominator
        else:
            predicted_ratings[product] = 0  # Nếu không có rating từ người dùng tương tự, gán 0
    
    # Bước 5: Sắp xếp các sản phẩm theo predicted rating giảm dần và lấy số lượng sản phẩm theo number_of_products
    sorted_predicted_ratings = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)
    
    # Trả về số lượng sản phẩm được yêu cầu (number_of_products)
    top_recommendations = sorted_predicted_ratings[:number_of_products]
    
    # Trả về kết quả (prod_id, predicted_rating)
    return top_recommendations

# Ví dụ sử dụng hàm cho một user_id cụ thể và số lượng sản phẩm gợi ý
user_id = 95  # Thay thế bằng user_id cần dự đoán
number_of_products = 10  # Số lượng sản phẩm cần gợi ý
top_recommendations = predict_ratings(user_id, clean_data, user_item_matrix, cosine_similarities_df, k=10, number_of_products=number_of_products)

# Hiển thị kết quả
top_recommendations


[(37.0, np.float64(5.0)),
 (98.0, np.float64(5.0)),
 (655.0, np.float64(5.0)),
 (4759.0, np.float64(5.0)),
 (6468.0, np.float64(5.0)),
 (60869.0, np.float64(5.0)),
 (3532620.0, np.float64(5.0)),
 (2135948483.0, np.float64(5.0)),
 (9372895838461112.0, np.float64(5.0)),
 (15.0, np.float64(4.8))]

In [9]:
def calculate_ratio(top_recommendations, lowerbound_rating=3.5):
    # Bước 1: Đếm số lượng sản phẩm có rating > lowerbound_rating
    count_above_lowerbound = sum(1 for _, rating in top_recommendations if rating > lowerbound_rating)
    
    # Bước 2: Tính tỷ lệ
    ratio = count_above_lowerbound / len(top_recommendations) if len(top_recommendations) > 0 else 0
    
    return ratio

In [10]:
ratio = calculate_ratio(top_recommendations)

# Hiển thị kết quả
print(f"Ratio of products useful: {ratio:.2f}")

Ratio of products useful: 1.00


In [11]:

def evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=10, lowerbound_rating=3, top_n =10 ):
    ratios = []  # Danh sách lưu các tỷ lệ
    
    # Vòng lặp qua các user_id trong clean_data
    for idx in range(len(clean_data['ID'].unique())-1): 
        user_id = clean_data.iloc[idx]['ID']  # Lấy user_id từ clean_data
        
        # Tính top recommendations cho user_id
        top_recommendations = predict_ratings(user_id, clean_data, user_item_matrix, cosine_similarities_df, k=k , number_of_products=top_n)
        
        # Tính tỷ lệ cho top recommendations
        ratio = calculate_ratio(top_recommendations, lowerbound_rating)
        
        # Thêm tỷ lệ vào danh sách
        ratios.append(ratio)
    
    
    # Tạo một DataFrame từ mảng ratios để sử dụng describe()
    ratios_df = pd.Series(ratios)
    
    # Sử dụng describe() để có thông tin thống kê mô tả
    print(ratios_df.describe())
    
    return ratios_df




In [12]:
ratios_df = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=10, lowerbound_rating=3 , top_n =10)

count    1711.000000
mean        0.163296
std         0.279175
min         0.000000
25%         0.000000
50%         0.000000
75%         0.200000
max         1.000000
dtype: float64


In [12]:
ratios_df2 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=3, lowerbound_rating=3 , top_n =10)

count    1711.000000
mean        0.089831
std         0.186884
min         0.000000
25%         0.000000
50%         0.000000
75%         0.100000
max         1.000000
dtype: float64


In [13]:
ratios_df3 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=5, lowerbound_rating=3 , top_n =10)

count    1711.000000
mean        0.122501
std         0.241515
min         0.000000
25%         0.000000
50%         0.000000
75%         0.100000
max         1.000000
dtype: float64


In [14]:
ratios_df5 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=3, lowerbound_rating=3 , top_n =5)

count    1711.000000
mean        0.147516
std         0.237096
min         0.000000
25%         0.000000
50%         0.000000
75%         0.200000
max         1.000000
dtype: float64


In [15]:
ratios_df6 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=5, lowerbound_rating=3 , top_n =5)

count    1711.000000
mean        0.185389
std         0.290105
min         0.000000
25%         0.000000
50%         0.000000
75%         0.200000
max         1.000000
dtype: float64


In [16]:
ratios_df8 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=10, lowerbound_rating=3 , top_n =5)

count    1711.000000
mean        0.247341
std         0.347662
min         0.000000
25%         0.000000
50%         0.000000
75%         0.400000
max         1.000000
dtype: float64


In [17]:
ratios_df9 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=15, lowerbound_rating=3 , top_n =10)

count    1711.000000
mean        0.212098
std         0.347892
min         0.000000
25%         0.000000
50%         0.000000
75%         0.200000
max         1.000000
dtype: float64


In [18]:
ratios_df10 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=20, lowerbound_rating=3 , top_n =10)

count    1711.000000
mean        0.220514
std         0.364066
min         0.000000
25%         0.000000
50%         0.000000
75%         0.200000
max         1.000000
dtype: float64


In [19]:
ratios_df11 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=15, lowerbound_rating=3 , top_n =5)

count    1711.000000
mean        0.272589
std         0.382348
min         0.000000
25%         0.000000
50%         0.000000
75%         0.400000
max         1.000000
dtype: float64


In [13]:
ratios_df12 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=20, lowerbound_rating=3 , top_n =5)

count    1711.000000
mean        0.272589
std         0.382348
min         0.000000
25%         0.000000
50%         0.000000
75%         0.400000
max         1.000000
dtype: float64


In [13]:
ratios_df13 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=3, lowerbound_rating=3 , top_n =20)

count    1711.000000
mean        0.057481
std         0.161940
min         0.000000
25%         0.000000
50%         0.000000
75%         0.050000
max         1.000000
dtype: float64


In [14]:
ratios_df14 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=5, lowerbound_rating=3 , top_n =20)

count    1711.000000
mean        0.086178
std         0.219603
min         0.000000
25%         0.000000
50%         0.000000
75%         0.050000
max         1.000000
dtype: float64


In [15]:
ratios_df15 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=10, lowerbound_rating=3 , top_n =20)

count    1711.000000
mean        0.117446
std         0.256941
min         0.000000
25%         0.000000
50%         0.000000
75%         0.100000
max         1.000000
dtype: float64


In [16]:
ratios_df16 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=15, lowerbound_rating=3 , top_n =20)

count    1711.000000
mean        0.146815
std         0.281350
min         0.000000
25%         0.000000
50%         0.000000
75%         0.100000
max         1.000000
dtype: float64


In [17]:
ratios_df17 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=20, lowerbound_rating=3 , top_n =20)

count    1711.000000
mean        0.178726
std         0.336768
min         0.000000
25%         0.000000
50%         0.000000
75%         0.100000
max         1.000000
dtype: float64


In [18]:
ratios_df18 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=50, lowerbound_rating=3 , top_n =5)

count    1711.000000
mean        0.532320
std         0.484691
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
dtype: float64


In [19]:
ratios_df19 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=50, lowerbound_rating=3 , top_n =10)

count    1711.000000
mean        0.512215
std         0.485458
min         0.000000
25%         0.000000
50%         0.500000
75%         1.000000
max         1.000000
dtype: float64


In [20]:
ratios_df20 = evaluate_all_users(clean_data, user_item_matrix, cosine_similarities_df, k=50, lowerbound_rating=3 , top_n =20)

count    1711.000000
mean        0.488486
std         0.481720
min         0.000000
25%         0.000000
50%         0.250000
75%         1.000000
max         1.000000
dtype: float64


In [22]:
def collaborative_filtering_recommendations(target_user_id, top_n=10 , k = 10 ):
    # Load the clean dataset
    df = pd.read_csv('data/clean_data.csv')
    clean_data = df[['ID', 'ProdID', 'Rating']]

    # Create the user-item matrix
    user_item_matrix = clean_data.pivot_table(index='ID', columns='ProdID', values='Rating', fill_value=0)
    user_means = user_item_matrix.mean(axis=1)

    # Normalize ratings by subtracting user means
    normalized_user_item_matrix = user_item_matrix.subtract(user_means, axis=0)
    user_item_matrix_np = normalized_user_item_matrix.values

    # Compute cosine similarity between users
    cosine_similarities = cosine_similarity(user_item_matrix_np)

    # Convert cosine similarities into a DataFrame
    cosine_similarities_df = pd.DataFrame(cosine_similarities, index=user_item_matrix.index,
                                          columns=user_item_matrix.index)

    # Predict ratings using the predict_ratings function
    top_recommendations = predict_ratings(
        user_id=target_user_id,
        clean_data=clean_data,
        user_item_matrix=user_item_matrix,
        cosine_similarities_df=cosine_similarities_df,
        k=k,
        number_of_products=top_n
    )

    # Extract product IDs from the recommendations
    recommended_product_ids = [prod_id for prod_id, _ in top_recommendations]

    # Extract details of the recommended products from the original dataset
    recommended_item_indices = df[df['ProdID'].isin(recommended_product_ids)].index
    recommended_items_details = df.iloc[recommended_item_indices][
        ['Name', 'ImageURL', 'Brand', 'Rating', 'ReviewCount', 'Description', 'Price']
    ]

    # Return the detailed recommended items
    return recommended_items_details

In [23]:
recommend = collaborative_filtering_recommendations(1705736792)
recommend

Unnamed: 0,Name,ImageURL,Brand,Rating,ReviewCount,Description,Price
0,"OPI Infinite Shine, Nail Lacquer Nail Polish, ...",https://i5.walmartimages.com/asr/0e1f4c51-c1a4...,OPI,0.0,0.0,,8.95
7,BMC Bright and Loud Cream Gel Lacquer Polish S...,https://i5.walmartimages.com/asr/93f8bbf3-2dfd...,BMC,0.0,0.0,"Lights, Sick Beats, and Epic Gel Polish! We pr...",4.69
15,Clairol Natural Instincts Demi-Permanent Hair ...,https://i5.walmartimages.com/asr/00a6e54a-e431...,Clairol,3.7,2935.0,Discover your natural way to shine! Made with ...,6.99
20,LOreal Paris Excellence Creme Permanent Triple...,https://i5.walmartimages.com/asr/5ffb3626-4031...,L'Oreal Paris,4.0,6494.0,The Hair Color Kit from LOreal Paris makes it ...,7.91
22,"ACT Braces Care Anticavity Mouthwash (18 Oz, C...",https://i5.walmartimages.com/asr/a7fa6e41-316f...,ACT,5.0,32.0,Got braces? Start ACTing to help prevent cavit...,3.98
...,...,...,...,...,...,...,...
4920,Eminence Sun Defense Minerals Water-Resistant ...,https://i5.walmartimages.com/asr/32a62d01-d578...,Eminence Organic Skin Care,0.0,0.0,,46.91
4948,2 Pack - Crest 3D White Brilliance Fluoride An...,https://i5.walmartimages.com/asr/5758fbaf-5515...,Crest,4.7,43.0,Pack of 2 for the UPC: 037000943822 Product de...,19.96
4954,Garnier Nutrisse Nourishing Hair Color Creme (...,https://i5.walmartimages.com/asr/24d7a837-51f8...,Garnier,3.9,7484.0,Garnier Nutrisse Nourishing Hair Color Creme B...,4.44
4956,Creed Love In Black Hair And Body Wash 6.8oz/2...,https://i5.walmartimages.com/asr/3dc99239-66d2...,Creed,0.0,0.0,,32.99
