In [38]:
# 📦 Imports
import os
import random
import numpy as np
import pandas as pd
from scipy import sparse

import lightfm
from lightfm import LightFM, cross_validation
from lightfm.evaluation import precision_at_k, auc_score

from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from config import DB_CONFIG
import psycopg2

# 🔌 Kết nối cơ sở dữ liệu và đọc dữ liệu
conn = psycopg2.connect(**DB_CONFIG)

query = """
    SELECT account_id, post_id, viewed, liked, comments, to_timestamp(created_at) AS created_at
    FROM account_reaction_posts
"""
df = pd.read_sql(query, conn)
conn.close()

# 👀 Hiển thị một vài dòng đầu tiên
df


  df = pd.read_sql(query, conn)


Unnamed: 0,account_id,post_id,viewed,liked,comments,created_at
0,7152,51799,8,True,4,2025-03-28 16:29:55+00:00
1,7152,53324,8,True,2,2025-04-06 15:03:00+00:00
2,7152,51861,6,True,3,2025-04-15 18:23:21+00:00
3,7152,53969,7,True,2,2025-04-06 16:15:46+00:00
4,7152,51322,10,True,2,2025-05-21 16:09:20+00:00
...,...,...,...,...,...,...
41040,7112,51760,2,False,1,2025-03-31 20:07:56+00:00
41041,7112,52231,3,False,1,2025-03-13 05:24:32+00:00
41042,7112,53980,3,False,1,2025-04-13 20:26:21+00:00
41043,7112,54931,1,False,1,2025-03-31 14:44:40+00:00


In [2]:
# df['account_id'].value_counts()

account_id
7263    183
7496    160
7315    156
7403    155
7152    154
       ... 
7576     35
7473     34
7512     30
7266     23
7381     20
Name: count, Length: 450, dtype: int64

In [3]:
# df['post_id'].value_counts()

post_id
54570    22
54709    21
55318    20
54154    20
52461    18
         ..
53837     1
51968     1
51808     1
50967     1
54179     1
Name: count, Length: 4839, dtype: int64

In [39]:
# Giả sử bạn đã có DataFrame df chứa các cột: account_id, post_id, viewed, liked, commented

# Mỗi cột boolean sẽ được chuyển thành int (True = 1, False = 0)
df['viewed'] = df['viewed'].astype(int)

# Tổng số tương tác (rating) = tổng các loại tương tác
df['rating'] = df['viewed'] + df['liked'] * 2 + df['comments'] * 3
# Group theo (user_id, post_id) để lấy tổng rating nếu có trùng lặp
df_rating = df.groupby(['account_id', 'post_id'], as_index=False)['rating'].sum()

# Kết quả là ma trận account_id - post_id - rating
display(df_rating)


Unnamed: 0,account_id,post_id,rating
0,7084,50971,20
1,7084,51018,20
2,7084,51083,14
3,7084,51090,21
4,7084,51251,15
...,...,...,...
41040,7583,55639,10
41041,7583,55666,9
41042,7583,55680,8
41043,7583,55714,9


In [40]:
df_playlist =df_rating.groupby('post_id').filter(lambda x: len(x)>=10)
df_playlist

Unnamed: 0,account_id,post_id,rating
4,7084,51251,15
9,7084,51360,16
10,7084,51438,20
14,7084,51579,22
16,7084,51666,21
...,...,...,...
41031,7583,54693,9
41033,7583,55308,5
41034,7583,55318,5
41042,7583,55680,8


In [41]:
df_playlist = df_rating[df_rating.groupby('account_id').post_id.transform('nunique')>=10]
df_playlist

Unnamed: 0,account_id,post_id,rating
0,7084,50971,20
1,7084,51018,20
2,7084,51083,14
3,7084,51090,21
4,7084,51251,15
...,...,...,...
41040,7583,55639,10
41041,7583,55666,9
41042,7583,55680,8
41043,7583,55714,9


In [42]:
conn = psycopg2.connect(**DB_CONFIG)

query = "SELECT id AS post_id, caption FROM posts"
df_title = pd.read_sql_query(query, conn)
titles = df_title['caption'].tolist()
post_ids = df_title['post_id'].tolist()

# 3. Tạo mappings
titles = df_title['caption'].tolist()
post_ids = df_title['post_id'].tolist()

normal_mapping = dict(zip(titles, post_ids))   # title -> post_id
reverse_mapping = dict(zip(post_ids, titles))  # post_id -> title

# 4. Đóng kết nối
conn.close()



  df_title = pd.read_sql_query(query, conn)


In [43]:
def create_interaction_matrix(df,account_col, item_col, rating_col, norm= False, threshold = None):
    '''
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output - 
        - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
    '''
    interactions = df.groupby([account_col, item_col])[rating_col] \
            .sum().unstack().reset_index(). \
            fillna(0).set_index(account_col)
    if norm:
        interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    return interactions

In [44]:
def create_account_dict(interactions):
    '''
    Function to create a user dictionary based on their index and number in interaction dataset
    Required Input - 
        interactions - dataset create by create_interaction_matrix
    Expected Output -
        account_dict - Dictionary type output containing interaction_index as key and account_id as value
    '''
    account_id = list(interactions.index)
    account_dict = {}
    counter = 0 
    for i in account_id:
        account_dict[i] = counter
        counter += 1
    return account_dict


In [45]:
def create_item_dict(df,id_col,name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input - 
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value
    '''
    item_dict ={}
    for i in range(df.shape[0]):
        item_dict[(df.loc[i,id_col])] = df.loc[i,name_col]
    return item_dict


In [46]:
def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30,n_jobs = 4):
    '''
    Function to run matrix-factorization algorithm
    Required Input -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define Item and account
        - loss = loss function other options are logistic, brp
        - epoch = number of epochs to run 
        - n_jobs = number of cores used for execution 
    Expected Output  -
        Model - Trained model
    '''
    
    #uncommented for train test split
#     x = sparse.csr_matrix(interactions.values)
    model = LightFM(no_components= n_components, loss=loss,k=k)
    model.fit(x,epochs=epoch,num_threads = n_jobs)
    return model


In [47]:
def sample_recommendation_account(model, interactions, account_id, account_dict, 
                               item_dict,threshold = 0,nrec_items = 10, show = True):
    '''
    Function to produce account recommendations
    Required Input - 
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - account_id = account ID for which we need to generate recommendation
        - account_dict = Dictionary type input containing interaction_index as key and account_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
    Expected Output - 
        - Prints list of items the given account has already bought
        - Prints list of N recommended items  which account hopefully will be interested in
    '''
    n_accounts, n_items = interactions.shape
    account_x = account_dict[account_id]
    scores = pd.Series(model.predict(account_x,np.arange(n_items)))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    known_items = list(pd.Series(interactions.loc[account_id,:] \
                                 [interactions.loc[account_id,:] > threshold].index) \
                       .sort_values(ascending=False))
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show == True:
        print("Known Likes:")
        counter = 1
        for i in known_items:
            print(str(counter) + '- ' + i)
            counter+=1

        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter+=1
    return return_score_list

In [48]:
df_rating

Unnamed: 0,account_id,post_id,rating
0,7084,50971,20
1,7084,51018,20
2,7084,51083,14
3,7084,51090,21
4,7084,51251,15
...,...,...,...
41040,7583,55639,10
41041,7583,55666,9
41042,7583,55680,8
41043,7583,55714,9


In [49]:
interactions = create_interaction_matrix(df = df_rating, account_col = "account_id", item_col = 'post_id', rating_col = 'rating', norm= False, threshold = None)
interactions.head()

post_id,50917,50918,50919,50920,50921,50922,50923,50924,50925,50926,...,55746,55747,55748,55749,55750,55751,55752,55753,55754,55755
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
account_dict = create_account_dict(interactions=interactions)
account_dict


{7084: 0,
 7085: 1,
 7087: 2,
 7088: 3,
 7089: 4,
 7090: 5,
 7091: 6,
 7092: 7,
 7093: 8,
 7094: 9,
 7095: 10,
 7096: 11,
 7097: 12,
 7098: 13,
 7099: 14,
 7100: 15,
 7101: 16,
 7102: 17,
 7103: 18,
 7104: 19,
 7105: 20,
 7106: 21,
 7107: 22,
 7108: 23,
 7109: 24,
 7110: 25,
 7111: 26,
 7112: 27,
 7113: 28,
 7114: 29,
 7115: 30,
 7116: 31,
 7117: 32,
 7118: 33,
 7120: 34,
 7121: 35,
 7122: 36,
 7123: 37,
 7124: 38,
 7125: 39,
 7126: 40,
 7127: 41,
 7128: 42,
 7130: 43,
 7131: 44,
 7132: 45,
 7133: 46,
 7134: 47,
 7135: 48,
 7136: 49,
 7138: 50,
 7139: 51,
 7140: 52,
 7141: 53,
 7143: 54,
 7145: 55,
 7148: 56,
 7149: 57,
 7150: 58,
 7152: 59,
 7153: 60,
 7154: 61,
 7156: 62,
 7157: 63,
 7158: 64,
 7159: 65,
 7160: 66,
 7161: 67,
 7162: 68,
 7163: 69,
 7164: 70,
 7166: 71,
 7167: 72,
 7169: 73,
 7170: 74,
 7171: 75,
 7172: 76,
 7174: 77,
 7175: 78,
 7176: 79,
 7177: 80,
 7178: 81,
 7179: 82,
 7182: 83,
 7183: 84,
 7184: 85,
 7185: 86,
 7187: 87,
 7188: 88,
 7189: 89,
 7190: 90,
 7191: 91

In [51]:
item_dict = reverse_mapping

In [52]:
x = sparse.csr_matrix(interactions.values)
train, test = lightfm.cross_validation.random_train_test_split(x, test_percentage=0.2, random_state=None)

In [53]:
%time
model = runMF(interactions = train,
                 n_components = 30,
                 loss = 'warp',
                 k = 15,
                 epoch = 30,
                 n_jobs = 4)

CPU times: user 2 μs, sys: 0 ns, total: 2 μs
Wall time: 5.72 μs


In [54]:
train_auc = auc_score(model, train, num_threads=4).mean()
print('Train AUC: %s' % train_auc)


Train AUC: 0.96242255


In [55]:
test_auc = auc_score(model, test, train_interactions=train, num_threads=4).mean()
print('Test AUC: %s' % test_auc)

Test AUC: 0.96248955


In [56]:
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

In [57]:
print('train Precision %.2f, test Precision %.2f.' % (train_precision, test_precision))

train Precision 0.35, test Precision 0.16.


In [60]:
rec_list = sample_recommendation_account(model = model, 
                                      interactions = interactions, 
                                      account_id = 7084, 
                                      account_dict = account_dict,
                                      item_dict = item_dict, 
                                      threshold = 0,
                                      nrec_items = 10,
                                      show = True)

Known Likes:
1- Rất tại vài đến dưới. Nhưng rất từng tôi. Đang nhiều với của bên có.
2- Các số thay điều tự để. Có sẽ nhưng lớn người rất.
3- Đến về điều gần giống từ vì. Mà một mỗi. Không có cũng dưới tự. Này cho từng tự.
4- Nơi nơi giữa rất chưa. Vẫn chưa cũng.
5- Giữa vậy về có để giữa vài. Các vài dưới sẽ để hơn.
6- Không có từ của khiến. Của đến hơn như như tại hơn. Như trong này để. Cũng để đi. Theo này của nơi về.
7- Thay tại cái đi tại. Để dưới giữa vậy. Với được nếu như vì như về. Chưa đi thì mà đến về. Điều dưới được thì từng nhiều cũng. Cách mà với nếu.
8- Về sau để như. Mỗi để như làm và trong nhưng. Đó mỗi và số hơn người trong với. Vì đó với vẫn được nào. Trong như với mỗi rất này cũng.
9- Có để gần rất lớn. Tại cũng số khiến sau. Về được cho các. Mỗi tôi tại vì chưa. Sau mỗi như giống của như từ giống.
10- Đó hơn giữa cách chưa nếu. Trong từng khi vẫn và giữa sau.
11- Có điều đúng hoặc. Khiến như bên của chỉ vài có. Gần vậy về của số từng. Thay sau cũng người số thì. Đún

In [61]:
import joblib

# Save the trained model
joblib.dump(model, 'lightfm_model.pkl')

# Save the necessary dictionaries and matrices
joblib.dump(account_dict, 'account_dict.pkl')
joblib.dump(item_dict, 'item_dict.pkl')
joblib.dump(interactions, 'interactions.pkl')

['interactions.pkl']