In [3]:
# Install the third-party packages this notebook relies on into the running kernel's environment.
# NOTE(review): no versions are pinned, so a fresh install may resolve to different releases
# than the ones this notebook was originally run with — consider pinning them.
%pip install tensorflow keras pandas numpy scikit-learn matplotlib

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os

# Set TensorFlow's native log-level variable before the next cell imports tensorflow.
# NOTE(review): '2' presumably hides INFO and WARNING messages — confirm against the TF docs.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '2'})

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [7]:
# Load the MovieLens 1M ratings data
column_names = ["user_id", "item_id", "rating", "timestamp"]
file_path = r"C:\Thesis\Sample Project\ratings.dat"

# Column names are supplied explicitly and fields are separated by the
# multi-character '::' delimiter, hence the Python parsing engine.
# The timestamp column is discarded straight after loading.
data = pd.read_csv(
    file_path,
    names=column_names,
    sep='::',
    engine='python',
).drop("timestamp", axis=1)

In [10]:

# Load the user and movie metadata that accompany the ratings
users = pd.read_csv(r"C:\Thesis\Sample Project\users.dat", sep='::', engine='python',
                    names=["user_id", "gender", "age", "occupation", "zip"], encoding='latin-1')
movies = pd.read_csv(r"C:\Thesis\Sample Project\movies.dat", sep='::', engine='python',
                     names=["item_id", "title", "genres"], encoding='latin-1')

# Encode the categorical columns as integer codes. One fitted encoder is kept
# per column so the codes can be mapped back to the original values later.
# NOTE(review): this import duplicates the one in the imports cell; kept so the
# cell still runs on its own.
from sklearn.preprocessing import LabelEncoder
gender_enc = LabelEncoder()
occupation_enc = LabelEncoder()
age_enc = LabelEncoder()
genre_enc = LabelEncoder()

users["gender"] = gender_enc.fit_transform(users["gender"])
users["occupation"] = occupation_enc.fit_transform(users["occupation"])
users["age"] = age_enc.fit_transform(users["age"])

# A movie can list several '|'-separated genres; for now only the first one is kept.
# The vectorised .str accessor replaces the row-wise apply(lambda ...).
movies["genres"] = movies["genres"].str.split('|', n=1).str[0]
movies["genres"] = genre_enc.fit_transform(movies["genres"])

# Attach user and movie features to every rating row.
# validate="many_to_one" makes pandas raise MergeError if a user_id / item_id is
# duplicated in the lookup table, instead of silently multiplying rating rows.
# NOTE(review): these are inner joins, so ratings whose user or movie is missing
# from the metadata files are dropped — confirm that is intended.
data = data.merge(users, on="user_id", validate="many_to_one")
data = data.merge(movies, on="item_id", validate="many_to_one")


In [11]:

# Re-encode user and movie IDs as integer codes, keeping one fitted encoder per ID column
user_encoder, item_encoder = LabelEncoder(), LabelEncoder()
data["user_id"] = user_encoder.fit_transform(data["user_id"])
data["item_id"] = item_encoder.fit_transform(data["item_id"])

# Number of distinct users / items; used as the embedding table sizes of the model
num_users, num_items = (data[id_col].nunique() for id_col in ("user_id", "item_id"))

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
train, test = train_test_split(data, random_state=42, test_size=0.2)

In [13]:
# Build the DeepFM model (only the DeepFM part is kept)
def build_deepfm(num_users, num_items, embedding_dim=8, hidden_dims=(32, 16), dropout=0.5):
    """Build and compile a two-input DeepFM-style binary classifier.

    A user index and an item index are each looked up in their own embedding
    table. Two scalar scores are then computed and summed:

    * the dot product of the user and item embeddings (FM-style pairwise
      interaction), and
    * the output of an MLP applied to the concatenated embeddings.

    The sum goes through a sigmoid, so the model emits one value in (0, 1)
    per (user, item) pair.

    NOTE(review): no first-order (linear / bias) terms are built here, only
    the pairwise interaction and the deep part.

    Args:
        num_users: Size of the user embedding table. Must be >= 1.
        num_items: Size of the item embedding table. Must be >= 1.
        embedding_dim: Width of both embedding tables. Must be >= 1.
        hidden_dims: Iterable of hidden layer widths for the MLP, in order.
            An empty iterable yields a single linear layer on the
            concatenated embeddings.
        dropout: Dropout rate applied after every hidden Dense layer.

    Returns:
        A ``keras.Model`` with inputs ``user_input`` and ``item_input``,
        compiled with the Adam optimizer, binary cross-entropy loss, and the
        metrics ``accuracy`` and ``auc``.

    Raises:
        ValueError: If ``num_users``, ``num_items`` or ``embedding_dim`` is
            smaller than 1.
    """
    # Fail early with a clear message instead of building unusable embedding tables.
    if num_users < 1 or num_items < 1:
        raise ValueError(
            f"num_users and num_items must be >= 1, got {num_users} and {num_items}"
        )
    if embedding_dim < 1:
        raise ValueError(f"embedding_dim must be >= 1, got {embedding_dim}")

    # Input layers: one integer index per sample for each side of the pair.
    # The dtype is declared explicitly because these are IDs, not real-valued features.
    user_input = layers.Input(shape=(1,), dtype='int32', name='user_input')
    item_input = layers.Input(shape=(1,), dtype='int32', name='item_input')

    # Embedding layers
    user_emb = layers.Embedding(num_users, embedding_dim)(user_input)
    item_emb = layers.Embedding(num_items, embedding_dim)(item_input)

    # Flatten away the length-1 axis introduced by Input(shape=(1,))
    user_emb_flat = layers.Flatten()(user_emb)
    item_emb_flat = layers.Flatten()(item_emb)

    # FM (Factorization Machine) part: dot product of the two embeddings
    interaction = layers.Dot(axes=1)([user_emb_flat, item_emb_flat])

    # Deep component (DNN): Dense + Dropout per entry of hidden_dims, then a scalar head
    dnn = layers.Concatenate()([user_emb_flat, item_emb_flat])
    for dim in hidden_dims:
        dnn = layers.Dense(dim, activation='relu')(dnn)
        dnn = layers.Dropout(dropout)(dnn)
    dnn = layers.Dense(1)(dnn)

    # Combine the two scores and squash to (0, 1)
    output = layers.Add()([interaction, dnn])
    output = layers.Activation('sigmoid')(output)

    # Assemble and compile the model. The AUC metric is named explicitly so its
    # key stays "auc" even when several models are built in the same session.
    model = keras.Model(inputs=[user_input, item_input], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

    return model

In [14]:
# Modification: embedding_dim, hidden_dims, and dropout can be tuned here
embedding_dim = 8
hidden_dims = [64,32]
dropout = 0.5

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout and batch shuffling are repeatable between runs
# (same seed value as the train/test split).
keras.utils.set_random_seed(42)

# Instantiate the DeepFM model
model = build_deepfm(num_users, num_items, embedding_dim, hidden_dims, dropout)

# Convert the data frames into the model's input format: [user ids, item ids]
X_train = [train["user_id"].values, train["item_id"].values]
y_train = (train["rating"].values >= 4).astype(int)  # binary label: 1 when rating >= 4, else 0

X_test = [test["user_id"].values, test["item_id"].values]
y_test = (test["rating"].values >= 4).astype(int)  # same binarisation as the training labels

# Train the model.
# NOTE(review): the test split doubles as validation data here, so the per-epoch
# validation metrics and the final test metrics come from the same rows.
history = model.fit(X_train, y_train, batch_size=256, epochs=10, validation_data=(X_test, y_test), verbose=1)

# Evaluate the model; values come back in compile order: loss, accuracy, AUC
loss, accuracy, auc = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test AUC: {auc:.4f}")



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.5280
Test Accuracy: 0.7347
Test AUC: 0.8039


In [15]:
# Save the trained model to a .keras file in the current working directory
model.save(filepath="deepfm_model.keras")