In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import scipy.sparse as sp
from torch.utils.data import TensorDataset, DataLoader
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from tqdm import tqdm

# import scrapper

In [2]:
csv_files = [
    "mdl_dramas_1.csv",      # <--- This one has the header
    "mdl_dramas_2.csv",    # <--- No header
    "mdl_dramas_3.csv",    # <--- No header
    "mdl_dramas_4.csv"     # <--- No header
]


df_first = pd.read_csv(csv_files[0])  # Has header
original_cols = df_first.columns      # Save the column names

df_rest = [
    pd.read_csv(f, header=None, names=original_cols)
    for f in csv_files[1:]            # 2nd, 3rd, 4th CSVs have no headers
]


df = pd.concat([df_first] + df_rest, ignore_index=True)

print("Columns in the combined DataFrame:", df.columns)
print("First 5 rows:\n", df.head(5))
print("Last 5 rows:\n", df.tail(5))


# If 'story' is supposed to be numeric, drop rows where story == "story"
if "story" in df.columns:
    # Only do this if there's actually a row that has "story" in 'story'
    if df["story"].dtype == object:
        df = df[df["story"] != "story"]

df.drop_duplicates(subset=["Username", "Title"], keep="first", inplace=True)


rating_cols = ["story", "acting", "music", "rewatch"]
df[rating_cols] = df[rating_cols].apply(pd.to_numeric, errors="coerce")
df.fillna(0, inplace=True)  # or any other strategy you prefer


df["Username"] = df["Username"].astype(str)
df["Title"] = df["Title"].astype(str)
user_encoder = LabelEncoder()
df["user_id"] = user_encoder.fit_transform(df["Username"])

title_encoder = LabelEncoder()
df["item_id"] = title_encoder.fit_transform(df["Title"])


onehot = OneHotEncoder(handle_unknown="ignore")
cat_data = onehot.fit_transform(df[["user_id", "item_id"]])

X = cat_data  # sparse matrix of user/item
Y = df[rating_cols].values.astype(np.float32)  # convert to float32

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

print("=== FINAL DATA SHAPES ===")
print(f"Total records in DataFrame: {df.shape[0]}")
print(f"Train set size: {X_train.shape[0]}")
print(f"Test set size:  {X_test.shape[0]}")

print("\nSample rating rows (Y):")
print(y_train[:5])


Columns in the combined DataFrame: Index(['Username', 'Title', 'Rating', 'overall_rating', 'story', 'acting',
       'music', 'rewatch'],
      dtype='object')
First 5 rows:
          Username                      Title  Rating  overall_rating  story  \
0   PrincessKayla  Bring the Soul: The Movie     9.3            10.0   10.0   
1         simi_24  Bring the Soul: The Movie     9.3            10.0   10.0   
2    coffwalk Yun  Bring the Soul: The Movie     9.3            10.0   10.0   
3  RainyDayCuppaT  Bring the Soul: The Movie     9.3            10.0   10.0   
4             Mei   Nana Tour with Seventeen     9.3            10.0   10.0   

   acting  music  rewatch  
0    10.0   10.0     10.0  
1    10.0    9.5     10.0  
2    10.0   10.0     10.0  
3    10.0   10.0     10.0  
4    10.0   10.0     10.0  
Last 5 rows:
               Username                      Title  Rating  overall_rating  \
106179  ColourMePurple  Shadow Detective Season 2     8.0             7.0   
106180      mi

In [3]:
X_train_dense = X_train.astype(np.float32)
X_test_dense  = X_test.astype(np.float32)

In [4]:
X_train_t = torch.sparse_coo_tensor(
    torch.tensor(X_train.nonzero(), dtype=torch.int64),  # Indices
    torch.tensor(X_train.data, dtype=torch.float32),  # Values
    torch.Size(X_train.shape)  # Shape
)

X_test_t = torch.sparse_coo_tensor(
    torch.tensor(X_test.nonzero(), dtype=torch.int64),
    torch.tensor(X_test.data, dtype=torch.float32),
    torch.Size(X_test.shape)
)

# Convert y_train/y_test to PyTorch tensors (no need for sparse)
y_train_t = torch.tensor(y_train, dtype=torch.float32)
y_test_t  = torch.tensor(y_test, dtype=torch.float32)

# Dataset Wrapper for Sparse Data
class SparseDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        return self.X[idx].to_dense(), self.y[idx]  # Convert only one row to dense


  torch.tensor(X_train.nonzero(), dtype=torch.int64),  # Indices


In [5]:
class FactorizationMachineMultiTask(nn.Module):
    def __init__(self, n_features, k=20, n_outputs=4):
        super().__init__()
        self.linear = nn.Linear(n_features, n_outputs, bias=True)
        self.V = nn.Parameter(torch.randn(n_features, k, n_outputs) * 0.01)  # [n_features, k, n_outputs]
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        linear_part = self.linear(x)  # Shape: [batch_size, n_outputs]

        # 2) Compute separate interaction terms for each subrating
        xV = torch.einsum('bi,ikn->bkn', x, self.V)  # Shape: [batch_size, k, n_outputs]
        xV_square = xV * xV  # [batch_size, k, n_outputs]

        x_square = x * x  # [batch_size, n_features]
        V_square = self.V * self.V  # [n_features, k, n_outputs]
        x_square_V_square = torch.einsum('bi,ikn->bkn', x_square, V_square)  # [batch_size, k, n_outputs]

        # Compute per-subrating interaction scores
        interaction = 0.5 * torch.sum(xV_square - x_square_V_square, dim=1)  # [batch_size, n_outputs]
        out = linear_part + interaction  # [batch_size, n_outputs]
        out = 1 + 9 * self.sigmoid(out)
        return out


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Convert our X (sparse) to dense if necessary for the FM (or see note below)
# X_train_dense = X_train.toarray()  # shape [num_train, n_features]
# X_test_dense  = X_test.toarray()   # shape [num_test,  n_features]

# y_train_t = torch.tensor(y_train, dtype=torch.float32)  # [num_train, 4]
# y_test_t  = torch.tensor(y_test,  dtype=torch.float32)  # [num_test,  4]

# X_train_t = torch.tensor(X_train_dense, dtype=torch.float32)  # [num_train, n_features]
# X_test_t  = torch.tensor(X_test_dense,  dtype=torch.float32)  # [num_test,  n_features]

# train_dataset = TensorDataset(X_train_t, y_train_t)
# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

train_dataset = SparseDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)

# Instantiate the multi-task FM
n_features = X_train_dense.shape[1]  # (# users + # items) if using one-hot
fm_model = FactorizationMachineMultiTask(n_features=n_features, k=50, n_outputs=4)

criterion = nn.MSELoss()
optimizer = optim.Adam(fm_model.parameters(), lr=0.001)

num_epochs = 10
for epoch in tqdm(range(num_epochs), desc="Training Progress", dynamic_ncols=True):
    fm_model.train()
    epoch_loss = 0.0

    batch_iterator = tqdm(train_loader, total=len(train_loader),
                          desc=f"Epoch {epoch+1}/{num_epochs}", leave=False, dynamic_ncols=True)

    for batch_X, batch_y in batch_iterator:
        preds = fm_model(batch_X)   # shape: [batch_size, 4]
        loss = criterion(preds, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item() * batch_X.size(0)
        batch_iterator.set_postfix(loss=loss.item())

    epoch_loss /= len(train_loader.dataset)

    if (epoch + 1) % 2 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

Epoch 2/10, Loss: 7.9400



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

Epoch 4/10, Loss: 3.5201



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

Epoch 6/10, Loss: 1.8958



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

KeyboardInterrupt: 

In [None]:
torch.save(fm_model.state_dict(), "fm_model.pth")

In [None]:
fm_model.eval()
test_preds_list = []
test_targets_list = []

test_loader = DataLoader(SparseDataset(X_test_t, y_test_t), batch_size=512, shuffle=False)

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        preds = fm_model(batch_X)  # Get predictions
        test_preds_list.append(preds)
        test_targets_list.append(batch_y)

# Concatenate all batches
test_preds = torch.cat(test_preds_list, dim=0)
test_targets = torch.cat(test_targets_list, dim=0)

# Compute loss
test_loss = criterion(test_preds, test_targets)

print(f"Test MSE across 4 subratings: {test_loss.item():.4f}")


In [None]:
import pandas as pd

def predict_subratings(user_name, drama_title, fm_model, user_encoder, title_encoder, onehot):
    if user_name not in user_encoder.classes_:
        print(f"Unknown user: {user_name}")
        return None
    if drama_title not in title_encoder.classes_:
        print(f"Unknown drama: {drama_title}")
        return None

    # Convert to encoded IDs
    user_id = user_encoder.transform([user_name])[0]
    item_id = title_encoder.transform([drama_title])[0]

    # Convert to a DataFrame to match fitted OneHotEncoder feature names
    row = pd.DataFrame([[user_id, item_id]], columns=["user_id", "item_id"])

    X_cat = onehot.transform(row)  # This now retains feature names
    X_dense = torch.tensor(X_cat.toarray(), dtype=torch.float32)

    # Predict with FM model
    fm_model.eval()
    with torch.no_grad():
        preds = fm_model(X_dense)

    return preds.view(-1).tolist()

predict_subratings("Tianqin", "Fall in Love", fm_model, user_encoder, title_encoder, onehot)

In [None]:
import torch
import numpy as np

def get_top_recommendations(user_name, fm_model, user_encoder, title_encoder, onehot, train_df, top_n=10):
    user_watched = train_df.groupby("Username")["Title"].apply(set).to_dict()

    if user_name not in user_encoder.classes_:
        print(f"Unknown user: {user_name}")
        return None

    recommendations = []

    watched_shows = user_watched.get(user_name, set())

    i = 0
    print(f"Predicting ratings for {user_name}...")
    print(f"Total dramas to predict: {len(title_encoder.classes_)}")

    for drama_title in title_encoder.classes_:
        # **Skip dramas the user has already watched**
        if drama_title in watched_shows:
            continue

        predicted = predict_subratings(user_name, drama_title, fm_model, user_encoder, title_encoder, onehot)
        if predicted is not None:
            avg_rating = np.mean(predicted)  # Aggregate subratings
            recommendations.append((drama_title, avg_rating))
            i += 1

        if i % 100 == 0 and recommendations:
            best_so_far = max(recommendations, key=lambda x: x[1])
            print(f"Processed {i} dramas.")
            print(f"Best so far: {best_so_far[0]} - Predicted Rating: {best_so_far[1]:.2f}")

    recommendations.sort(key=lambda x: x[1], reverse=True)

    # Print top N recommendations
    print(f"\nTop {top_n} recommended dramas for {user_name}:")
    for i, (drama, rating) in enumerate(recommendations[:top_n]):
        print(f"{i+1}. {drama} - Predicted Rating: {rating:.2f}")

    return recommendations[:top_n]


user_name = "Rima-chan"
top_recommendations = get_top_recommendations(user_name, fm_model, user_encoder, title_encoder, onehot, df)


PCA analysis: how users are grouped

In [None]:
# Extract learned feature interaction matrix V
V_learned = fm_model.V.detach().cpu().numpy()  # Shape: [n_features, k, n_outputs]

# Choose a specific subrating to analyze (0 = "story", 1 = "acting", etc.)
output_dim = 0
V_selected = V_learned[:, :, output_dim]  # Shape: [n_features, k]

# Define user and item embeddings
num_users = len(user_encoder.classes_)
num_items = len(title_encoder.classes_)

V_users = V_selected[:num_users, :]  # Extract user embeddings
V_items = V_selected[num_users:num_users + num_items, :]  # Extract item embeddings


In [None]:
# Reduce user embeddings to 2D for visualization
pca_users = PCA(n_components=2)
V_users_pca = pca_users.fit_transform(V_users)  # Shape: [num_users, 2]

# Scatter plot of users in PCA space
plt.figure(figsize=(20, 25))
plt.scatter(V_users_pca[:, 0], V_users_pca[:, 1], s=10, alpha=0.7, edgecolors='k')

plt.title("PCA Projection of User Embeddings")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.grid()
plt.show()


PCA projection: item embeddings
Groups dramas based on how users rated them

In [None]:
# Reduce item embeddings to 2D
pca_items = PCA(n_components=2)
V_items_pca = pca_items.fit_transform(V_items)

# Scatter plot of items in PCA space
plt.figure(figsize=(20, 25))
plt.scatter(V_items_pca[:, 0], V_items_pca[:, 1], s = 10, alpha=0.7, edgecolors='k')

plt.title("PCA Projection of Item Embeddings")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.grid()
plt.show()


In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Normalize and handle NaN values
V_selected = np.nan_to_num(V_selected)
V_selected = StandardScaler().fit_transform(V_selected)

# Plot heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(V_selected, cmap="coolwarm", annot=False, linewidths=0.5, robust=True)
plt.title("Heatmap of Factorization Machine Embeddings (V) for 'Story' Subrating")
plt.xlabel("Latent Factors (k)")
plt.ylabel("Feature Index (Users + Items)")
plt.show()
