In [37]:
# !pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0  --extra-index-url https://download.pytorch.org/whl/cu126
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm.rich import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Load Dataset

In [39]:
movies_df = pd.read_csv("./movies.dat", 
                        header = None, 
                        sep='::', 
                        names=['item', 'title', 'genre'],
                        engine='python',
                        encoding='latin-1')
movies_df.head()

Unnamed: 0,item,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [40]:
movies_df['year'] = movies_df['title'].apply(lambda x: int(x[-5: -1]))
movies_df['name'] = movies_df['title'].apply(lambda x: x[: -7])
movies_df.drop(columns=['title'], inplace=True)
movies_df.head()

Unnamed: 0,item,genre,year,name
0,1,Animation|Children's|Comedy,1995,Toy Story
1,2,Adventure|Children's|Fantasy,1995,Jumanji
2,3,Comedy|Romance,1995,Grumpier Old Men
3,4,Comedy|Drama,1995,Waiting to Exhale
4,5,Comedy,1995,Father of the Bride Part II


In [41]:
genres_raw  = movies_df['genre'].str.split('|', expand=True)
list_genres = []
for i in range(6):
    list_genres += genres_raw[i].tolist()
set_genres = set(list_genres)
set_genres.remove(None)
set_genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [42]:
for genre in set_genres:
    movies_df[genre] = movies_df['genre'].apply(lambda x: 1 if genre in x else 0)
movies_df.drop(columns=['genre'], inplace=True)
movies_df.head()

Unnamed: 0,item,year,name,Adventure,Comedy,Fantasy,Sci-Fi,Drama,Children's,Crime,...,Mystery,Western,Film-Noir,Musical,Romance,Action,War,Thriller,Documentary,Animation
0,1,1995,Toy Story,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1995,Jumanji,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1995,Grumpier Old Men,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,1995,Waiting to Exhale,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1995,Father of the Bride Part II,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
ratings_df = pd.read_csv("./ratings.dat", 
                        header = None, 
                        sep='::', 
                        names=['user', 'item', 'label', 'timestamp'],
                        engine='python',
                        encoding='latin-1')
ratings_df.head()

Unnamed: 0,user,item,label,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [44]:
users_df = pd.read_csv("./users.dat", 
                       sep='::', 
                       header=None, 
                       names=['user', 'gender', 'age', 'occupation', 'zipcode'], 
                       engine='python', 
                       encoding='latin-1')
users_df.head()

Unnamed: 0,user,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [45]:
colab_df = pd.merge(ratings_df, users_df, on='user', how='inner')
colab_df

Unnamed: 0,user,item,label,timestamp,gender,age,occupation,zipcode
0,1,1193,5,978300760,F,1,10,48067
1,1,661,3,978302109,F,1,10,48067
2,1,914,3,978301968,F,1,10,48067
3,1,3408,4,978300275,F,1,10,48067
4,1,2355,5,978824291,F,1,10,48067
...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,M,25,6,11106
1000205,6040,1094,5,956704887,M,25,6,11106
1000206,6040,562,5,956704746,M,25,6,11106
1000207,6040,1096,4,956715648,M,25,6,11106


In [46]:
merged_df = pd.merge(colab_df, movies_df, on='item', how='inner')
merged_df 

Unnamed: 0,user,item,label,timestamp,gender,age,occupation,zipcode,year,name,...,Mystery,Western,Film-Noir,Musical,Romance,Action,War,Thriller,Documentary,Animation
0,1,1193,5,978300760,F,1,10,48067,1975,One Flew Over the Cuckoo's Nest,...,0,0,0,0,0,0,0,0,0,0
1,2,1193,5,978298413,M,56,16,70072,1975,One Flew Over the Cuckoo's Nest,...,0,0,0,0,0,0,0,0,0,0
2,12,1193,4,978220179,M,25,12,32793,1975,One Flew Over the Cuckoo's Nest,...,0,0,0,0,0,0,0,0,0,0
3,15,1193,4,978199279,M,25,7,22903,1975,One Flew Over the Cuckoo's Nest,...,0,0,0,0,0,0,0,0,0,0
4,17,1193,5,978158471,M,50,1,95350,1975,One Flew Over the Cuckoo's Nest,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,1998,Modulations,...,0,0,0,0,0,0,0,0,1,0
1000205,5675,2703,3,976029116,M,35,14,30030,1998,Broken Vessels,...,0,0,0,0,0,0,0,0,0,0
1000206,5780,2845,1,958153068,M,18,17,92886,1999,White Boys,...,0,0,0,0,0,0,0,0,0,0
1000207,5851,3607,5,957756608,F,18,20,55410,1973,One Little Indian,...,0,1,0,0,0,0,0,0,0,0


In [47]:
merged_df.columns

Index(['user', 'item', 'label', 'timestamp', 'gender', 'age', 'occupation',
       'zipcode', 'year', 'name', 'Adventure', 'Comedy', 'Fantasy', 'Sci-Fi',
       'Drama', 'Children's', 'Crime', 'Horror', 'Mystery', 'Western',
       'Film-Noir', 'Musical', 'Romance', 'Action', 'War', 'Thriller',
       'Documentary', 'Animation'],
      dtype='object')

In [48]:
list(set_genres)

['Adventure',
 'Comedy',
 'Fantasy',
 'Sci-Fi',
 'Drama',
 "Children's",
 'Crime',
 'Horror',
 'Mystery',
 'Western',
 'Film-Noir',
 'Musical',
 'Romance',
 'Action',
 'War',
 'Thriller',
 'Documentary',
 'Animation']

In [49]:
merged_df['label'].max()

5

# Model Definition

# Training

In [52]:
df = merged_df
wide_features = [] + list(set_genres)
deep_features = ['user', 'item', 'gender', 'age', 'occupation',
       'zipcode', 'year', 'name']

print(wide_features, deep_features)

['Adventure', 'Comedy', 'Fantasy', 'Sci-Fi', 'Drama', "Children's", 'Crime', 'Horror', 'Mystery', 'Western', 'Film-Noir', 'Musical', 'Romance', 'Action', 'War', 'Thriller', 'Documentary', 'Animation'] ['user', 'item', 'gender', 'age', 'occupation', 'zipcode', 'year', 'name']


In [53]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device
df = merged_df
wide_features = [] 
deep_features = ['user', 'item', 'gender', 'age', 'occupation', 'zipcode', 'year', 'name'] + list(set_genres)

# ── 2) Split out X_wide, X_deep and target y ──
X_wide = df[wide_features].copy()
X_deep = df[deep_features].copy()
y      = df["label"].copy()

# ── 3) Merge into one DataFrame to oversample ──
data = pd.concat([X_wide, X_deep, y.rename("target")], axis=1)

# # Oversample minority classes to match the largest class
# max_n = data["target"].value_counts().max()
# data_balanced = pd.concat([
#     resample(sub, replace=True, n_samples=max_n, random_state=42)
#     for _, sub in data.groupby("target")
# ], ignore_index=True)

# X_wide = data_balanced[wide_features]
# X_deep = data_balanced[deep_features]
# y      = data_balanced["target"]

# ── 4) Preprocess “deep” features: rare-category grouping & factorization ──
min_freq = 20
for col in X_deep:
    freq    = X_deep[col].value_counts()
    rare_lbl= freq[freq < min_freq].index
    X_deep[col] = X_deep[col].replace(rare_lbl, "RARE")
    X_deep[col], _ = pd.factorize(X_deep[col])

# Calculate each embedding’s vocabulary size
field_dims = [int(X_deep[col].max()) + 1 for col in X_deep]

# ── 5) Encode target and split ──
y_enc, classes = pd.factorize(y)
num_classes = len(classes)

X_w_tr, X_w_te, X_d_tr, X_d_te, y_tr, y_te = train_test_split(
    X_wide.to_numpy(), X_deep.to_numpy(), y_enc,
    test_size=0.2, random_state=42, stratify=y_enc
)

# Clip any out-of-range factorized indices
for i in range(X_d_tr.shape[1]):
    X_d_tr[:,i] = np.clip(X_d_tr[:,i], 0, field_dims[i]-1)
    X_d_te[:,i] = np.clip(X_d_te[:,i], 0, field_dims[i]-1)

# ── 6) Convert to PyTorch tensors & DataLoaders ──
X_w_tr = torch.tensor(X_w_tr, dtype=torch.float32)
X_d_tr = torch.tensor(X_d_tr, dtype=torch.long)
y_tr   = torch.tensor(y_tr, dtype=torch.long)
X_w_te = torch.tensor(X_w_te, dtype=torch.float32)
X_d_te = torch.tensor(X_d_te, dtype=torch.long)
y_te   = torch.tensor(y_te, dtype=torch.long)

batch_size = 128
train_ds = TensorDataset(X_w_tr, X_d_tr, y_tr)
test_ds  = TensorDataset(X_w_te, X_d_te, y_te)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=batch_size)

# ── 7) Define the cDeepFM model ──
class cDeepFM(nn.Module):
    def __init__(
        self,
        field_dims,    # list of vocab sizes for each deep feature
        embed_dim,     # embedding dimension
        wide_dim,      # number of wide inputs
        num_classes,   # target classes
        deep_dims=[128,64],
        compression_dim=32
    ):
        super().__init__()
        # Embeddings for deep features
        self.embeddings = nn.ModuleList([
            nn.Embedding(voc_size, embed_dim)
            for voc_size in field_dims
        ])
        # FM linear part
        self.linear_fm = nn.Linear(len(field_dims)*embed_dim, 1)
        # Deep MLP
        layers = []
        input_dim = len(field_dims)*embed_dim
        for d in deep_dims:
            layers += [nn.Linear(input_dim, d), nn.ReLU()]
            input_dim = d
        self.deep_layers = nn.Sequential(*layers)
        # Compression
        total_dim = wide_dim + deep_dims[-1] + 1
        self.compression = nn.Sequential(
            nn.Linear(total_dim, compression_dim),
            nn.ReLU()
        )
        # Final classifier
        self.final = nn.Linear(compression_dim, num_classes)

    def forward(self, x_wide, x_deep):
        # Embedding lookup + concat
        emb_list = [emb(x_deep[:,i]) for i,emb in enumerate(self.embeddings)]
        x_emb = torch.cat(emb_list, dim=1)
        # FM output
        fm_out = self.linear_fm(x_emb)
        # Deep MLP output
        deep_out = self.deep_layers(x_emb)
        # Combine wide, deep, fm
        x_concat = torch.cat([x_wide, deep_out, fm_out], dim=1)
        x_comp   = self.compression(x_concat)
        return self.final(x_comp)

In [54]:
# class cDeepFM(nn.Module):
#     def __init__(
#         self,
#         field_dims,    # list of vocab sizes for each deep feature
#         embed_dim,     # embedding dimension
#         wide_dim,      # number of wide inputs
#         num_classes,   # target classes
#         deep_dims=[128,64],
#         compression_dim=32
#     ):
#         super().__init__()
#         # Embeddings for deep features
#         self.embeddings = nn.ModuleList([
#             nn.Embedding(voc_size, embed_dim)
#             for voc_size in field_dims
#         ])
#         # FM linear part
#         self.linear_fm = nn.Linear(len(field_dims)*embed_dim, 1)
#         # Deep MLP
#         layers = []
#         input_dim = len(field_dims)*embed_dim
#         for d in deep_dims:
#             layers += [nn.Linear(input_dim, d), nn.ReLU()]
#             input_dim = d
#         self.deep_layers = nn.Sequential(*layers)
#         # Compression
#         total_dim = wide_dim + deep_dims[-1] + 1
#         self.compression = nn.Sequential(
#             nn.Linear(total_dim, compression_dim),
#             nn.ReLU()
#         )
#         # Final classifier
#         self.final = nn.Linear(compression_dim, num_classes)

#     def forward(self, x_wide, x_deep):
#         # Embedding lookup + concat
#         emb_list = [emb(x_deep[:,i]) for i,emb in enumerate(self.embeddings)]
#         x_emb = torch.cat(emb_list, dim=1)
#         # FM output
#         fm_out = self.linear_fm(x_emb)
#         # Deep MLP output
#         deep_out = self.deep_layers(x_emb)
#         # Combine wide, deep, fm
#         x_concat = torch.cat([x_wide, deep_out, fm_out], dim=1)
#         x_comp   = self.compression(x_concat)
#         return self.final(x_comp)

# ── 8) Train & evaluate ──
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = cDeepFM(field_dims, embed_dim=8, wide_dim=len(wide_features), num_classes=num_classes)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
for epoch in range(1, 11):
    model.train()
    total_loss = 0
    for xw, xd, yb in train_loader:
        xw, xd, yb = xw.to(device), xd.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xw, xd)
        loss   = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xw.size(0)
    print(f"Epoch {epoch} — Train Loss: {total_loss/len(train_ds):.4f}")

# Test accuracy
model.eval()
correct = 0
with torch.no_grad():
    for xw, xd, yb in test_loader:
        xw, xd, yb = xw.to(device), xd.to(device), yb.to(device)
        preds = model(xw, xd).argmax(dim=1)
        correct += (preds == yb).sum().item()
print(f"Test Accuracy: {correct/len(test_ds):.4f}")

Epoch 1 — Train Loss: 1.3316
Epoch 2 — Train Loss: 1.2457
Epoch 3 — Train Loss: 1.2217
Epoch 4 — Train Loss: 1.2088
Epoch 5 — Train Loss: 1.1995
Epoch 6 — Train Loss: 1.1918
Epoch 7 — Train Loss: 1.1849
Epoch 8 — Train Loss: 1.1788
Epoch 9 — Train Loss: 1.1727
Epoch 10 — Train Loss: 1.1672
Test Accuracy: 0.4648


In [55]:
# cDeepFM -> accuracy 0.4663 | ['user', 'item', 'gender', 'age', 'occupation', 'zipcode', 'year', 'name'] + list(set_genres)