In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import KFold 
from sklearn.preprocessing import OneHotEncoder
import torch 
import torch.nn as nn
# Seed PyTorch's global random number generator so randomly initialised tensors are reproducible.
torch.manual_seed(0)

<torch._C.Generator at 0x7f3ef98a0f70>

# Feature

In [None]:
# Tab-separated ratings file on the mounted Google Drive.
user_item_path = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Movielens/user_movie.dat'

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
def get_feature(path):
  """Build a binary feature matrix from a two-column, tab-separated file.

  Every line of the file is an ``(id, feature_id)`` pair with no header row;
  both columns are treated as 1-based integers.

  Args:
    path: Location of the tab-separated file.

  Returns:
    A float tensor of shape ``(max id, max feature_id)`` created on the global
    ``device``. Entry ``[id - 1, feature_id - 1]`` is 1 for every pair listed
    in the file and 0 everywhere else.
  """
  names = ['id', 'feature_id']
  df = pd.read_csv(path, sep= '\t', names= names)
  n = int(df['id'].max())
  n_feature = int(df['feature_id'].max())
  feature_mat = torch.zeros(size= (n, n_feature), dtype= torch.float, device= device)
  # Fill every listed cell with a single indexed assignment instead of one
  # tensor write per DataFrame row (iterrows is very slow, especially on GPU).
  row_idx = torch.as_tensor(df['id'].to_numpy(dtype= 'int64') - 1, dtype= torch.long, device= device)
  col_idx = torch.as_tensor(df['feature_id'].to_numpy(dtype= 'int64') - 1, dtype= torch.long, device= device)
  feature_mat[row_idx, col_idx] = 1
  return feature_mat

In [None]:
folder = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Movielens/'

# One feature matrix per side-information file, kept in the order the files are listed.
item_feature_mats = [
  get_feature(path= folder + file + '.dat')
  for file in ['movie_genre']
]
user_feature_mats = [
  get_feature(path= folder + file + '.dat')
  for file in ['user_age', 'user_occupation']
]

In [None]:
# Column count of each individual feature block, in list order.
item_feature_len = [block.size(1) for block in item_feature_mats]
user_feature_len = [block.size(1) for block in user_feature_mats]

# Join the per-file blocks side by side: one row per item / one row per user.
item_feature_mat = torch.cat(item_feature_mats, dim= 1)
user_feature_mat = torch.cat(user_feature_mats, dim= 1)
print(f'item feature mat: {item_feature_mat.shape}')
print(f'user feature mat: {user_feature_mat.shape}')

n_item, n_item_feature = item_feature_mat.shape
n_user, n_user_feature = user_feature_mat.shape
# Total width: item rows + item feature columns + user rows + user feature columns.
d = n_item + n_item_feature + n_user + n_user_feature
print(f'd: {d}')

item feature mat: torch.Size([1682, 18])
user feature mat: torch.Size([943, 29])
d: 2672


In [None]:
# Parse the ratings file once, collecting zero-based user/item indices and the ratings.
user_indices = []
item_indices = []
ratings = []
with open(user_item_path, 'r') as f:
  for line in f:  # stream the file instead of materialising it with readlines()
    line = line.strip()
    if not line:
      continue  # tolerate blank lines instead of failing on the 4-way unpack
    user_id, item_id, rating, _ = line.split('\t')
    user_indices.append(int(user_id) - 1)
    item_indices.append(int(item_id) - 1)
    ratings.append(int(rating))

n_sample = len(ratings)
user_idx = torch.tensor(user_indices, dtype= torch.long, device= device)
item_idx = torch.tensor(item_indices, dtype= torch.long, device= device)
sample_idx = torch.arange(n_sample, device= device)

# One-hot encode the user and the item of every rating with a single indexed write each,
# rather than allocating and concatenating small tensors per rating.
user_onehot = torch.zeros(size= (n_sample, n_user), dtype= torch.float, device= device)
user_onehot[sample_idx, user_idx] = 1
item_onehot = torch.zeros(size= (n_sample, n_item), dtype= torch.float, device= device)
item_onehot[sample_idx, item_idx] = 1

# Column layout of X: [user one-hot | item one-hot | user features | item features].
X = torch.cat([user_onehot, item_onehot, user_feature_mat[user_idx], item_feature_mat[item_idx]], dim= 1)
y = torch.tensor(ratings, dtype=torch.float)
# Disabled experiment: one-hot encoding of the ratings.
# encoder = OneHotEncoder(sparse= False)
# y_onehot = encoder.fit_transform(y.view(-1,1))
# y_onehot = torch.tensor(y_onehot, dtype=torch.float) # tensor

# Utils

In [None]:
class Dense_Embedding(nn.Module):
  """Field-wise dense embedding with one weight matrix per field.

  The input columns are consumed left to right in consecutive slices whose
  widths are given by ``fields``; slice ``i`` is multiplied by its own
  ``(fields[i], D)`` weight matrix.
  """

  def __init__(self, fields, D):
    super().__init__()
    self.fields= fields
    weights = []
    for width in fields:
      # Weights are drawn from a standard normal, directly on the global device.
      weights.append(nn.Parameter(torch.randn(size= (width, D), dtype=torch.float, device= device)))
    self.embedding_ws = nn.ParameterList(weights)

  def forward(self, X): #[batch_size, d]
    """Return the stacked per-field embeddings, shape [n, n_fields, D]."""
    embedded = []
    offset = 0
    for width, weight in zip(self.fields, self.embedding_ws):
      field_block = X[:, offset:offset + width]  # [n, width]
      embedded.append(field_block @ weight)      # [n, D]
      offset += width
    return torch.stack(embedded, dim= 1) # [n, n_fields, D]

In [None]:
"""
  Input shape
    - 3D tensor with shape: ``(batch_size, field_size, embedding_size)``.
  Output shape
    - 2D tensor with shape: ``(batch_size, featuremap_num)``, where ``featuremap_num = sum(layer_size)``.
  Arguments
    - **field_size** : Positive integer, number of feature groups.
    - **layer_size** : list of int. Number of feature maps in each layer.
  Notes
    - ``CIN`` below takes only these two arguments: the activation applied to the feature maps is fixed to ReLU,
      there is no ``split_half`` option (all feature maps of every layer are connected to the output),
      and there is no ``seed`` argument.
"""

class CIN(nn.Module):
  """Stack of interaction layers over field embeddings (the CIN part of the model).

  Each layer forms every pairwise element-wise product between the previous
  layer's feature maps and the original input fields, then mixes those
  products into ``size`` new feature maps with a kernel-size-1 ``Conv1d``.

  Args:
    field_size: number of fields in the input (size of dimension 1).
    layer_size: sequence of int, number of feature maps produced by each layer.

  Shapes:
    input:  ``(batch_size, field_size, embedding_size)``
    output: ``(batch_size, sum(layer_size))``
  """

  def __init__(self, field_size, layer_size=(128, 128)):
    super(CIN, self).__init__()
    self.layer_size = layer_size
    self.field_nums = [field_size]
    # Applied to every layer's feature maps; set to None to leave them linear.
    self.activation = nn.ReLU()

    self.conv1ds = nn.ModuleList()
    for size in self.layer_size:
      # A layer sees (previous feature maps) x (input fields) interaction channels.
      self.conv1ds.append(nn.Conv1d(in_channels= self.field_nums[-1] * self.field_nums[0], out_channels= size, kernel_size= 1))
      self.field_nums.append(size)

  def forward(self, inputs):
    """Return the sum-pooled feature maps of all layers, shape [batch, sum(layer_size)]."""
    batch_size = inputs.shape[0]
    dim = inputs.shape[-1]
    x0 = inputs
    hidden = inputs
    layer_outputs = []

    for conv in self.conv1ds:
      # x^(k-1) * x^0 : [batch, h_prev, m, dim]
      x = torch.einsum('bhd,bmd->bhmd', hidden, x0)
      # [batch, h_prev * m, dim]
      x = x.reshape(batch_size, hidden.shape[1] * x0.shape[1], dim)
      # [batch, h_k, dim]
      x = conv(x)
      # A non-callable activation (None or the string 'linear') means "no activation".
      if callable(self.activation):
        x = self.activation(x)

      # Every feature map is both an output and the input of the next layer.
      layer_outputs.append(x)
      hidden = x

    result = torch.cat(layer_outputs, dim=1) # [batch, sum(hi), dim]
    return torch.sum(result, -1)             # sum over the embedding dimension

# Model

In [None]:
class xDeepFM(nn.Module):
  """Sum of a global bias, a linear term, a CIN term and a DNN term, followed by ReLU.

  Args:
    fields: list of int, width of each input field; the input must have ``sum(fields)`` columns.
    D: embedding size used for every field.
    layer_size: number of feature maps in each CIN layer.
    hidden_dims: width of each hidden layer of the DNN; may be empty, in which
      case the DNN is a single linear layer.
    n_class: number of outputs per sample.
    dropout: dropout probability applied after each DNN hidden layer.
  """

  def __init__(self, fields, D= 10, layer_size= (128, 128), hidden_dims= (128, 128), n_class= 1, dropout= 0.3): # fields as list
    super(xDeepFM, self).__init__()
    # dense embedding
    self.Dense_Embedding= Dense_Embedding(fields= fields, D= D)
    # mean part (global bias)
    self.b = nn.Parameter(torch.zeros(size= (1, ), dtype= torch.float))
    # linear part
    self.linear = nn.Linear(sum(fields), n_class, bias= False)
    # CIN part
    self.CIN= CIN(field_size= len(fields), layer_size= layer_size)
    self.fc= nn.Linear(sum(layer_size), n_class, bias= False)
    self.fc_activation= nn.ReLU()
    # DNN part: consumes the flattened field embeddings
    layers = []
    input_dim = D * len(fields)

    for hidden_dim in hidden_dims:
      layers.append(nn.Linear(input_dim, hidden_dim))
      layers.append(nn.BatchNorm1d(hidden_dim))
      layers.append(nn.ReLU())
      layers.append(nn.Dropout(p=dropout))
      input_dim = hidden_dim

    # input_dim is the width of the last hidden layer, or of the flattened
    # embeddings when hidden_dims is empty (hidden_dims[-1] would raise then).
    layers.append(nn.Linear(input_dim, n_class))
    self.dnn = nn.Sequential(*layers)

  def forward(self, X): # X: [batch_size, d]
    """Return the model output for X, shape [batch_size, n_class]."""
    dense_X = self.Dense_Embedding(X) #[batch_size, n_fields, D]
    y_linear = self.linear(X)
    y_cin = self.fc(self.CIN(dense_X))
    # Flatten [batch_size, n_fields, D] -> [batch_size, n_fields * D] for the DNN.
    y_dnn = self.dnn(dense_X.flatten(start_dim= 1))
    y = y_linear + y_cin + y_dnn + self.b
    # NOTE(review): the ReLU clamps the final prediction to >= 0 and passes no
    # gradient for samples whose pre-activation is negative; confirm this is
    # intended for the regression output rather than for the CIN `fc` term only.
    return self.fc_activation(y)

# Training

In [None]:
# Width of each input field, in the column order of X:
# user one-hot, item one-hot, each user feature block, each item feature block.
fields = [n_user, n_item] + user_feature_len + item_feature_len
print(fields)

D = 10                   # embedding size per field
hidden_dims = [128, 128] # used for both the CIN layer sizes and the DNN hidden widths
lr = 1e-2                # Adam learning rate
# n_epoch = 100
p = 0                    # dropout probability

test_kf = KFold(n_splits=5, shuffle= True, random_state=42)
val_kf = KFold(n_splits=8, shuffle= True, random_state=42)

# D is passed explicitly so that editing the value above actually takes effect
# (it was previously defined but never handed to the model).
model = xDeepFM(fields= fields, D= D, layer_size= hidden_dims, hidden_dims= hidden_dims, dropout= p).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr= lr)
criterion = nn.MSELoss()

[943, 1682, 8, 21, 18]


In [None]:
# for epoch in range(n_epoch):
# NOTE(review): the model and optimizer are created once outside this loop, so
# weights carry over between outer folds, and the validation split produced by
# val_kf is never evaluated; confirm this is the intended protocol.
RMSEs = []
for rest_indice, test_indice in test_kf.split(X):
  rest_X, rest_y = X[rest_indice], y[rest_indice]
  test_X, test_y = X[test_indice], y[test_indice]
  for train_indice, val_indice in val_kf.split(rest_X):
    # train_indice / val_indice are positions inside rest_X (not inside X), so
    # the matching labels must be taken from rest_y, not from y.
    train_X = rest_X[train_indice]
    train_y = rest_y[train_indice]

    # training process: one full-batch gradient step on this training split
    model.train()
    optimizer.zero_grad()
    output = model(X= train_X)
    loss = criterion(output.squeeze(dim= 1).cpu(), train_y)
    loss.backward()
    optimizer.step()

    # testing process: RMSE on the held-out outer fold
    model.eval()
    with torch.no_grad():
      output = model(X= test_X)
      out = output.squeeze(dim= 1).cpu()
      # sqrt(MSE) instead of the `squared=False` keyword, which newer
      # scikit-learn releases no longer accept; float() keeps a plain Python
      # float so the rounded average prints cleanly.
      rmse = float(np.sqrt(mean_squared_error(test_y.numpy(), out.numpy())))
      RMSEs.append(rmse)

print(f'avg RMSEs: {round(np.mean(RMSEs), 2)}')

avg RMSEs: 3.990000009536743
