In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import KFold 
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier, XGBRegressor
import torch 
import torch.nn as nn



In [2]:
folder = ['Movielens/user_movie.dat', 'Yelp/user_business.dat']
i_data = 0
# user_item_path = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Movielens/user_movie.dat'
user_item_path = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/' + folder[i_data]
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
def get_feature(path):
  names = ['id', 'feature_id']
  df = pd.read_csv(path, sep= '\t', names= names)
  n = int(df['id'].max())
  n_feature = int(df['feature_id'].max())
  feature_mat = torch.zeros(size= (n, n_feature), dtype= torch.float, device= device)
  for i, row in df.iterrows():
    feature_mat[int(row['id'])-1, int(row['feature_id'])-1] = 1 
  return feature_mat

In [4]:
item_feature_mats = []
user_feature_mats = []
folder = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Movielens/'
for file in ['movie_genre']:
  path = folder + file + '.dat'
  item_feature_mat = get_feature(path= path)
  item_feature_mats.append(item_feature_mat)
for file in ['user_age', 'user_occupation']:
  path = folder + file + '.dat'
  user_feature_mat = get_feature(path= path)
  user_feature_mats.append(user_feature_mat)

In [5]:
item_feature_mat = torch.cat(item_feature_mats, dim= 1)
user_feature_mat = torch.cat(user_feature_mats, dim= 1)
print(f'item feature mat: {item_feature_mat.shape}')
print(f'user feature mat: {user_feature_mat.shape}')
n_user = user_feature_mat.shape[0]
n_item = item_feature_mat.shape[0]
d = n_item + item_feature_mat.shape[1] + n_user + user_feature_mat.shape[1]
print(f'd: {d}')

item_feature_len = [mat.shape[1] for mat in item_feature_mats]
user_feature_len = [mat.shape[1] for mat in user_feature_mats]

item feature mat: torch.Size([1682, 18])
user feature mat: torch.Size([943, 29])
d: 2672


In [6]:
rows = []
y= []
with open(user_item_path, 'r') as f:
  for line in f.readlines():
    user_temp = torch.zeros(size= (1, n_user), dtype= torch.float, device= device)
    item_temp = torch.zeros(size= (1, n_item), dtype= torch.float, device= device)
    if i_data == 0:
      user_id, item_id, rating, _= line.strip().split('\t')
    elif i_data == 1:
      user_id, item_id, rating = line.strip().split('\t')
    user_temp[0,int(user_id)-1] = 1
    item_temp[0,int(item_id)-1] = 1
    row = torch.cat([user_temp, item_temp, user_feature_mat[int(user_id)-1].unsqueeze(dim= 0), item_feature_mat[int(item_id)-1].unsqueeze(dim= 0)], dim= 1)
    rows.append(row)
    y.append(int(rating))

X = torch.cat(rows, dim= 0)
y = torch.tensor(y, dtype=torch.float)

# Model

In [7]:
class XGB_NN(nn.Module):
  def __init__(self, fields, XGB_dim= 10, k= 10, hidden_dims= [16, 16], dropout= 0, n_class= 1):
    super(XGB_NN, self).__init__()
    self.fields = fields 
    self.k = k 
    self.hidden_dims = hidden_dims

    """FM"""
    d = sum(fields)
    self.FM_w = nn.Linear(d, 1, bias= False)
    self.embedding_ws = nn.ModuleList()
    for i in fields:
      self.embedding_ws.append(nn.Linear(i, k, bias= False))
    
    """DNN"""
    layers = []
    input_dim = XGB_dim

    for hidden_dim in hidden_dims:
      layers.append(nn.Linear(input_dim, hidden_dim))
      layers.append(nn.BatchNorm1d(hidden_dim))
      layers.append(nn.ReLU())
      layers.append(nn.Dropout(p=dropout))
      input_dim = hidden_dim
    
    layers.append(nn.Linear(hidden_dims[-1], n_class))
    self.dnn = nn.Sequential(*layers)

  def Dense_Embedding(self, X):
    es = []
    start= 0
    for i, field in enumerate(self.fields):
      ei = self.embedding_ws[i](X[:, start:start+field]).unsqueeze(dim= 1) # ei: [n, 1, k]
      # ei = torch.matmul(X[:, start:start+field], self.embedding_ws[i]).unsqueeze(dim= 1) # ei: [n, 1, k]
      start += field
      es.append(ei)

    return torch.cat(es, dim= 1) # [n, n_fields, k]  

  
  def FM(self, X):

    sum_of_square = torch.sum(X, dim= 1)**2 #[n, k]
    square_of_sum = torch.sum(X**2, dim= 1)
    ix = sum_of_square - square_of_sum    
    return 0.5 * torch.sum(ix, dim= 1, keepdim= True)

  def XGB_DNN(self, XGB_embedding):
    X = self.dnn(XGB_embedding)
    return X
  
  def forward(self, X, XGB_embedding):

    dense_X = self.Dense_Embedding(X)
    FM_y = self.FM(dense_X)
    DNN_y = self.XGB_DNN(XGB_embedding)
    y = self.FM_w(X) + FM_y + DNN_y

    # return nn.Sigmoid()(y)
    return nn.ReLU()(y)

In [8]:
def XGB(x_train, y_train, n_estimator, depth):
    # xgb = XGBClassifier(n_estimators= n_estimator, max_depth= depth, n_jobs= -1, objective= 'multi:softmax', num_class= 5, booster= 'gbtree', random_state= 42)
    xgb= XGBRegressor(n_estimators= n_estimator, max_depth= depth, n_jobs= -1, andom_state= 42, objective= "reg:squarederror")
    xgb.fit(x_train, y_train)
    
    """X：{array-like, sparse matrix} of shape (n_samples, n_features)"""
    output = xgb.apply(x_train) # Shape: [n_interaction, n_estimator * n_class]

    """One Hot Encoding"""
    # encoder = OneHotEncoder(sparse=False).fit(output)
    # embedding = encoder.transform(output)
    # print(embedding.shape)
    # print(embedding)
    # return xgb, encoder, torch.tensor(embedding, dtype= torch.float)
    return xgb, torch.tensor(output, dtype= torch.float)

# Train

In [9]:
fields = [n_user, n_item] + user_feature_len + item_feature_len
print(fields)

k = 10
hidden_dims = [32, 32]
lr = 1e-2
p = 0
n_estimator= 10
depth= 5

test_kf = KFold(n_splits=5, shuffle= True, random_state=42)
val_kf = KFold(n_splits=8, shuffle= True, random_state=42)

model = XGB_NN(fields= fields, XGB_dim= n_estimator, k= k, hidden_dims= hidden_dims, dropout= p).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr= lr)
criterion = nn.MSELoss()

[943, 1682, 8, 21, 18]


In [10]:
RMSEs = []
for rest_indice, test_indice in test_kf.split(X):
  rest_X, rest_y = X[rest_indice], y[rest_indice]
  test_X = X[test_indice]

  for train_indice, val_indice in val_kf.split(rest_X):
    train_X, val_X = rest_X[train_indice], rest_X[val_indice]
    train_y = rest_y[train_indice]

    # xgb, encoder, XGB_embedding = XGB(train_X, train_y, n_estimator= n_estimator, depth= depth)
    xgb, XGB_embedding = XGB(train_X, train_y, n_estimator= n_estimator, depth= depth)
    # print(XGB_embedding.shape)
    '''training process'''
    model.train()
    optimizer.zero_grad()
    output = model(X= train_X, XGB_embedding= XGB_embedding)
    loss = criterion(output.squeeze(dim= 1).cpu(), y[train_indice])
    loss.backward()
    optimizer.step()
    out = output.squeeze(dim= 1).detach().cpu()

  '''testing process'''
  model.eval()
  with torch.no_grad():
    # xgb_output= xgb.apply(X= test_X)
    # embedding = encoder.transform(xgb_output)
    embedding = xgb.apply(test_X)
    output = model(X= test_X, XGB_embedding= torch.tensor(embedding, dtype= torch.float))
    out = output.squeeze(dim= 1).detach().cpu()
    rmse = mean_squared_error(out, y[test_indice], squared= False)
    RMSEs.append(rmse)  

print(f'avg RMSEs: {round(np.mean(RMSEs), 2)}')

avg RMSEs: 1.5499999523162842
