In [1]:
import numpy as np
import torch 
import torch.nn as nn
import pandas as pd
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import KFold 

In [2]:
user_item_path = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Movielens/user_movie.dat'

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Preprocessing

## Feature Vector
[計算](https://i.imgur.com/YEEtiSU.png)
[模型](https://i.imgur.com/C1YiB6Y.png)

* 注意filtering

In [4]:
# with open('/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Movielens/user_movie.dat', 'r') as f:
#   for line in f.readlines():
#     print(line)

In [5]:
def get_feature(path):
  names = ['id', 'feature_id']
  df = pd.read_csv(path, sep= '\t', names= names)
  n = int(df['id'].max())
  # print(f'n: {n}')
  n_feature = int(df['feature_id'].max())
  # print(n_feature)
  feature_mat = torch.zeros(size= (n, n_feature), dtype= float, device= device)
  for i, row in df.iterrows():
    feature_mat[int(row['id'])-1, int(row['feature_id'])-1] = 1 
  return feature_mat

In [6]:
item_feature_mats = []
user_feature_mats = []
folder = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Movielens/'
for file in ['movie_genre', 'movie_movie(knn)']:
  path = folder + file + '.dat'
  item_feature_mat = get_feature(path= path)
  item_feature_mats.append(item_feature_mat)
for file in ['user_age', 'user_occupation']:
  path = folder + file + '.dat'
  user_feature_mat = get_feature(path= path)
  user_feature_mats.append(user_feature_mat)

In [7]:
item_feature_mat = torch.cat(item_feature_mats, dim= 1)
user_feature_mat = torch.cat(user_feature_mats, dim= 1)
print(f'item feature mat: {item_feature_mat.shape}')
print(f'user feature mat: {user_feature_mat.shape}')
n_user = user_feature_mat.shape[0]
n_item = item_feature_mat.shape[0]
d = n_item + item_feature_mat.shape[1] + n_user + user_feature_mat.shape[1]
print(f'd: {d}')

item feature mat: torch.Size([1682, 19])
user feature mat: torch.Size([943, 29])
d: 2673


## 建立X

In [8]:
rows = []
y= []
with open(user_item_path, 'r') as f:
  for line in f.readlines():
    user_temp = torch.zeros(size= (1, n_user), dtype= float, device= device)
    item_temp = torch.zeros(size= (1, n_item), dtype= float, device= device)
    user_id, item_id, rating, _= line.strip().split('\t')
    user_temp[0,int(user_id)-1] = 1
    item_temp[0,int(item_id)-1] = 1
    row = torch.cat([user_temp, item_temp, user_feature_mat[int(user_id)-1].unsqueeze(dim= 0), item_feature_mat[int(item_id)-1].unsqueeze(dim= 0)], dim= 1)
    rows.append(row)
    y.append(int(rating))

X = torch.cat(rows, dim= 0)
y= torch.tensor(y, dtype=float) # tensor

In [9]:
print(f'X size: {X.size()}')

X size: torch.Size([100000, 2673])


## User item Matrix

In [10]:
class user_item_matrix():
  def __init__(self, path):
    names = ['user_id', 'item_id', 'rating']
    df = pd.read_csv(path, sep='\t', names=names)
    
    '''filtering'''
    grouped_df = df.groupby(['user_id'])['item_id'].count()
    filtered_user_id = grouped_df[grouped_df>3].index
    filtered_df = df.set_index('user_id').loc[filtered_user_id].reset_index()
    
    '''user_id to index_id'''
    self.user_dict = dict()
    for i, user_id in enumerate(filtered_user_id):
        self.user_dict[user_id] = i 
    
    self.n_user= filtered_df['user_id'].unique().shape[0]
    self.n_item = filtered_df['item_id'].unique().shape[0]
    self.data= [[self.user_dict[info[0]], info[1], info[2]] for info in np.array(filtered_df)]
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

  def k_fold_data_split(self):
    
    np.random.shuffle(self.data)
    train_data = self.data[:int(0.7*len(self.data))]
    val_data = self.data[int(0.7*len(self.data)):int(0.8*len(self.data))]
    test_data = self.data[int(0.8*len(self.data)):]
    return train_data, val_data, test_data

  def get_user_item_matrix(self):
    
    train_data, val_data, test_data= self.k_fold_data_split()
    matrix = torch.zeros((self.n_user, self.n_item), dtype= float, device= self.device)
    test_matrix = torch.zeros((self.n_user, self.n_item), dtype= float, device= self.device)
    # mask_matrix = torch.zeros((self.n_user, self.n_item), dtype= float).to(self.device)
    for info in train_data:
      matrix[info[0], info[1]-1] = info[2]
    for info in test_data:
      test_matrix[info[0], info[1]-1] = info[2]
    return matrix, test_matrix

# Factorization Machine 

* Generalization表現應較好

[Discussion](https://i.imgur.com/igROGcx.png)

In [11]:
class FM(nn.Module):
  def __init__(self, d, k):
    super(FM, self).__init__()
    """
    d: n_features
    k: n_latent features 
    """
    """params"""
    self.w0 = nn.Parameter(torch.zeros(size= (1,1), dtype= float, device= device))
    self.w = nn.Parameter(torch.zeros(size= (d, 1), dtype= float, device= device))
    self.v = nn.Parameter(torch.randn(size= (d, k), dtype= float, device= device))
    # self.params = nn.ParameterList([self.w0, self.w, self.v])

  def forward(self, X):
    '''X: [n, d]'''
    square_of_sum = (torch.matmul(X, self.v))**2  # [n, k]
    sum_of_square = torch.matmul(X**2, self.v**2) # [n, k]
    y = self.w0 + torch.matmul(X, self.w) + 0.5*(torch.sum((square_of_sum-sum_of_square), dim= 1).unsqueeze(dim= 1))
    return y 


In [12]:
model = FM(d= d, k= 3)

In [13]:
# model(X= X)

# Data Preparision & Training
* 5 fold cross validation 

In [14]:
kf = KFold(n_splits=5)
RMSEs = []

In [15]:
lr = 5e-2
n_epoch = 150
optimizer = torch.optim.SGD(model.parameters(), lr= lr)
criterion = nn.MSELoss()

In [16]:
for epoch in range(n_epoch):
  for train_indice, test_indice in kf.split(X):
    a = train_indice
    train_X, test_X = X[train_indice], X[test_indice]
    '''training process'''
    model.train()
    optimizer.zero_grad()
    output = model(X= train_X)
    loss = criterion(output.squeeze(dim= 1).cpu(), y[train_indice])
    loss.backward()
    optimizer.step()
    rmse = mean_squared_error(output.squeeze(dim= 1).detach().cpu(), y[train_indice], squared= False)
    # print(f'training rmse: {round(rmse,2)}')

    '''testing process'''
    model.eval()
    with torch.no_grad():
      output = model(X= test_X)
      rmse = mean_squared_error(output.squeeze(dim= 1).cpu(), y[test_indice], squared= False)
      # print(f'test rmse: {round(rmse, 2)}')
      RMSEs.append(rmse)  

  if ((epoch+1)% 10) == 0:
    print(f'epoch: {epoch+1}')
    print(f'avg RMSEs: {round(np.mean(RMSEs), 2)}')

epoch: 10
avg RMSEs: 3.15
epoch: 20
avg RMSEs: 2.7
epoch: 30
avg RMSEs: 2.48
epoch: 40
avg RMSEs: 2.36
epoch: 50
avg RMSEs: 2.27
epoch: 60
avg RMSEs: 2.21
epoch: 70
avg RMSEs: 2.16
epoch: 80
avg RMSEs: 2.11
epoch: 90
avg RMSEs: 2.08
epoch: 100
avg RMSEs: 2.05
epoch: 110
avg RMSEs: 2.02
epoch: 120
avg RMSEs: 2.0
epoch: 130
avg RMSEs: 1.98
epoch: 140
avg RMSEs: 1.96
epoch: 150
avg RMSEs: 1.94


In [17]:
model(X= test_X)

tensor([[4.3780],
        [2.0370],
        [2.1454],
        ...,
        [2.6915],
        [2.9283],
        [4.3591]], device='cuda:0', dtype=torch.float64,
       grad_fn=<AddBackward0>)

In [18]:
y[test_indice]

tensor([4., 1., 1.,  ..., 1., 2., 3.], dtype=torch.float64)