In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import KFold 
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
# import xgboost as xgb
from xgboost import XGBClassifier, DMatrix

In [None]:
# Absolute path to the tab-separated user-item rating file. Each line is later
# split into user_id, item_id, rating and one trailing field that is ignored.
user_item_path = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Movielens/user_movie.dat'

In [None]:
def get_feature(path, n_rows= None, n_features= None):
  """Load a tab-separated (id, feature_id) file into a dense multi-hot matrix.

  Both columns are treated as 1-based indices: a line "i<TAB>j" sets
  entry [i-1, j-1] of the result to 1.

  Args:
    path: location of the two-column, tab-separated, header-less file.
    n_rows: optional number of rows of the result. Defaults to the largest
      id found in the file. Pass it explicitly when entities with the highest
      ids may have no feature line, otherwise the matrix would be too short.
    n_features: optional number of columns of the result. Defaults to the
      largest feature_id found in the file.

  Returns:
    A float numpy array of shape (n_rows, n_features) holding 0/1 values.
  """
  names = ['id', 'feature_id']
  df = pd.read_csv(path, sep= '\t', names= names)
  # Convert the 1-based ids from the file into 0-based array indices.
  row_idx = df['id'].to_numpy(dtype= int) - 1
  col_idx = df['feature_id'].to_numpy(dtype= int) - 1
  if n_rows is None:
    n_rows = int(row_idx.max()) + 1
  if n_features is None:
    n_features = int(col_idx.max()) + 1
  feature_mat = np.zeros(shape= (n_rows, n_features), dtype= float)
  # One vectorised assignment instead of a Python-level loop over iterrows().
  feature_mat[row_idx, col_idx] = 1
  return feature_mat

In [None]:
# Directory holding the per-entity feature files (<name>.dat).
folder = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Movielens/'

# One multi-hot matrix per item-side feature file.
item_feature_mats = [
  get_feature(path= folder + name + '.dat')
  for name in ['movie_genre']
]

# One multi-hot matrix per user-side feature file.
user_feature_mats = [
  get_feature(path= folder + name + '.dat')
  for name in ['user_age', 'user_occupation']
]

In [None]:
# Stack the per-file matrices side by side so each entity gets one feature row.
item_feature_mat = np.hstack(item_feature_mats)
user_feature_mat = np.hstack(user_feature_mats)
print(f'item feature mat: {item_feature_mat.shape}')
print(f'user feature mat: {user_feature_mat.shape}')

# Rows are entities, columns are features.
n_user, n_user_feature = user_feature_mat.shape
n_item, n_item_feature = item_feature_mat.shape

# Width of a row that would hold one-hot ids plus features for both sides.
d = n_item + n_item_feature + n_user + n_user_feature
print(f'd: {d}')

item feature mat: (1682, 18)
user feature mat: (943, 29)
d: 2672


In [None]:
# Build the design matrix: one row per rating, made of the user's feature
# vector followed by the item's feature vector. The rating is the label.
# One-hot user/item ids are intentionally not part of the row; the per-line
# one-hot buffers that used to be allocated for them were never used and
# have been removed.
rows = []
y = []
with open(user_item_path, 'r') as f:
  for line in f:  # stream the file instead of loading every line up front
    line = line.strip()
    if not line:
      continue  # tolerate blank lines (e.g. a trailing newline at EOF)
    user_id, item_id, rating, _ = line.split('\t')
    # Ids in the file are 1-based; the feature matrices are 0-based.
    user_idx = int(user_id) - 1
    item_idx = int(item_id) - 1
    row = np.concatenate([user_feature_mat[user_idx], item_feature_mat[item_idx]])[np.newaxis, :]
    rows.append(row)
    y.append(int(rating))

X = np.concatenate(rows, axis= 0)
y = np.array(y)
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
print(f'n_rating: {len(set(y))}')
print(y)

X shape: (100000, 47)
y shape: (100000,)
n_rating: 5
[3 3 1 ... 1 2 3]


In [None]:
class XGB_LR():
  """Two-stage classifier: XGBoost leaf indices, one-hot encoded, fed to logistic regression.

  The gradient-boosted trees are used only as a feature transformer: every
  sample is described by the index of the leaf it reaches in each tree, those
  indices are one-hot encoded, and a logistic regression is trained on the
  resulting sparse embedding.
  """

  def __init__(self, x_train, y_train, x_test, y_test, n_estimator, depth, max_iter):
    """Store the data split and hyperparameters; nothing is trained here.

    Args:
      x_train: training feature matrix.
      y_train: training labels (any set of distinct class values).
      x_test: feature matrix to score in `test`.
      y_test: held-out labels; stored but not read by any method of this class.
      n_estimator: number of boosting rounds for XGBoost.
      depth: maximum tree depth for XGBoost.
      max_iter: iteration cap for the logistic regression solver.
    """
    self.x_train = x_train
    self.y_train = y_train
    self.x_test = x_test
    self.y_test = y_test
    self.n_estimator = n_estimator
    self.depth = depth
    self.max_iter = max_iter

  def XGB(self):
    """Fit XGBoost on the training split and encode its leaf indices.

    Returns:
      (xgb, encoder, embedding): the fitted XGBClassifier, the OneHotEncoder
      fitted on the training leaf indices, and the one-hot encoded training
      leaf indices.

    Note:
      The booster is trained on zero-based class codes, so calling
      `xgb.predict` on the returned model yields codes, not original labels.
    """
    # XGBoost's multi-class objectives expect labels in [0, num_class), so the
    # raw labels are mapped to consecutive codes and num_class is derived from
    # the data instead of being hard-coded.
    classes, y_encoded = np.unique(self.y_train, return_inverse= True)
    xgb = XGBClassifier(n_estimators= self.n_estimator, max_depth= self.depth, n_jobs= -1, objective= 'multi:softmax', num_class= len(classes), booster= 'gbtree', random_state= 42)
    xgb.fit(self.x_train, y_encoded)

    # Leaf index reached by every sample in every tree.
    # Shape: [n_interaction, n_estimator * n_class]
    output = xgb.apply(self.x_train)
    print(output.shape)

    # One-hot encode the leaf indices.
    encoder = OneHotEncoder().fit(output)
    embedding = encoder.transform(output)
    return xgb, encoder, embedding

  def LR(self, enc_x, y):
    """Fit and return a logistic regression on the encoded features `enc_x` and labels `y`."""
    lr = LogisticRegression(max_iter= self.max_iter, n_jobs= -1, random_state= 42)
    lr.fit(enc_x, y)
    return lr

  def train(self):
    """Train both stages on the training split.

    Returns:
      (xgb, encoder, lr): fitted XGBClassifier, fitted OneHotEncoder and the
      logistic regression fitted on the original (un-encoded) training labels.
    """
    xgb, encoder, enc_x = self.XGB()
    lr = self.LR(enc_x= enc_x, y= self.y_train)
    return xgb, encoder, lr

  def test(self):
    """Train from scratch, then score the test split.

    Every call retrains both stages via `train`.

    Returns:
      Class-probability matrix for `x_test`, one column per class known to the
      logistic regression. The matching labels are stored in `self.classes_`.
    """
    xgb, encoder, lr = self.train()
    # Expose the label of each probability column so callers need not assume
    # that column k corresponds to label k + 1.
    self.classes_ = lr.classes_
    xgb_output = xgb.apply(X= self.x_test)
    embedding = encoder.transform(xgb_output)
    pro = lr.predict_proba(embedding)
    return pro

In [None]:
# Hyperparameters shared by every fold: boosting rounds, tree depth and the
# iteration cap of the logistic regression.
n_estimator, depth, max_iter = 10, 5, 500

# 5-fold splitter with default settings (shuffling is not enabled).
kf = KFold(n_splits=5)

# Per-fold RMSE scores.
RMSEs = []

In [None]:
# 5-fold evaluation: train the XGBoost + LR pipeline on each training split,
# take the most probable rating for each test row and score it with RMSE.
RMSEs = []
for train_indices, test_indices in kf.split(X):
  x_train, y_train = X[train_indices], y[train_indices]
  x_test, y_test = X[test_indices], y[test_indices]
  model = XGB_LR(x_train= x_train, y_train= y_train, x_test=x_test, y_test= y_test, n_estimator=n_estimator, depth=depth, max_iter= max_iter)
  test_prob = model.test()
  # Probability columns follow the sorted distinct training labels, so map the
  # argmax through them instead of assuming column k is rating k + 1.
  class_labels = np.unique(y_train)
  test_rating = class_labels[np.argmax(test_prob, axis=1)]
  # Take the root explicitly; the `squared=False` flag is not available in
  # recent scikit-learn releases.
  rmse = np.sqrt(mean_squared_error(y_test, test_rating))
  RMSEs.append(rmse)
print(f"avg rmse: {np.mean(RMSEs)}")

(80000, 50)
(80000, 50)
(80000, 50)
(80000, 50)
(80000, 50)
avg rmse: 1.2022092414585606


In [None]:
# First 20 predicted ratings; `test_rating` is whatever the last iteration of
# the cross-validation loop left behind, i.e. the final fold only.
test_rating[:20]

array([4, 4, 4, 4, 4, 5, 3, 4, 4, 4, 4, 4, 4, 3, 4, 3, 3, 3, 4, 4])

In [None]:
# True ratings of the final fold (left over from the last loop iteration),
# shown for comparison with the predictions above.
y_test

array([4, 1, 1, ..., 1, 2, 3])