* [架構](https://i.imgur.com/umeIrJr.png)
* [AUC](https://i.imgur.com/cXD1WYu.png)
* [優缺點1](https://i.imgur.com/wiQKU5W.png)
* [優缺點2](https://i.imgur.com/1UaPqK3.png)

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import KFold 
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing.data import OneHotEncoder



In [2]:
# Location of the tab-separated user-item interaction file (user id, item id,
# rating and one further field per line, as parsed when X is built).
user_item_path = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Movielens/user_movie.dat'
# device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Feature

* X: [n_interaction, n_feature]：one hot for user, item, multi-hot for user_feature & item_feature
* [GBDT for cross-feature](https://i.imgur.com/0deDZsX.png) 
  
    GBDT應用概念：將X(Feature)作為input，利用GBDT產生對應的cross-feature(樹中每個節點都是單個feature)，利用GBDT中boosting的概念，強化對於錯誤的學習，找出更能代表data的cross-feature，將所有DT的結果(每個都產出對於error的預測)做one hot concat後得到最終embedding，再利用LR(Logistic Regression)進行分類預測，所以GBDT扮演Pretrain的角色。
* n_CrossFeature = n_trees
* 減少data imbalance：negative down-sampling 
    
    問題：
    1. 是否將原始feature與cross-feature做concat？
    2. 決定n_tree, tree depth
    3. 將power law & non power law feature各自分開train(各配GBDT)

In [3]:
def get_feature(path):
  """Load an (id, feature_id) pair file into a dense 0/1 indicator matrix.

  The file is read as tab-separated text without a header row; its two
  columns are named ``id`` and ``feature_id``. Both are treated as 1-based,
  so entry ``[id - 1, feature_id - 1]`` is set to 1 for every listed pair.

  Args:
    path: Path (or anything ``pandas.read_csv`` accepts) of the pair file.

  Returns:
    A float ``numpy.ndarray`` of shape ``(max id, max feature_id)`` holding
    1.0 for every pair present in the file and 0.0 elsewhere.
  """
  names = ['id', 'feature_id']
  df = pd.read_csv(path, sep='\t', names=names)
  n = int(df['id'].max())
  n_feature = int(df['feature_id'].max())
  feature_mat = np.zeros(shape=(n, n_feature), dtype=float)

  # Shift the 1-based ids to 0-based positions and set every listed pair in a
  # single vectorized assignment instead of a Python-level loop over rows.
  row_idx = df['id'].astype(int).to_numpy() - 1
  col_idx = df['feature_id'].astype(int).to_numpy() - 1
  feature_mat[row_idx, col_idx] = 1
  return feature_mat

In [4]:
# Directory that holds the MovieLens side-information files.
folder = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Movielens/'

# One indicator matrix per item-side attribute file, in listing order.
item_feature_mats = [
  get_feature(path=folder + stem + '.dat')
  for stem in ('movie_genre', 'movie_movie(knn)')
]

# One indicator matrix per user-side attribute file, in listing order.
user_feature_mats = [
  get_feature(path=folder + stem + '.dat')
  for stem in ('user_age', 'user_occupation')
]

In [5]:
# Join the per-file indicator matrices column-wise into one matrix per side.
item_feature_mat = np.concatenate(item_feature_mats, axis=1)
user_feature_mat = np.concatenate(user_feature_mats, axis=1)
print(f'item feature mat: {item_feature_mat.shape}')
print(f'user feature mat: {user_feature_mat.shape}')

# Row counts give the number of users / items; column counts the number of
# side features on each side.
n_user, n_user_feature = user_feature_mat.shape
n_item, n_item_feature = item_feature_mat.shape

# Width of one interaction row: both one-hot id blocks plus both feature blocks.
d = n_item + n_item_feature + n_user + n_user_feature
print(f'd: {d}')

item feature mat: (1682, 19)
user feature mat: (943, 29)
d: 2673


In [6]:
# Parse the interaction file once, collecting the 0-based user/item position
# and the rating of every interaction.
user_idx = []
item_idx = []
y = []
with open(user_item_path, 'r') as f:
  for line in f:  # stream the file rather than materialising it via readlines()
    user_id, item_id, rating, _ = line.strip().split('\t')
    user_idx.append(int(user_id) - 1)  # ids in the file are 1-based
    item_idx.append(int(item_id) - 1)
    y.append(int(rating))

user_idx = np.array(user_idx, dtype=int)
item_idx = np.array(item_idx, dtype=int)
n_interaction = len(y)
n_user_feature = user_feature_mat.shape[1]
n_item_feature = item_feature_mat.shape[1]

# Fill one preallocated matrix instead of building and concatenating one small
# array per interaction. Column layout of every row:
#   [user one-hot | item one-hot | user features | item features]
X = np.zeros(shape=(n_interaction, n_user + n_item + n_user_feature + n_item_feature), dtype=float)
interaction_idx = np.arange(n_interaction)

# Basic slices are views, so writing through them updates X in place.
user_block = X[:, :n_user]
user_block[interaction_idx, user_idx] = 1
item_block = X[:, n_user:n_user + n_item]
item_block[interaction_idx, item_idx] = 1

feature_start = n_user + n_item
X[:, feature_start:feature_start + n_user_feature] = user_feature_mat[user_idx]
X[:, feature_start + n_user_feature:] = item_feature_mat[item_idx]

y = np.array(y)
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
print(f'n_rating: {len(set(y))}')

X shape: (100000, 2673)
y shape: (100000,)
n_rating: 5


# Model

In [7]:
class GBDT_LR():
  """Gradient-boosted trees used as a feature encoder for logistic regression.

  A ``GradientBoostingClassifier`` is fitted on the raw features. For every
  sample, the index of the leaf it reaches in each fitted tree is one-hot
  encoded, and a ``LogisticRegression`` is trained on that encoding.

  Args:
    x_train: Training feature matrix passed to the GBDT.
    y_train: Training labels.
    x_test: Feature matrix to predict on in ``test``.
    y_test: Labels for ``x_test``; stored but not used by this class.
    n_estimator: Number of boosting stages of the GBDT.
    depth: Maximum depth of each tree.
    max_iter: Iteration cap of the logistic regression solver.
  """

  def __init__(self, x_train, y_train, x_test, y_test, n_estimator, depth, max_iter):
    self.x_train = x_train
    self.y_train = y_train
    self.x_test = x_test
    self.y_test = y_test
    self.n_estimator = n_estimator
    self.depth = depth
    self.max_iter = max_iter

  @staticmethod
  def _flatten_leaves(leaves):
    """Reshape a 3-D leaf-index array to 2-D, keeping the first axis.

    ``GradientBoostingClassifier.apply`` returns one leaf index per sample,
    boosting stage and per-stage tree. Keeping only ``[:, :, 0]`` would drop
    every tree but the first of each stage in the multi-class case; flattening
    the last two axes keeps them all and equals that slice when the last axis
    has length 1 (binary classification).
    """
    n_samples, n_stages, n_per_stage = leaves.shape
    return leaves.reshape(n_samples, n_stages * n_per_stage)

  def GBDT(self):
    """Fit the GBDT and the one-hot encoder on the training data.

    Returns:
      Tuple ``(gbdt, encoder, embedding)``: the fitted classifier, the fitted
      ``OneHotEncoder`` and the encoded training leaves.
    """
    gbdt = GradientBoostingClassifier(n_estimators=self.n_estimator, max_depth=self.depth)
    gbdt.fit(self.x_train, self.y_train)

    # Leaf reached by every training sample in every fitted tree.
    leaves = self._flatten_leaves(gbdt.apply(X=self.x_train))

    # One-hot encode the leaf indices, one group of columns per tree.
    encoder = OneHotEncoder().fit(leaves)
    embedding = encoder.transform(leaves)
    return gbdt, encoder, embedding

  def LR(self, enc_x, y):
    """Fit and return a logistic regression on the encoded features."""
    lr = LogisticRegression(max_iter=self.max_iter)
    lr.fit(enc_x, y)
    return lr

  def train(self):
    """Fit the whole pipeline; returns ``(gbdt, encoder, lr)``."""
    gbdt, encoder, enc_x = self.GBDT()
    lr = self.LR(enc_x=enc_x, y=self.y_train)
    return gbdt, encoder, lr

  def test(self):
    """Train the pipeline, then return class probabilities for ``x_test``.

    Note that every call refits the models via ``train``.
    """
    gbdt, encoder, lr = self.train()
    leaves = self._flatten_leaves(gbdt.apply(X=self.x_test))
    # Reuse the encoder fitted on the training leaves so the test columns line
    # up with the ones the logistic regression was trained on.
    embedding = encoder.transform(leaves)
    pro = lr.predict_proba(embedding)
    return pro
    


# Training Stage

In [8]:
# 5-fold cross-validation splitter applied to the interaction rows.
# NOTE(review): no shuffle argument is passed; if the interaction file is
# ordered (e.g. by user) the folds will be contiguous slices — confirm intended.
kf = KFold(n_splits=5)
# Intended to hold the RMSE of each fold.
RMSEs = []
# Hyper-parameters forwarded to GBDT_LR: number of boosting stages, maximum
# tree depth, and the logistic-regression iteration cap.
n_estimator= 20
depth= 5
max_iter = 500

In [None]:
for train_indices, test_indices in kf.split(X):
  x_train, y_train = X[train_indices], y[train_indices]
  x_test, y_test = X[test_indices], y[test_indices]

  model = GBDT_LR(x_train= x_train, y_train= y_train, x_test=x_test, y_test= y_test, n_estimator=n_estimator, depth=depth, max_iter= max_iter)
  test_prob = model.test()

  # Probability columns follow the classifier's sorted training labels, so map
  # the arg-max column back through those labels instead of assuming they are
  # exactly 1..K (which `argmax + 1` silently relied on).
  rating_labels = np.unique(y_train)
  test_rating = rating_labels[np.argmax(test_prob, axis=1)]

  # Record the fold's RMSE between the true and the predicted ratings.
  RMSEs.append(np.sqrt(mean_squared_error(y_test, test_rating)))

  # NOTE(review): only the first fold is evaluated; remove this `break` to run
  # the full 5-fold cross-validation.
  break

In [None]:
# Show the shape of the probability matrix returned for the evaluated fold.
print(test_prob.shape)

In [None]:
# np.argmax(test_prob, axis=1)+1

In [None]:
# Display the predicted ratings of the evaluated fold as the cell output.
test_rating