In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd

In [2]:
def read_data_ml100k():
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv('ml-100k/u.data', sep='\t', names=columns, engine='python')
    n_users = data.user_id.unique().shape[0]
    n_items = data.item_id.unique().shape[0]
    return data, n_users, n_items

In [3]:
def split_data_ml100k(data, n_users, n_items, split_mode='random', test_ratio=0.1):
    if split_mode == 'seq-aware':
        train_items, test_items, train_list = {}, {}, []
        for line in data.itertuples():
            u, i, rating, time = line[1], line[2], line[3], line[4]
            train_items.setdefault(u, []).append((u, i, rating, time))

            if u not in test_items or test_items[u][-1] < time:
                test_items[u] = (i, rating, time)

        for u in range(1, n_users+1):
            train_list.extend(sorted(train_items[u], key=lambda k: k[3]))
        test_data = [(key, *value) for key, value in test_items.items()]
        train_data = [item for item in train_list if item not in test_data]
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
    else:
        mask = [True if x == 1
                else
                    False for x in np.random.uniform(0, 1, (len(data))) < 1 - test_ratio]
        neg_mask = [not x for x in mask]
        train_data, test_data = data[mask], data[neg_mask]

    return train_data, test_data

In [4]:
def load_data_ml100k(data, n_users, n_items, feedback='explicit'):
    users, items, scores = [], [], [] # users와 items, scores를 저장해놓을 리스트
    inter = np.zeros((n_items, n_users)) if feedback == 'explicit' else {} # feedback 이 'explicit'이면 inter를 [n_items, n_users] shape의 배열로 만든다. 'implicit'이면 빈 딕셔너리로
    for line in data.itertuples(): # line-by-line으로 data를 불러와서
        user_index, item_index = int(line[1] -1), int(line[2] - 1) # 라인의 첫번째 값에서 1을 뺀게 user_index, 라인의 두번째 값에서 1을 뺀게 item_index로 저장
        score = int(line[3]) if feedback == 'explicit' else 1 # 'explicit'이면 점수자체를 저장하고, 'implicit'이면 점수는 필요없고, 보았다는 것만 필요하기에 1로 저장
        users.append(user_index)
        items.append(item_index)
        scores.append(score) # user_index, item_index, score 다 추가
        if feedback == 'implicit': # 'implicit'이면
            inter.setdefault(user_index, []).append(item_index) # 해당 user_index에 본 item_index들을 저장해놓는다. => 아마 보았다는 것이 중요하기에 그런듯
        else:
            inter[item_index, user_index] = score # 'explicit'이면 그냥 점수를 inter list의 해당 부분에 저장

    return users, items, scores, inter

In [25]:
def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit', test_ratio=0.1, batch_size=256):
    data, n_users, n_items = read_data_ml100k()
    train_data, test_data = split_data_ml100k(data, n_users, n_items, split_mode, test_ratio)
    train_u, train_i, train_r, _ = load_data_ml100k(train_data, n_users, n_items, feedback)
    test_u, test_i, test_r, _ = load_data_ml100k(test_data, n_users, n_items, feedback)
    train_set = tf.data.Dataset.from_tensor_slices((np.array(train_u), np.array(train_i), np.array(train_r)))
    test_set = tf.data.Dataset.from_tensor_slices((np.array(test_u), np.array(test_i), np.array(test_r)))
    train_set = train_set.shuffle(1000).batch(batch_size, drop_remainder=True)
    test_set = test_set.batch(batch_size, drop_remainder=True)
    return n_users, n_items, train_set, test_set

In [26]:
n_users, n_items, train_set, test_set = split_and_load_ml100k(test_ratio=0.1, batch_size=512)

In [7]:
print(f'n_users: {n_users}')
print(f'n_items: {n_items}')

n_users: 943
n_items: 1682


In [32]:
print(len(list(train_set.as_numpy_iterator())))

193


In [9]:
class MyModel(tf.keras.Model):
    def __init__(self, n_users, n_items, K):
        super(MyModel, self).__init__()
        self.P = tf.keras.layers.Embedding(input_dim=n_users, output_dim=K)
        self.Q = tf.keras.layers.Embedding(input_dim=n_items, output_dim=K)
        self.user_bias = tf.keras.layers.Embedding(n_users, 1)
        self.item_bias = tf.keras.layers.Embedding(n_items, 1)

    def call(self, user_id, item_id):
        P_u = self.P(user_id)
        Q_i = self.Q(item_id)
        b_u = self.user_bias(user_id)
        b_i = self.user_bias(item_id)
        Q_i = tf.transpose(Q_i, perm=[0, 2, 1])
        outputs = tf.matmul(P_u, Q_i) + np.squeeze(b_u) + np.squeeze(b_i)
        return outputs

#============== 여기부터 다시

In [47]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)
ratings.drop('timestamp', axis=1, inplace=True)

columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
movies = movies[['item_id', 'movie title']]


n_users = len(ratings['user_id'].unique())
n_items = len(movies['item_id'].unique())
rating_mat = np.zeros(shape= (n_users, n_items))

ratings = ratings.to_numpy()
users = []
items = []
ids = []
for u, i, r in ratings:
    rating_mat[u-1, i-1] = r
    users.append(u)
    items.append(i)
    ids.append()

In [50]:
rating_mat = tf.convert_to_tensor(rating_mat)

In [51]:
rating_mat

<tf.Tensor: shape=(943, 1682), dtype=float64, numpy=
array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])>

In [42]:
input_arr1 = np.random.randn(n_users)
input_arr2 = np.random.randn(n_items)
user = tf.keras.layers.Embedding(input_dim=n_users, output_dim=20)(input_arr1)
item = tf.keras.layers.Embedding(input_dim=n_items, output_dim=20)(input_arr2)


In [43]:
item = tf.transpose(item, perm=[1, 0])

In [44]:
outputs = tf.matmul(user, item)

In [45]:
outputs.shape

TensorShape([943, 1682])

In [54]:
mse_loss_fn = tf.keras.losses.MeanSquaredError()

In [56]:
loss = mse_loss_fn(rating_mat, outputs)

In [62]:
loss

<tf.Tensor: shape=(), dtype=float32, numpy=0.86715966>

In [10]:
model = MyModel(n_users, n_items, 20)

In [13]:
model.compile(optimizer)

TypeError: Error converting shape to a TensorShape: Dimension value must be integer or None or have an __index__ method, got value '(943, 1682)' with type '<class 'tuple'>'.

## 위로 코드 써보기

---

In [None]:
passcolumns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
movies = movies[['item_id', 'movie title']]
movies.head()

In [4]:
user_movie_data = pd.merge(ratings, movies, on='item_id')
user_movie_data.head()

Unnamed: 0,user_id,item_id,rating,movie title
0,196,242,3,Kolya (1996)
1,63,242,3,Kolya (1996)
2,226,242,5,Kolya (1996)
3,154,242,3,Kolya (1996)
4,306,242,5,Kolya (1996)


In [5]:
user_movie_rating = user_movie_data.pivot_table('rating', index='user_id', columns='movie title').fillna(0)

In [8]:
user_movie_rating.shape[0]

943

In [11]:
user_movie_rating.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


In [12]:
matrix = user_movie_rating.to_numpy()

In [14]:
matrix

array([[0., 0., 2., ..., 0., 4., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
K = 20
n_user = user_movie_rating.shape[0]
n_movie = user_movie_rating.shape[1]

In [10]:
W = tf.Variable(tf.random.truncated_normal([n_user, K], stddev=0.2, mean=0), name="users")
H = tf.Variable(tf.random.truncated_normal([K, n_movie], stddev=0.2, mean=0), name="items")

In [15]:
pred_matrix = tf.matmul(W, H)

In [30]:
pred_matrix = np.asarray(pred_matrix)

In [18]:
idx, jdx = matrix.nonzero()

In [31]:
ys, preds = [], []
for i, j in zip(idx, jdx):
    ys.append(matrix[i][j])
    preds.append(pred_matrix[i][j])

In [32]:
error = tf.keras.losses.mean_squared_error(ys, preds)
print(error)

tf.Tensor(13.74855, shape=(), dtype=float32)


In [65]:
u = tf.keras.layers.Input(shape=(1, ))
m = tf.keras.layers.Input(shape=(1, ))
K = 20
u_embedding = tf.keras.layers.Embedding(n_users, K, embeddings_regularizer=tf.keras.regularizers.l2(1e-9))(u)
m_embedding = tf.keras.layers.Embedding(n_items, K, embeddings_regularizer=tf.keras.regularizers.l2(1e-9))(m)

u_bias = tf.keras.layers.Embedding(n_users, 1, embeddings_regularizer=tf.keras.regularizers.l2(1e-9))(u)
m_bias = tf.keras.layers.Embedding(n_items, 1, embeddings_regularizer=tf.keras.regularizers.l2(1e-9))(m)
x = tf.keras.layers.Dot(axes=2)([u_embedding, m_embedding])

x = tf.keras.layers.Add()([x, u_bias, m_bias])
x = tf.keras.layers.Flatten()(x)

model = tf.keras.Model(inputs=[u, m], outputs=x)

In [66]:
model.compile(loss='mse',
              optimizer=tf.keras.optimizers.SGD(learning_rate=0.08,
                                                momentum=0.9),
              metrics=['mae'])

In [67]:
rating_mat

<tf.Tensor: shape=(943, 1682), dtype=float64, numpy=
array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])>

<tf.Tensor: shape=(), dtype=float64, numpy=3.0>