In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [269]:
rating_data = pd.read_csv("../ml-100k/u.data", sep = "\t", header = None)
rating_data.columns = ["user_id", "item_id","rating","timestamp"]
item_data = pd.read_csv("../ml-100k/u.item", sep = "|", header = None, encoding='latin-1')
item_data.columns = ["movie_id", "movie_title", "release_date", "video_release_date",
              "IMDb_URL","unknown","Action","Adventure","Animation",
              "Children's","Comedy","Crime","Documentary" ,"Drama", "Fantasy",
              "Film-Noir" ,"Horror","Musical" , "Mystery","Romance", "Sci-Fi",
              "Thriller", "War", "Western" ]
item_data.fillna(value = {'release_date': "01-01-1900"},inplace = True)
user_data =  pd.read_csv("../ml-100k/u.user", sep = "|", header = None)
user_data.columns = ["user_id", "age", "gender", "occupation", "zip_code"]

item_data['release_year'] = pd.DatetimeIndex(item_data['release_date']).year
item_data['release_month'] = pd.DatetimeIndex(item_data['release_date']).month
item_data.drop(columns = ['movie_title', 'release_date', "video_release_date", "IMDb_URL"], inplace = True)
occupation_data = pd.read_csv("../ml-100k/u.occupation", sep = "\t", header = None)
occupation_data.columns = ['occupation']
occupation_index_map = {}
for index, row in occupation_data.iterrows():
    occupation_index_map.setdefault(row.occupation, index)

In [270]:
gender_index_map = {"M":0, "F":1}
user_data['gender_index'] = user_data['gender'].apply(lambda x: gender_index_map[x])
user_data['occupation_index'] = user_data['occupation'].apply(lambda x: occupation_index_map[x])

zip_code_index_map = {value:idx for idx,value in enumerate(list(user_data["zip_code"].unique()))}
user_data['zip_code_index'] = user_data['zip_code'].apply(lambda x: zip_code_index_map[x])
user_data.drop(columns = ['gender', 'occupation', 'zip_code'], inplace = True)

#rating_data['target'] = (rating_data['rating']>=4).astype(int)
rating_data.drop(columns = ['timestamp'], inplace = True)

In [271]:
pos_sample = rating_data.merge(user_data, left_on='user_id', right_on='user_id').merge(item_data, left_on='item_id', right_on='movie_id')
pos_sample.drop(columns = ['item_id'], inplace = True)

In [283]:
movie_click = {}

for _, row in pos_sample.iterrows():
    movie_click.setdefault(row.movie_id, 0)
    movie_click[row.movie_id] += 1
for key in movie_click:
    movie_click[key] = movie_click[key] ** 0.75

neg_sample = pd.DataFrame()
init_neg_sample = False
count = 0
for user_id, history in pos_sample.groupby('user_id'):
    view_dict = {key: value for key, value in movie_click.items() if key not in history.movie_id.to_list()}
    view_dict_value = list(view_dict.values())
    sum_prob = sum(view_dict_value)
    neg_sample_select = np.random.choice(list(view_dict.keys()),
                                         size=min(len(view_dict.keys()),len(history)*2),
                                         p = [i / sum_prob for i in view_dict_value], 
                                         replace=False)
    
    neg_user_tmp = pd.DataFrame(user_data.loc[user_data.user_id == user_id].values.reshape(1,-1).repeat(len(history)*2, axis = 0))
    neg_user_tmp.columns = user_data.columns
    
    neg_sample_tmp = pd.concat([neg_user_tmp.reset_index(),
                            item_data[item_data['movie_id'].isin(neg_sample_select)].reset_index()], axis = 1)
    neg_sample_tmp.drop(columns = ['index'], inplace = True)
    neg_sample_tmp.insert(1,'rating', [0 for _ in range(len(neg_user_tmp))])
    if not init_neg_sample:
        neg_sample = neg_sample_tmp
        init_neg_sample = True
    else:
        neg_sample = pd.concat([neg_sample, neg_sample_tmp])

neg_sample.reset_index(inplace = True)

Unnamed: 0,user_id,rating,age,gender_index,occupation_index,zip_code_index,movie_id,unknown,Action,Adventure,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year,release_month
0,1,0,24,0,19,0,273.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1995.0,1.0
1,1,0,24,0,19,0,274.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1995.0,1.0
2,1,0,24,0,19,0,275.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1995.0,1.0
3,1,0,24,0,19,0,276.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1995.0,1.0
4,1,0,24,0,19,0,277.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1995.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,943,0,22,0,18,794,1489.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994.0,1.0
332,943,0,22,0,18,794,1490.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1993.0,1.0
333,943,0,22,0,18,794,1511.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1997.0,5.0
334,943,0,22,0,18,794,1515.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1995.0,1.0


In [278]:
1682 - 636

1046

In [264]:
user_id

2

In [251]:
pos_sample.columns

Index(['user_id', 'rating', 'age', 'gender_index', 'occupation_index',
       'zip_code_index', 'movie_id', 'unknown', 'Action', 'Adventure',
       'Animation', 'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western', 'release_year',
       'release_month'],
      dtype='object')

In [252]:
neg_sample.columns

Index(['user_id', 'rating', 'age', 'gender_index', 'occupation_index',
       'zip_code_index', 'movie_id', 'unknown', 'Action', 'Adventure',
       'Animation', 'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western', 'release_year',
       'release_month'],
      dtype='object')

In [254]:
pos_sample.columns == neg_sample.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [55]:
import torch.nn.functional as F
class DSSM(nn.Module):
    def __init__(self, embedding_dim, vocab_size, user_dnn_size = (64,32),
                item_dnn_size = (64,32), dropout = 0.0):
        super(DSSM, self).__init__()
        total_feature = user_feature_columns + item_feature_columns
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        nn.init.normal_(self.embedding.weight, mean = 0, std = 0)
        
        user_DNN_input_dim = sum()
        self.user_DNN = nn.sequential(
            nn.Linear(user_DNN_input_dim, user_dnn_size[0]),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(user_dnn_size[0], user_dnn_size[1])
        )
        
        item_DNN_input_dim = sum()
        self.item_DNN = nn.sequential(
            nn.Linear(item_DNN_input_dim, item_dnn_size[0]),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(item_dnn_size[0], item_dnn_size[1])
        )
    def user_embedding(self, X_user):
        user_emb = self.embedding.weight[X_user].reshape(X_user.shape[0], -1)
        return self.user_DNN(user_emb)
    def item_embedding(self, X_item):
        item_emb = self.embedding.weight[X_item].reshape(X_item.shape[0], -1)
        return self.item_DNN(item_emb)
    def forward(self, X_user, X_item):
        user_out_emb = self.user_embedding(X_user)
        item_out_emb = self.item_embedding(X_item)
        
        return F.cosine_similarity(user_out_emb, item_out_emb)

        

In [31]:
class Tower(nn.Module):
    def __init__(self, dnn_size = (256,128), dropout = 0.0, 
                activation = "ReLU"):
        super().__init__()
        self.dnns = nn.ModuleList()
        self.embeddings = EmbeddingModule(datatypes)
        input_dims = self.embeddings.sparse_dim + self.embeddings.dense_dim
        for dim in dnn_size:
            self.dnns.append(nn.Linear(input_dims, dim))
            self.dnns.append(nn.Dropout(dropout))
            self.dnns.append(get_activation(activation))
            input_dims = dim
        
    def forward(self, x):
        dnn_input = self.embeddings(x)
        for dnn in self.dnns:
            dnn_input = dnn(dnn_input)
        return dnn_input

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year,release_month
265,266,Kull the Conqueror (1997),29-Aug-1997,,http://us.imdb.com/M/title-exact?Kull+the+Conq...,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1997.0,8.0
267,268,Chasing Amy (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Chasing+Amy+(...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1997.0,1.0
268,269,"Full Monty, The (1997)",01-Jan-1997,,http://us.imdb.com/M/title-exact?Full+Monty%2C...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1997.0,1.0


In [112]:

# 假设词汇表大小为 100，嵌入维度为 50
vocab_size = 100
embedding_dim = 50

# 创建一个 Embedding 层，参数分别为词汇表大小和嵌入维度
embedding_layer = nn.Embedding(vocab_size, 2)

# 定义一个输入序列，假设长度为 5
input_sequence = torch.tensor([1, 4, 10, 20, 30])

# 使用 Embedding 层对输入序列进行嵌入
embedded_sequence = embedding_layer(input_sequence)

# 输出嵌入后的序列形状
print(embedded_sequence.shape)  # 输出：torch.Size([5, 50])

torch.Size([5, 2])


In [119]:
embedding_layer = nn.Embedding(5, 5)


In [120]:
embedding_layer( torch.tensor([1,4,2,2,2,2]))

tensor([[ 1.9929, -0.4436, -0.0542, -0.2881, -0.8071],
        [-0.0117,  0.0163, -1.3153, -0.4596,  0.3781],
        [ 3.4998, -0.8703, -1.4318, -0.7753,  0.5163],
        [ 3.4998, -0.8703, -1.4318, -0.7753,  0.5163],
        [ 3.4998, -0.8703, -1.4318, -0.7753,  0.5163],
        [ 3.4998, -0.8703, -1.4318, -0.7753,  0.5163]],
       grad_fn=<EmbeddingBackward0>)