In [4]:
import os
import torch
import torch.nn as nn
import pandas as pd
from pathlib import Path
from datetime import datetime
from transformers import pipeline, BertModel
from data_process import DatasetPrepare
from tqdm import tqdm

class TextEmbedder(nn.Module):
    def __init__(self, embedding_dim, gpu_num):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.word_embedder = pipeline('feature-extraction',model='../bert-base-uncased')
        self.fc = nn.Linear(768, embedding_dim)
        self.dropout = nn.Dropout(0.1)
        self.gpu_num = gpu_num

    def forward(self, text):
        textual_description = text

        # Use BERT to extract features
        word_embeddings = self.word_embedder(textual_description)

        # BERT gives us embeddings for [CLS] ..  [EOS], which is why we only average the embeddings in the range [1:-1]
        # We're not fine tuning BERT and we don't want the noise coming from [CLS] or [EOS]
        word_embeddings = [torch.FloatTensor(x[0][1:-1]).mean(axis=0) for x in word_embeddings]
        word_embeddings = torch.stack(word_embeddings)

        # Embed to our embedding space
        word_embeddings = self.dropout(self.fc(word_embeddings))

        return word_embeddings


In [11]:
comment_embedding = TextEmbedder(32, 0)

Some weights of the model checkpoint at ../bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
data_df = pd.read_excel('dataset/data.xlsx')
test_set = data_df.sample(n=64, random_state=21)
train_set = data_df.drop(test_set.index)
test_df = test_set.reset_index(drop=True)
test_df.to_excel('dataset/test.xlsx',encoding='utf-8',index=False)
train_df = train_set.reset_index(drop=True)
train_df.to_excel('dataset/train.xlsx',encoding='utf-8',index=False)

In [8]:
comments = pd.read_excel('dataset/comment.xlsx')

In [13]:
# get comment 
comments_text_encoding = torch.zeros(len(train_df), 100, 32)

# Read the descriptions and the images
image_features, movies_description = [], []
for (idx, row) in tqdm(train_df.iterrows(), total=len(train_df), ascii=True):
    name = row['name']

    # get comment
    comment = comments[comments['name'] == name].head(100)
    comment_list = list(comment['recommend'])
    comment_text_encoding = comment_embedding(comment_list)
    comments_text_encoding[idx] = comment_text_encoding


100%|##########| 642/642 [39:35<00:00,  3.70s/it]


In [14]:
comments_text_encoding.size()

torch.Size([642, 100, 32])

In [15]:
torch.save(comments_text_encoding,'dataset/train_comment_text_encoding.pth')

In [16]:
# get comment 
test_comment_text_encoding = torch.zeros(len(test_df), 100, 32)

# Read the descriptions and the images
for (idx, row) in tqdm(test_df.iterrows(), total=len(test_df), ascii=True):
    name = row['name']

    # get comment
    comment = comments[comments['name'] == name].head(100)
    comment_list = list(comment['recommend'])
    comment_text_encoding = comment_embedding(comment_list)
    test_comment_text_encoding[idx] = comment_text_encoding

100%|##########| 64/64 [03:56<00:00,  3.69s/it]


In [17]:
torch.save(test_comment_text_encoding,'dataset/test_comment_text_encoding.pth')

In [18]:
test_comment_text_encoding.size()

torch.Size([64, 100, 32])

In [41]:
test_comment =torch.load('dataset/test_comment_text_encoding.pth')

In [42]:
test_comment

tensor([[[-0.5094, -0.0366,  0.3463,  ..., -0.0930,  0.5772,  0.1923],
         [-0.3726,  0.2514,  0.5061,  ...,  0.2774,  0.5547,  0.4254],
         [-0.7806, -0.1106,  0.5456,  ...,  0.5057,  0.8668,  0.3378],
         ...,
         [-0.7806, -0.1106,  0.5456,  ...,  0.5057,  0.8668,  0.3378],
         [-0.7806, -0.1106,  0.5456,  ...,  0.5057,  0.8668,  0.3378],
         [-0.7806, -0.1106,  0.5456,  ...,  0.0000,  0.0000,  0.3378]],

        [[-0.3726,  0.2514,  0.5061,  ...,  0.2774,  0.5547,  0.4254],
         [-0.0000, -0.1106,  0.5456,  ...,  0.5057,  0.8668,  0.3378],
         [-0.3726,  0.2514,  0.5061,  ...,  0.2774,  0.5547,  0.0000],
         ...,
         [-0.3726,  0.0000,  0.0000,  ...,  0.2774,  0.5547,  0.0000],
         [-0.3726,  0.2514,  0.0000,  ...,  0.2774,  0.5547,  0.4254],
         [-0.3726,  0.2514,  0.5061,  ...,  0.2774,  0.0000,  0.0000]],

        [[-0.0000, -0.0000,  0.5456,  ...,  0.5057,  0.8668,  0.3378],
         [-0.5094, -0.0366,  0.3463,  ..., -0

In [25]:
introduction_embedding = TextEmbedder(32, 0)

Some weights of the model checkpoint at ../bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
import ast
movies_description = []
for (idx, row) in tqdm(train_df.iterrows(), total=len(train_df), ascii=True):
    id, name, director, actor, introduction = row['id'], row['name'], row['director'], row['actor'], row['introduction']

    # get movies description
    director = ' '.join(ast.literal_eval(director))
    actor = ' '.join(ast.literal_eval(actor))
    # description_tmp = director + actor + introduction
    description_tmp = 'director:' + director +'actor:' + actor
    movies_description.append(description_tmp)


100%|##########| 642/642 [00:00<00:00, 5618.90it/s]


In [32]:
introduction_emcoding = introduction_embedding(movies_description)

In [34]:
introduction_emcoding.size()

torch.Size([642, 32])

In [35]:
torch.save(introduction_emcoding,'dataset/train_introduction_emcoding.pth')

In [36]:
import ast
movies_description1 = []
for (idx, row) in tqdm(test_df.iterrows(), total=len(test_df), ascii=True):
    id, name, director, actor, introduction = row['id'], row['name'], row['director'], row['actor'], row['introduction']

    # get movies description
    director = ' '.join(ast.literal_eval(director))
    actor = ' '.join(ast.literal_eval(actor))
    # description_tmp = director + actor + introduction
    description_tmp = 'director:' + director +'actor:' + actor
    movies_description1.append(description_tmp)

100%|##########| 64/64 [00:00<00:00, 3173.37it/s]


In [37]:
introduction_emcoding1 = introduction_embedding(movies_description1)

In [38]:
introduction_emcoding1.size()

torch.Size([64, 32])

In [39]:
torch.save(introduction_emcoding1,'dataset/test_introduction_emcoding.pth')

In [40]:
introduction_emcoding1

tensor([[-0.1838,  0.2696,  0.4917,  ...,  0.3358, -0.4374,  0.4716],
        [-0.0302,  0.3285,  0.5706,  ...,  0.3833, -0.4051,  0.5133],
        [ 0.0000,  0.3760,  0.6099,  ...,  0.4628, -0.3481,  0.5542],
        ...,
        [ 0.0182,  0.4092,  0.6156,  ...,  0.4188, -0.3816,  0.4991],
        [ 0.0331,  0.3868,  0.5983,  ...,  0.0000, -0.4241,  0.4429],
        [-0.0000,  0.3850,  0.3614,  ...,  0.3208, -0.4255,  0.4127]],
       grad_fn=<MulBackward0>)