# Usage
We ultimately want to fill in manga ratings that are NA or null.


We chose SBERT (Sentence-BERT) over TF-IDF cosine similarity because we want to predict ratings from synopses that are semantically similar. SBERT uses transformer-based architectures, such as BERT, that understand context and the relationships between words, enabling it to capture the meaning of a sentence as a whole. TF-IDF would not be a good fit here because it primarily measures word occurrence and frequency, which does not capture context or semantics.

In [16]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

In [19]:
# Sentence-BERT checkpoint used to embed the synopses.
# Alternatives that were considered:
#   'all-mpnet-base-v2'       - general-purpose; best when accuracy matters more than speed.
#   'paraphrase-MiniLM-L6-v2' - balanced performance; paraphrase identification and text similarity.
# The selected checkpoint targets high-accuracy paraphrase identification and
# text similarity, and is slightly slower than MiniLM.
MODEL = 'paraphrase-mpnet-base-v2'

# Load the raw manga table and keep only the rows that have a synopsis,
# since the synopsis text is what gets embedded.
df = pd.read_csv('../data/raw/manga.csv').dropna(subset=['synopsis'])

# Report how many values are still missing in each remaining column.
missing_per_column = df.isna().sum()
print(missing_per_column)

manga_id                 0
title                    0
type                     0
score                21787
scored_by                0
status                   0
volumes              10655
chapters             11147
start_date            1618
end_date              9288
members                  0
favorites                0
sfw                      0
approved                 0
created_at_before        0
updated_at            1819
real_start_date       1618
real_end_date         9288
genres                   0
themes                   0
demographics             0
authors                  0
serializations           0
synopsis                 0
background           33479
main_picture            15
url                      0
title_english        28978
title_japanese        1002
title_synonyms           0
dtype: int64


In [20]:
# The synopsis text is expected in the 'synopsis' column; rows where it is
# missing are dropped so that only real text is passed to the encoder.
df = df.dropna(axis='index', subset=['synopsis'])
synopses = df['synopsis'].to_list()

# Instantiate the pre-trained Sentence-BERT checkpoint selected by MODEL.
model = SentenceTransformer(MODEL)

# Encode every synopsis; the result is requested as a tensor rather than
# the encoder's default output type.
embeddings = model.encode(
    synopses,
    convert_to_tensor=True,
)

In [21]:
from pathlib import Path

# Copy the embedding tensor to host memory and expose it as a NumPy array so
# it can be written with np.savez.
embeddings_np = embeddings.cpu().numpy()

# np.savez does not create missing parent directories. Create the output
# folder first so the save cannot fail after the expensive encoding step.
output_path = Path('../data/embeddings/synopsis_embeddings.npz')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Rows without a synopsis were dropped before encoding, so row i of the
# embedding matrix does not line up with row i of the raw CSV. Store the
# matching manga_id values next to the vectors so they can be joined back.
# The 'embeddings' key is unchanged, so existing readers keep working.
arrays_to_save = {'embeddings': embeddings_np}
manga_ids = df['manga_id'].to_numpy()
if len(manga_ids) == len(embeddings_np):
    arrays_to_save['manga_id'] = manga_ids
else:
    # df no longer matches the encoded synopses (e.g. cells were run out of
    # order); fall back to saving the vectors alone rather than wrong ids.
    print('Warning: df and embeddings differ in length; saving embeddings without manga_id.')

np.savez(output_path, **arrays_to_save)