# Read functions

In [1]:
import ast
import csv

def read_collected_movies(file_path):
    """
    Load the collected movies CSV into a list of dictionaries.

    The first row is treated as a header and skipped. Columns are read by position:
    0 movieId, 1 title, 2 genres, 3 imdb_link, 4 name, 5 directors, 6 writers, 7 actors,
    8 plot, 9 languages, 10 country_of_origin, 11 awards, 12 poster, 13 ratings.
    ``genres`` and ``ratings`` are parsed with ``ast.literal_eval``; ``directors``,
    ``writers``, ``actors`` and ``languages`` are split on ``', '``. Empty cells become
    empty lists for those list-valued columns.

    Args:
        file_path (str): Path of the UTF-8 encoded CSV file.

    Returns:
        tuple: ``(movies_collected, genres_collected)`` where ``movies_collected`` is a list
        with one dict per data row and ``genres_collected`` is a set updated with every
        row's parsed genres. Both are empty when the file has no data rows.
    """

    def _split(value):
        # Comma-separated cell -> list; an empty cell means "no values".
        return value.split(', ') if value else []

    movies_collected = []
    genres_collected = set()
    # newline='' is what the csv module expects: without it, line endings embedded in
    # quoted fields (e.g. multi-line plots) are rewritten by universal-newline handling.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        csv_reader = csv.reader(file)
        # Skip the header; the default avoids StopIteration on a completely empty file.
        if next(csv_reader, None) is None:
            return movies_collected, genres_collected
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to index.
                continue
            row_genres = ast.literal_eval(row[2]) if row[2] else []
            row_ratings = ast.literal_eval(row[13]) if row[13] else []

            movies_collected.append({
                'movieId': row[0],
                'title': row[1],
                'genres': row_genres,
                'imdb_link': row[3],
                'name': row[4],
                'directors': _split(row[5]),
                'writers': _split(row[6]),
                'actors': _split(row[7]),
                'plot': row[8],
                'languages': _split(row[9]),
                'country_of_origin': row[10],
                'awards': row[11],
                'poster': row[12],
                'ratings': row_ratings,
            })
            genres_collected.update(row_genres)
    return movies_collected, genres_collected


def read_collected_series(file_path):
    """
    Load the collected series CSV into a list of dictionaries.

    The first row is treated as a header and skipped. Columns are read by position:
    0 series_id, 1 vote_average, 2 vote_count, 3 name, 4 year, 5 release_date, 6 genres,
    7 directors, 8 writers, 9 actors, 10 plot, 11 languages, 12 country_of_origin,
    13 awards, 14 poster, 15 ratings, 16 imdb_link, 17 total_seasons.
    List-valued columns (genres, directors, writers, actors, languages) are split on
    ``', '``, stripped, and cleared of empty and ``'N/A'`` entries. ``vote_average`` is
    halved after conversion to float; ``ratings`` is parsed with ``ast.literal_eval``.

    Args:
        file_path (str): Path of the UTF-8 encoded CSV file.

    Returns:
        tuple: ``(series_collected, genres_collected)`` where ``series_collected`` is a list
        with one dict per data row and ``genres_collected`` is the set of all genre names
        seen. Both are empty when the file has no data rows.
    """

    def _split_clean(value):
        # Shared cleaning for every comma-separated column: trim each entry and drop
        # blanks and the 'N/A' placeholder.
        if not value:
            return []
        entries = (entry.strip() for entry in value.split(', '))
        return [entry for entry in entries if entry and entry != 'N/A']

    series_collected = []
    genres_collected = set()
    # newline='' is what the csv module expects: without it, line endings embedded in
    # quoted fields (e.g. multi-line plots) are rewritten by universal-newline handling.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        csv_reader = csv.reader(file)
        # Skip the header; the default avoids StopIteration on a completely empty file.
        if next(csv_reader, None) is None:
            return series_collected, genres_collected
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to index.
                continue
            row_genres = _split_clean(row[6])

            series_collected.append({
                'series_id': row[0],
                'vote_average': (float(row[1]) / 2) if row[1] else 0.0,
                'vote_count': int(row[2]) if row[2] else 0,
                'name': row[3],
                'year': row[4],
                'release_date': row[5],
                'genres': row_genres,
                'directors': _split_clean(row[7]),
                'writers': _split_clean(row[8]),
                'actors': _split_clean(row[9]),
                'plot': row[10],
                'languages': _split_clean(row[11]),
                'country_of_origin': row[12],
                'awards': row[13],
                'poster': row[14],
                'ratings': ast.literal_eval(row[15]) if row[15] else [],
                'imdb_link': row[16],
                'total_seasons': row[17]
            })
            genres_collected.update(row_genres)
    return series_collected, genres_collected



# CSV save functions

In [2]:
import csv

def save_movies_with_embedding_to_csv(movies_data, filename):
    """
    Write movies (including their 'embedding') to a CSV file with a fixed column order.

    Keys missing from a movie are written as empty strings; keys not listed in the header
    are ignored. Non-string values (lists such as genres or the embedding) are written
    using their ``str()`` form by the csv module. A row that cannot be written is reported
    on stdout and skipped so the remaining rows are still exported.

    Args:
        movies_data (list): Movie dictionaries to export.
        filename (str): Destination path; the file is overwritten.
    """
    header = [
        'movieId', 'title', 'genres', 'imdb_link', 'name', 'directors', 'writers', 'actors', 'plot',
        'languages', 'country_of_origin', 'awards', 'poster', 'ratings', 'embedding'
    ]

    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        for movie in movies_data:
            try:
                ordered_movie = {key: movie.get(key, '') for key in header}
                writer.writerow(ordered_movie)
            except Exception as e:
                # Look the title up defensively: indexing movie['title'] here would raise
                # a second exception (and abort the export) when the title is missing.
                title = movie.get('title', '<unknown>') if isinstance(movie, dict) else '<unknown>'
                print(f"Error processing movie {title}: {e}")



def save_series_with_embedding_to_csv(series_data, filename):
    """
    Write series (including their 'embedding') to a CSV file with a fixed column order.

    Keys missing from a serie are written as empty strings; keys not listed in the header
    are ignored. Non-string values (lists such as genres or the embedding) are written
    using their ``str()`` form by the csv module. A row that cannot be written is reported
    on stdout and skipped so the remaining rows are still exported.

    Args:
        series_data (list): Serie dictionaries to export.
        filename (str): Destination path; the file is overwritten.
    """
    header = [
        'series_id', 'vote_average', 'vote_count', 'name', 'year', 'release_date', 'genres', 'directors',
        'writers', 'actors', 'plot', 'languages', 'country_of_origin', 'awards', 'poster',
        'ratings', 'imdb_link', 'total_seasons', 'embedding'
    ]

    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        for serie in series_data:
            try:
                ordered_serie = {key: serie.get(key, '') for key in header}
                writer.writerow(ordered_serie)
            except Exception as e:
                # Look the name up defensively: indexing serie['name'] here would raise
                # a second exception (and abort the export) when the name is missing.
                name = serie.get('name', '<unknown>') if isinstance(serie, dict) else '<unknown>'
                print(f"Error processing serie {name}: {e}")


# Train data utils

In [3]:
import json


def create_train_data(movies: list[dict], series: list[dict], fields_to_use: list[str], save_filename: str,
                      train_filename: str):
    """
    Build sentence-pair training data from the raw similarity annotations and save it.

    For every raw entry the annotated show is rendered to text; for its 'most_similar'
    and 'least_similar' references (when present) one training example
    ``{'text1', 'text2', 'label'}`` is produced, with the label being the annotated
    similarity rounded to two decimals. Entries or references whose show cannot be found
    in ``movies`` / ``series`` are reported on stdout and skipped instead of crashing on a
    ``None`` show.

    Args:
        movies: Movie dictionaries (looked up by 'movieId').
        series: Serie dictionaries (looked up by 'series_id').
        fields_to_use: Show fields concatenated into each text.
        save_filename: Path the resulting training data JSON is written to.
        train_filename: Path of the raw training data JSON.
    """
    raw_train_data = read_raw_train_data(train_filename)
    train_data = []
    for data in raw_train_data:
        current_show = get_show(data['type'], data['key'], movies, series)
        if current_show is None:
            print(f"Skipping train entry: {data['type']} {data['key']} not found")
            continue
        current_show_text = create_show_text(current_show, fields_to_use)

        # Same handling for both references; order (most, then least) is preserved.
        for relation in ('most_similar', 'least_similar'):
            related = data.get(relation)
            if not related:
                continue
            related_show = get_show(related['show_type'], related['show_key'], movies, series)
            if related_show is None:
                print(f"Skipping {relation} pair: {related['show_type']} {related['show_key']} not found")
                continue
            train_data.append({'text1': current_show_text,
                               'text2': create_show_text(related_show, fields_to_use),
                               'label': round(related['similarity'], 2)})
    save_train_data(train_data, save_filename)


def save_train_data(train_data: list[dict], save_filename: str):
    """
    Serialize the training examples as JSON into ``save_filename`` (overwriting it).

    Args:
        train_data: Training examples to store.
        save_filename: Destination path of the JSON file.
    """
    with open(save_filename, 'w') as json_file:
        json.dump(obj=train_data, fp=json_file)


def create_show_text(show: dict, fields_to_use: list[str]):
    """
    Render the selected fields of a show as one space-separated text.

    List fields are joined with ', '. Every value is converted with ``str()`` so numeric
    fields (e.g. 'vote_count', 'vote_average') and lists of non-strings can be used
    without ``str.join`` raising ``TypeError``; ``None`` contributes an empty string.

    Args:
        show: Show dictionary.
        fields_to_use: Keys of ``show`` to include, in output order.

    Returns:
        str: The combined text ('' when ``fields_to_use`` is empty).

    Raises:
        KeyError: If a requested field is not present in ``show``.
    """
    combined_text = []
    for field in fields_to_use:
        value = show[field]
        if isinstance(value, list):
            combined_text.append(', '.join(str(item) for item in value))
        elif value is None:
            combined_text.append('')
        else:
            combined_text.append(str(value))
    return ' '.join(combined_text)


def get_show(show_type, key, movies, series):
    """
    Find a show by id in the movie or series collection.

    ``show_type == 'movie'`` searches ``movies`` by 'movieId'; any other value searches
    ``series`` by 'series_id'. Ids are compared as strings because the ids loaded from
    CSV are always strings while a key coming from JSON may be a number.

    Args:
        show_type: 'movie' for movies; anything else selects series.
        key: Id of the show to find.
        movies: Movie dictionaries.
        series: Serie dictionaries.

    Returns:
        The first matching dictionary, or None when no show has that id.
    """
    if show_type == 'movie':
        id_field, candidates = 'movieId', movies
    else:
        id_field, candidates = 'series_id', series
    wanted = str(key)
    return next((item for item in candidates if str(item[id_field]) == wanted), None)


def read_train_data(file_path):
    """
    Load prepared training data from a JSON file.

    The file is decoded as UTF-8 explicitly (matching the CSV readers in this notebook)
    so the result does not depend on the platform's default encoding.

    Args:
        file_path (str): Path of the JSON file.

    Returns:
        The parsed JSON content.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def read_raw_train_data(file_path):
    """
    Load the raw training annotations from a JSON file.

    The file is decoded as UTF-8 explicitly (matching the CSV readers in this notebook)
    so the result does not depend on the platform's default encoding.

    Args:
        file_path (str): Path of the JSON file.

    Returns:
        The first element of the parsed top-level JSON value.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Only the first top-level element is used; anything after it is ignored.
    return data[0]


# Embedding pipeline

## Variables

In [4]:
from enum import Enum


# CSV inputs that create_embeddings passes to read_collected_movies / read_collected_series.
collected_movies_file_path = '/content/movies_collected_data.csv'
collected_series_file_path = '/content/series_collected_data.csv'

# Raw training-data JSON files handed to create_train_data: the first one for
# EmbeddingType.SENT_TRANS, the "_ext" one for EmbeddingType.SENT_TRANS_EXT.
train_data_file = '/content/raw_train_data.json'
train_data_file_ext = '/content/raw_train_data_ext.json'


class EmbeddingType(Enum):
    """Selects which embedding pipeline create_embeddings runs."""
    # Nomic embeddings (create_embeddings_for_movies_nomic / create_embeddings_for_series_nomic).
    NOMIC = 1
    # Fine-tuned Sentence Transformers model, trained on data derived from train_data_file.
    SENT_TRANS = 3
    # Same Sentence Transformers pipeline, trained on data derived from train_data_file_ext.
    SENT_TRANS_EXT = 4


## Embedding creation functions

### Pip installations

In [None]:
!pip install -r "/content/requirements.txt"

In [6]:
from datasets import Dataset
from nomic import embed, atlas
import numpy as np
import time
import re
from sentence_transformers import SentenceTransformer, losses, SentenceTransformerTrainingArguments, \
    SentenceTransformerTrainer
from sklearn.model_selection import train_test_split


# NOTE(review): VECTOR_SIZE is not referenced anywhere in the visible notebook code —
# confirm whether it is still needed.
VECTOR_SIZE = 500


def create_embeddings_for_movies_nomic(movies, fields_to_use, batch_size=None):
    """
        Generates embeddings for movies using selected fields and adds 'embedding' value
        to the dictionary for each element (the input dictionaries are modified in place).

        The embeddings and basic metadata are also uploaded to Atlas for visualization;
        a failure of that upload is printed and does not affect the returned data.

        :param movies: List of movie dictionaries.
        :param fields_to_use: List of keys from the movie dictionaries to
        be used for embedding (e.g., ['plot', 'genres', 'actors', 'directors']).
        :param batch_size: Optional number of texts embedded per call. None (default)
        embeds all texts in a single call; a positive value embeds in batches and prints
        progress after each batch.
        :return: The movie list with embeddings added for each movie.
        :raises ValueError: If batch_size is given but not positive.
    """
    if batch_size is not None and batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    def _as_text(value):
        # Lists are joined with commas; everything is stringified so numeric fields
        # do not make str.join raise TypeError.
        if isinstance(value, list):
            return ', '.join(str(item) for item in value)
        return '' if value is None else str(value)

    # one text per movie, built from the requested fields in order
    texts_to_embed = [
        ' '.join(_as_text(movie[field]) for field in fields_to_use)
        for movie in movies
    ]
    start_time = time.time()

    if batch_size is None:
        # generate all embeddings in one call
        output = embed.text(
            texts=texts_to_embed,
            model='nomic-embed-text-v1.5',
            task_type='search_document',
            inference_mode='local'  # Enable local inference
        )
        embeddings = np.array(output['embeddings'])
    else:
        # generate embeddings in batches
        print(f'Generating embeddings in batches, batch size: {batch_size}')
        total_batches = (len(texts_to_embed) + batch_size - 1) // batch_size
        collected = []
        for batch_number, offset in enumerate(range(0, len(texts_to_embed), batch_size), start=1):
            batch_start_time = time.time()
            output = embed.text(
                texts=texts_to_embed[offset:offset + batch_size],
                model='nomic-embed-text-v1.5',
                task_type='search_document',
                inference_mode='local'  # Enable local inference
            )
            collected.extend(output['embeddings'])
            print(f"Processed batch {batch_number}/{total_batches}")
            print(f"Time taken: {time.time() - batch_start_time:.2f} seconds")
        embeddings = np.array(collected)

    print("All embeddings generated successfully!")
    end_time = time.time()
    elapsed_time_seconds = end_time - start_time
    elapsed_time_minutes = elapsed_time_seconds / 60
    print(f"Time taken: {elapsed_time_minutes:.2f} minutes")

    # assign embeddings to movies
    for i, movie in enumerate(movies):
        movie['embedding'] = embeddings[i].tolist()

    try:
        # Upload embeddings and metadata to Atlas for visualization (best effort)
        metadata = [
            {'movieId': movie['movieId'], 'title': movie['title'], 'genres': ', '.join(movie['genres'])}
            for movie in movies
        ]
        field_names = ','.join(fields_to_use)
        atlas.map_data(embeddings=embeddings, data=metadata, identifier=f'Movies {field_names}')
    except Exception as e:
        print(e)

    return movies


def create_embeddings_for_series_nomic(series, fields_to_use, batch_size=None):
    """
        Generates embeddings for series using selected fields and adds 'embedding' value
        to the dictionary for each element (the input dictionaries are modified in place).

        The embeddings and basic metadata are also uploaded to Atlas for visualization;
        a failure of that upload is printed and does not affect the returned data.

        :param series: List of serie dictionaries.
        :param fields_to_use: List of keys from the serie dictionaries to
        be used for embedding (e.g., ['plot', 'genres', 'actors', 'directors']).
        :param batch_size: Optional number of texts embedded per call. None (default)
        embeds all texts in a single call; a positive value embeds in batches and prints
        progress after each batch.
        :return: The serie list with embeddings added for each serie.
        :raises ValueError: If batch_size is given but not positive.
    """
    if batch_size is not None and batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    def _as_text(value):
        # Lists are joined with commas; everything is stringified so numeric fields
        # (e.g. 'vote_count', 'vote_average') do not make str.join raise TypeError.
        if isinstance(value, list):
            return ', '.join(str(item) for item in value)
        return '' if value is None else str(value)

    # one text per serie, built from the requested fields in order
    texts_to_embed = [
        ' '.join(_as_text(serie[field]) for field in fields_to_use)
        for serie in series
    ]
    start_time = time.time()

    if batch_size is None:
        # generate all embeddings in one call
        output = embed.text(
            texts=texts_to_embed,
            model='nomic-embed-text-v1.5',
            task_type='search_document',
            inference_mode='local'  # Enable local inference
        )
        embeddings = np.array(output['embeddings'])
    else:
        # generate embeddings in batches
        print(f'Generating embeddings in batches, batch size: {batch_size}')
        total_batches = (len(texts_to_embed) + batch_size - 1) // batch_size
        collected = []
        for batch_number, offset in enumerate(range(0, len(texts_to_embed), batch_size), start=1):
            batch_start_time = time.time()
            output = embed.text(
                texts=texts_to_embed[offset:offset + batch_size],
                model='nomic-embed-text-v1.5',
                task_type='search_document',
                inference_mode='local'  # Enable local inference
            )
            collected.extend(output['embeddings'])
            print(f"Processed batch {batch_number}/{total_batches}")
            print(f"Time taken: {time.time() - batch_start_time:.2f} seconds")
        embeddings = np.array(collected)

    print("All embeddings generated successfully!")
    end_time = time.time()
    elapsed_time_seconds = end_time - start_time
    elapsed_time_minutes = elapsed_time_seconds / 60
    print(f"Time taken: {elapsed_time_minutes:.2f} minutes")

    # assign embeddings to series
    for i, serie in enumerate(series):
        serie['embedding'] = embeddings[i].tolist()

    try:
        # Upload embeddings and metadata to Atlas for visualization (best effort)
        metadata = [
            {'serieId': serie['series_id'], 'title': serie['name'], 'genres': ', '.join(serie['genres'])}
            for serie in series
        ]
        field_names = ','.join(fields_to_use)
        atlas.map_data(embeddings=embeddings, data=metadata, identifier=f'Series {field_names}')
    except Exception as e:
        print(e)

    return series


def create_embeddings_sentence_transformer(series, movies, fields_to_use, train_data, epochs=20):
    """
    Fine-tune the Sentence Transformers model and embed all movies and series with it.

    Note the order mismatch: the parameters take ``series`` before ``movies`` while the
    result is returned as ``(movies, series)``.

    Args:
        series (list): Serie dictionaries to embed.
        movies (list): Movie dictionaries to embed.
        fields_to_use (list): Field names concatenated into the text that is embedded.
        train_data (list): Training examples passed to fine_tune_model.
        epochs (int): Number of fine-tuning epochs (defaults to 20, the previously
            hard-coded value).

    Returns:
        tuple: ``(movies_with_embeddings, series_with_embeddings)``.
    """
    print("Fine tuning model")
    fine_tuned_model = fine_tune_model(train_data, epochs)

    embedded = []
    # Movies first, then series — same order and messages as the timing output expects.
    for label, shows in (('movie', movies), ('series', series)):
        print(f"Generating {label} embeddings")
        start_time = time.time()
        embedded.append(generate_embeddings_sentence_transformer(shows, fields_to_use, fine_tuned_model))
        elapsed_time_minutes = (time.time() - start_time) / 60
        print(f"Time taken: {elapsed_time_minutes:.2f} minutes")

    movies_with_embeddings, series_with_embeddings = embedded
    return movies_with_embeddings, series_with_embeddings


def fine_tune_model(full_train_data, epochs=1, output_dir="/content/models/watchwise-recom-model-20-epoch"):
    """
    Fine-tune the SentenceTransformer model using the provided training data.

    The data is split 90/10 into train and evaluation sets (fixed random_state=42), the
    'all-mpnet-base-v2' model is trained with CosineSimilarityLoss, and the resulting
    model is saved under ``<output_dir>/final``.

    Args:
        full_train_data (list): List of training examples with 'text1', 'text2', and 'label'.
        epochs (int): Number of epochs for fine-tuning.
        output_dir (str): Directory for checkpoints and the final model. Defaults to the
            previously hard-coded path.

    Returns:
        model (SentenceTransformer): Fine-tuned SentenceTransformer model.
    """

    def _to_dataset(examples):
        # Column names are the ones the original datasets were built with.
        return Dataset.from_dict({
            'sentence1': [item['text1'] for item in examples],
            'sentence2': [item['text2'] for item in examples],
            'score': [item['label'] for item in examples],
        })

    model = SentenceTransformer('all-mpnet-base-v2')
    train_data, eval_data = train_test_split(full_train_data, test_size=0.1, random_state=42)

    train_dataset = _to_dataset(train_data)
    eval_dataset = _to_dataset(eval_data)
    train_loss = losses.CosineSimilarityLoss(model=model)

    # NOTE(review): the recorded run of this notebook shows an interactive wandb API-key
    # prompt during training; if that is unwanted, confirm whether reporting should be
    # disabled in these arguments.
    args = SentenceTransformerTrainingArguments(
        # Required parameter:
        output_dir=output_dir,
        # Optional training parameters:
        num_train_epochs=epochs,
        warmup_ratio=0.1,
        # Optional tracking/debugging parameters:
        eval_strategy="steps",
        eval_steps=300,
        save_strategy="steps",
        save_steps=300,
        save_total_limit=5,
        load_best_model_at_end=True,
        logging_steps=100
    )
    trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        loss=train_loss,
    )
    trainer.train()
    model.save_pretrained(f"{output_dir}/final")
    return model


def generate_embeddings_sentence_transformer(shows, fields_to_use, model):
    """
    Generate embeddings for a list of shows using specified fields.

    Each show is rendered as the space-separated concatenation of the requested fields
    (list fields joined with ', '; values converted with ``str()`` so numeric fields do
    not break the join), encoded with ``model.encode``, and the resulting vector is
    stored on the show as a plain list under 'embedding'. The input dictionaries are
    modified in place.

    Args:
        shows (list): List of dictionaries, each representing a show.
        fields_to_use (list): List of field names to concatenate for embeddings.
        model (SentenceTransformer): SentenceTransformer model for generating embeddings.

    Returns:
        shows (list): The same list that was passed in, each show now carrying 'embedding'.
    """

    def _as_text(value):
        if isinstance(value, list):
            return ', '.join(str(item) for item in value)
        return '' if value is None else str(value)

    show_texts = [
        ' '.join(_as_text(show[field]) for field in fields_to_use)
        for show in shows
    ]

    embeddings = model.encode(show_texts, show_progress_bar=True)
    for show, embedding in zip(shows, embeddings):
        show['embedding'] = embedding.tolist()

    return shows


## Embedding pipeline code

In [7]:

def create_embeddings(embedding_type: EmbeddingType, generate_train_data: bool = False):
    """
    Run the full embedding pipeline for the selected embedding type.

    Reads the collected movies and series, creates embeddings for both and writes the
    results to CSV files. For the Sentence Transformers variants the prepared training
    data is read from a JSON file, optionally regenerated first from the raw training
    data.

    Args:
        embedding_type: Which pipeline to run (see EmbeddingType).
        generate_train_data: When True, rebuild the prepared training data file before
            reading it. Not used by the NOMIC pipeline.

    Raises:
        ValueError: If ``embedding_type`` is not one of the supported EmbeddingType
            members (previously such a call silently did nothing).
    """
    fields_to_use = ['name', 'plot', 'genres', 'directors', 'actors']

    # Per-variant settings of the Sentence Transformers pipeline; the two variants only
    # differ in their input/output files.
    sentence_transformer_runs = {
        EmbeddingType.SENT_TRANS: {
            'save_filename': '/content/drive/MyDrive/Colab Notebooks/st_npgda_train.json',
            'raw_train_file': train_data_file,
            'movies_csv': '/content/movies_w_embedding_st_npgda_data.csv',
            'series_csv': '/content/series_w_embedding_st_npgda_data.csv',
        },
        EmbeddingType.SENT_TRANS_EXT: {
            # Sentence Transformers extended train data
            'save_filename': '/content/st_ext_npgda_train.json',
            'raw_train_file': train_data_file_ext,
            'movies_csv': '/content/movies_w_embedding_st_ext_20_epoch_npgda_data.csv',
            'series_csv': '/content/series_w_embedding_st_ext_20_epoch_npgda_data.csv',
        },
    }

    # Fail fast, before any file is read, when the type is not supported.
    if embedding_type != EmbeddingType.NOMIC and embedding_type not in sentence_transformer_runs:
        raise ValueError(f"Unsupported embedding type: {embedding_type!r}")

    movies_collected, genres_collected_movies_set = read_collected_movies(collected_movies_file_path)
    series_collected, genres_collected_series_set = read_collected_series(collected_series_file_path)

    if embedding_type == EmbeddingType.NOMIC:
        # nomic AI
        movies_with_embeddings = create_embeddings_for_movies_nomic(
            movies=movies_collected, fields_to_use=fields_to_use
        )
        save_movies_with_embedding_to_csv(movies_with_embeddings, '/content/movies_w_embedding_npgda_data.csv')

        series_with_embeddings = create_embeddings_for_series_nomic(
            series=series_collected, fields_to_use=fields_to_use
        )
        save_series_with_embedding_to_csv(series_with_embeddings, '/content/series_w_embedding_npgda_data.csv')
        return

    # Sentence Transformers
    run = sentence_transformer_runs[embedding_type]
    if generate_train_data:
        create_train_data(movies_collected, series_collected, fields_to_use, run['save_filename'],
                          run['raw_train_file'])
    train_data = read_train_data(run['save_filename'])
    movies_with_embeddings_st, series_with_embeddings_st = create_embeddings_sentence_transformer(
        movies=movies_collected, series=series_collected, train_data=train_data,
        fields_to_use=fields_to_use
    )
    save_movies_with_embedding_to_csv(movies_with_embeddings_st, run['movies_csv'])
    save_series_with_embedding_to_csv(series_with_embeddings_st, run['series_csv'])



## Running

In [8]:
# Run the extended Sentence Transformers pipeline using the already prepared training data.
create_embeddings(embedding_type=EmbeddingType.SENT_TRANS_EXT, generate_train_data=False)

Fine tuning model


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
300,0.0047,0.003012
600,0.0026,0.00616
900,0.0013,0.002711
1200,0.0009,0.003134
1500,0.0006,0.002977
1800,0.0007,0.002521
2100,0.0005,0.002165
2400,0.0004,0.002196
2700,0.0004,0.002351
3000,0.0003,0.002317


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Generating movie embeddings


Batches:   0%|          | 0/303 [00:00<?, ?it/s]

Time taken: 1.75 minutes
Generating series embeddings


Batches:   0%|          | 0/220 [00:00<?, ?it/s]

Time taken: 0.92 minutes
