In [2]:
import pandas as pd
import numpy as np
import ast
import os

from tqdm import tqdm
from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
# Load the full movie table; the 'genres' column is stored as a Python-literal
# list string, so parse it and lowercase every genre name in one pass.
df = pd.read_csv('data/data.csv').assign(
    genres=lambda frame: frame['genres'].apply(
        lambda raw: [name.lower() for name in ast.literal_eval(raw)]
    )
)

In [4]:
# Read the genre vocabulary, one name per line. The context manager makes sure
# the file handle is closed (the bare open() call used to leak it).
with open('genres.txt', 'r') as genres_file:
    genres = [line.strip() for line in genres_file]
genres[:5]

['action', 'adventure', 'animation', 'anime', 'british shows']

In [4]:
def generate_validation_data(n_samples: int = 1000, random_state: int = 42) -> None:
    """Write a genre-stratified sample of the global ``df`` to 'data/valid_data.csv'.

    For every name in the global ``genres`` list, the movies whose 'genres'
    list contains that name are selected and ``int(n_samples * share)`` of them
    are sampled, where ``share`` is that genre's fraction of all rows in ``df``.
    The per-genre samples are concatenated as-is, so a movie that carries
    several genres can appear more than once in the output.

    Args:
        n_samples: Target size used to scale each genre's sample.
        random_state: Seed passed to ``DataFrame.sample`` for every genre.

    Raises:
        ZeroDivisionError: If ``df`` has no rows while ``genres`` is non-empty.
        ValueError: If a genre's computed sample size exceeds the number of
            movies available for it (raised by ``DataFrame.sample``).
    """
    total_movies = df.shape[0]
    per_genre_samples = []
    for g in genres:
        selected_movies = df[df['genres'].apply(lambda x: g in x)]
        size = int(n_samples * (selected_movies.shape[0] / total_movies))
        # A zero-sized sample contributes no rows, so it is skipped outright.
        if size > 0:
            per_genre_samples.append(selected_movies.sample(size, random_state=random_state))

    # Concatenate once instead of re-copying the accumulated frame per genre.
    if per_genre_samples:
        validation_df = pd.concat(per_genre_samples)
    else:
        validation_df = pd.DataFrame(columns=df.columns)

    validation_df.reset_index(drop=True).to_csv('data/valid_data.csv', index=False)

In [5]:
# Reload the saved validation split and turn the stringified genre lists
# back into real Python lists.
valid_df = (
    pd.read_csv('data/valid_data.csv')
    .assign(genres=lambda frame: frame['genres'].apply(ast.literal_eval))
)
valid_df

Unnamed: 0,title,genres,year,description
0,Black Panther,"[action, adventure, sci-fi]",2017,"Thousands of years ago, five African tribes wa..."
1,Appleseed (Appurushîdo),"[action, animation, fantasy, sci-fi]",2004,"Deunan Knute, a young soldier and one of the G..."
2,Beverly Hills Cop,"[action, comedy, crime, drama]",1984,"Young, reckless, yet experienced Detroit Polic..."
3,Behind Enemy Lines II: Axis of Evil,"[action, thriller, war]",2006,The story is not linked to the first part of t...
4,Green Zone,"[action, drama, thriller, war]",2010,"On March 19, 2003, Iraqi Army General Mohammed..."
...,...,...,...,...
1911,Dust,"[drama, western]",2001,"A New York thief, a tough-as-nails hundred-yea..."
1912,"Apple Dumpling Gang Rides Again, The","[children, comedy, western]",1979,Amos Tucker (Conway) and Theodore Ogelvie (Kno...
1913,Slow West,"[action, thriller, western]",2015,"Jay Cavendish, a young Scotsman, travels to th..."
1914,"Outlaw Josey Wales, The","[action, adventure, drama, thriller, western]",1976,"Josey Wales, a Missouri farmer, is driven to r..."


In [6]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from sentence_transformers import SentenceTransformer
import logging
# Raise the root logger threshold so only ERROR and above are emitted.
logging.getLogger().setLevel(logging.ERROR)


class CustomEmbedder:
    """Adapter that exposes ``embed_documents`` / ``embed_query`` on top of an
    object providing ``encode(text, convert_to_tensor=False)``.

    The wrapped object's ``encode`` result must support ``.tolist()``.
    """

    def __init__(self, embedder) -> None:
        # The wrapped encoder; it is only ever used through ``encode``.
        self.embedder = embedder

    def get_embedding(self, text: str) -> list:
        """Encode a single text and return the result converted via ``tolist()``."""
        return self.embedder.encode(text, convert_to_tensor=False).tolist()

    def embed_documents(self, documents: list) -> list:
        """Encode each document separately, preserving input order."""
        vectors = []
        for document in documents:
            vectors.append(self.get_embedding(document))
        return vectors

    def embed_query(self, query: str) -> list:
        """Encode a query string; identical to ``get_embedding``."""
        return self.get_embedding(query)


def fixed_size_chunking(description: str, embedder: SentenceTransformer) -> list:
    """Split ``description`` with a ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, ``chunk_size=1024``
    and ``chunk_overlap=20``. ``embedder`` is not used here; it is accepted so
    the function can be called the same way as the other chunking functions.

    Returns:
        The list produced by ``split_text``.
    """
    splitter = CharacterTextSplitter(separator="\n", chunk_size=1024, chunk_overlap=20)
    return splitter.split_text(description)

def semantic_chunking(description: str, embedder: SentenceTransformer) -> list:
    """Split ``description`` with a ``SemanticChunker`` backed by ``embedder``.

    Returns:
        The list produced by ``split_text``.
    """
    # The chunker receives the CustomEmbedder wrapper rather than the raw model.
    chunker = SemanticChunker(CustomEmbedder(embedder))
    return chunker.split_text(description)

def recursive_chunking(description: str, embedder: SentenceTransformer) -> list:
    """Split ``description`` with a ``RecursiveCharacterTextSplitter``.

    The splitter uses ``chunk_size=1024``, ``chunk_overlap=20``, ``len`` as the
    length function and non-regex separators. ``embedder`` is not used here; it
    is accepted so the function can be called like the other chunking functions.

    Returns:
        The list produced by ``split_text``.
    """
    splitter_options = dict(
        chunk_size=1024,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(description)

def get_embedding(embedder: SentenceTransformer, text: str) -> list:
    """Encode ``text`` with ``embedder`` and return the result via ``tolist()``."""
    return embedder.encode(text, convert_to_tensor=False).tolist()

def insert_embeddings(df: pd.DataFrame, embedder: SentenceTransformer, title: str, descriptions: list) -> pd.DataFrame:
    """Return ``df`` extended with one row per entry of ``descriptions``.

    Each new row holds ``title``, the description chunk, and the chunk's
    embedding as returned by ``get_embedding``. ``df`` itself is not modified.

    All new rows are built first and concatenated in a single ``pd.concat``
    call, instead of re-copying the growing frame once per chunk. The result
    gets a fresh 0..n-1 index; the previous row-by-row concat gave every
    appended row the index label 0.

    Args:
        df: Frame to extend; expected to have 'title', 'description' and
            'embedding' columns.
        embedder: Encoder forwarded to ``get_embedding``.
        title: Title stored on every new row.
        descriptions: Text chunks to embed, in the order they should appear.

    Returns:
        A new DataFrame, or ``df`` unchanged when ``descriptions`` is empty.
    """
    chunks = list(descriptions)
    if not chunks:
        return df

    new_rows = pd.DataFrame({
        'title': [title] * len(chunks),
        'description': chunks,
        'embedding': [get_embedding(embedder=embedder, text=chunk) for chunk in chunks],
    })
    return pd.concat([df, new_rows], ignore_index=True)

def embed_movies(df: pd.DataFrame, embedder_name: str, splitter_name: str) -> None:
    """Chunk every movie description, embed each chunk, and save the result.

    The output is written to
    ``embeddings/{splitter_name}/{embedder_name}/embeddings.csv`` with the
    columns 'title', 'description' (the chunk text) and 'embedding'.

    Side effect: a 'description_chunks' column is added to (or overwritten on)
    the ``df`` passed in.

    Args:
        df: Frame with 'title' and 'description' columns.
        embedder_name: Model name handed to ``SentenceTransformer``.
        splitter_name: One of 'fixed-size-splitter', 'semantic-splitter' or
            'recursive-splitter'.

    Raises:
        KeyError: If ``splitter_name`` is not one of the supported names.
    """
    splitter_functions = {
        'fixed-size-splitter': fixed_size_chunking,
        'semantic-splitter': semantic_chunking,
        'recursive-splitter': recursive_chunking
    }
    # Resolve the splitter once, before the model is loaded, so an unknown
    # name fails immediately rather than after the model has been set up.
    split_description = splitter_functions[splitter_name]
    embedder = SentenceTransformer(embedder_name)

    df['description_chunks'] = df['description'].progress_apply(lambda x: split_description(x, embedder))

    empty_embeddings = pd.DataFrame({'title': [], 'description': [], 'embedding': []})

    # Embed each movie into its own small frame and concatenate once at the
    # end; growing one frame across all movies re-copies it on every step.
    per_movie_frames = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        movie_embeddings = insert_embeddings(
            empty_embeddings, embedder, row['title'], row['description_chunks']
        )
        if not movie_embeddings.empty:
            per_movie_frames.append(movie_embeddings)

    if per_movie_frames:
        embedding_df = pd.concat(per_movie_frames, ignore_index=True)
    else:
        embedding_df = empty_embeddings

    # makedirs creates the intermediate splitter directory as well.
    output_dir = f'embeddings/{splitter_name}/{embedder_name}'
    os.makedirs(output_dir, exist_ok=True)
    embedding_df.to_csv(f'{output_dir}/embeddings.csv', index=False)

In [64]:
# Fixed-size chunks encoded with all-MiniLM-L6-v2.
embed_movies(valid_df, embedder_name='all-MiniLM-L6-v2', splitter_name='fixed-size-splitter')

100%|██████████| 18136/18136 [36:29<00:00,  8.28it/s] 


In [73]:
# Fixed-size chunks encoded with gtr-t5-base.
embed_movies(valid_df, embedder_name='gtr-t5-base', splitter_name='fixed-size-splitter')

100%|██████████| 18136/18136 [3:13:12<00:00,  1.56it/s]  


In [5]:
# Fixed-size chunks encoded with bert-base-nli-mean-tokens.
embed_movies(valid_df, embedder_name='bert-base-nli-mean-tokens', splitter_name='fixed-size-splitter')

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

100%|██████████| 18136/18136 [1:39:29<00:00,  3.04it/s] 


In [16]:
# Semantic chunks encoded with all-MiniLM-L6-v2.
embed_movies(valid_df, embedder_name='all-MiniLM-L6-v2', splitter_name='semantic-splitter')

  0%|          | 0/1916 [00:00<?, ?it/s]

  0%|          | 0/1916 [00:00<?, ?it/s]

In [18]:
# Semantic chunks encoded with gtr-t5-base.
embed_movies(valid_df, embedder_name='gtr-t5-base', splitter_name='semantic-splitter')

  0%|          | 0/1916 [00:00<?, ?it/s]

  0%|          | 0/1916 [00:00<?, ?it/s]

In [20]:
# Semantic chunks encoded with bert-base-nli-mean-tokens.
embed_movies(valid_df, embedder_name='bert-base-nli-mean-tokens', splitter_name='semantic-splitter')

  0%|          | 0/1916 [00:00<?, ?it/s]

  0%|          | 0/1916 [00:00<?, ?it/s]

In [23]:
# Recursive chunks encoded with all-MiniLM-L6-v2.
embed_movies(valid_df, embedder_name='all-MiniLM-L6-v2', splitter_name='recursive-splitter')

  0%|          | 0/1916 [00:00<?, ?it/s]

  0%|          | 0/1916 [00:00<?, ?it/s]

In [25]:
# Recursive chunks encoded with gtr-t5-base.
embed_movies(valid_df, embedder_name='gtr-t5-base', splitter_name='recursive-splitter')

  0%|          | 0/1916 [00:00<?, ?it/s]

  0%|          | 0/1916 [00:00<?, ?it/s]

In [27]:
# Recursive chunks encoded with bert-base-nli-mean-tokens.
embed_movies(valid_df, embedder_name='bert-base-nli-mean-tokens', splitter_name='recursive-splitter')

  0%|          | 0/1916 [00:00<?, ?it/s]

  0%|          | 0/1916 [00:00<?, ?it/s]

In [61]:
# Pick which saved embeddings file to inspect.
#   embedders: 'all-MiniLM-L6-v2', 'gtr-t5-base', 'bert-base-nli-mean-tokens'
#   splitters: "fixed-size-splitter", "recursive-splitter", "semantic-splitter"
emb_name = 'bert-base-nli-mean-tokens'
chunking = "semantic-splitter"

path = f'embeddings/{chunking}/{emb_name}/embeddings.csv'

# The 'embedding' column is stored as a list literal; parse it back into lists.
emb = pd.read_csv(path).assign(
    embedding=lambda frame: frame['embedding'].apply(ast.literal_eval)
)
emb

Unnamed: 0,title,description,embedding,year
0,Black Panther,"'Thousands of years ago, five African tribes w...","[0.05258965864777565, 1.1091506481170654, -1.3...",2017
1,Black Panther,"'A firefight erupts, and Klaue attempts to fle...","[-0.4872041344642639, 0.5775945782661438, 0.94...",2017
2,Black Panther,"""In a mid-credits scene, T'Challa appears befo...","[0.2455192357301712, 0.720119833946228, 0.7539...",2017
3,Appleseed (Appurushîdo),"""Deunan Knute, a young soldier and one of the ...","[-0.270588219165802, 0.45870858430862427, -0.4...",2004
4,Appleseed (Appurushîdo),"""While there, Deunan joins the counter-terrori...","[-0.23201650381088257, 0.535647451877594, -0.7...",2004
...,...,...,...,...
4150,"Outlaw Josey Wales, The",'They include an old Cherokee man named Lone W...,"[-0.6129461526870728, 0.49364399909973145, 0.1...",1976
4151,"Outlaw Josey Wales, The",'I guess we all died a little in that damned w...,"[0.3415108621120453, 0.13879993557929993, 0.51...",1976
4152,"Ox-Bow Incident, The","""In Bridger's Wells, Nevada in 1885, two cowbo...","[0.12608852982521057, 0.5382971167564392, 0.02...",1943
4153,"Ox-Bow Incident, The","""His son, Gerald, comes with him. Soon, news a...","[0.04774336516857147, 0.6165558099746704, 0.06...",1943


In [62]:
def _normalize_genre(name: str) -> str:
    """Trim, lowercase, drop apostrophes, and turn hyphens/spaces into underscores."""
    return name.strip().lower().replace("'", "").replace("-", "_").replace(" ", "_")


def clean_genres(genres_list: list, allowed_genres=None):
    """Keep only the recognised genres from ``genres_list``.

    Every entry is split on '&' and each part is normalised (see
    ``_normalize_genre``). A part is kept when:

    * its normalised form is itself an allowed genre (the normalised form is
      returned), or
    * it normalises to the same string as an allowed genre, e.g. 'Sci-Fi'
      against an allowed 'sci-fi', or 'British Shows' against an allowed
      'british shows' (the allowed spelling is returned).

    Splitting happens before normalisation: previously spaces were turned into
    underscores first, so the parts of "a & b" became "a_" / "_b" and could
    never match, and allowed names containing spaces or hyphens were dropped.

    Args:
        genres_list: Raw genre strings.
        allowed_genres: Genre vocabulary to match against; defaults to the
            module-level ``genres`` list.

    Returns:
        The matched genre names, in input order.
    """
    allowed = genres if allowed_genres is None else allowed_genres
    allowed_set = set(allowed)

    # Normalised form -> allowed spelling; the first allowed entry wins on ties.
    allowed_by_normalized = {}
    for allowed_name in allowed:
        allowed_by_normalized.setdefault(_normalize_genre(allowed_name), allowed_name)

    cleaned_genres = []
    for entry in genres_list:
        for part in entry.split('&'):
            normalized = _normalize_genre(part)
            if normalized in allowed_set:
                cleaned_genres.append(normalized)
            elif normalized in allowed_by_normalized:
                cleaned_genres.append(allowed_by_normalized[normalized])
    return cleaned_genres

def make_list(genres_list):
    """Split a comma-joined genre string into a list of its pieces."""
    pieces = genres_list.split(',')
    return pieces

# Attach each chunk's movie genres by title. For a title that occurs several
# times in valid_df, the first occurrence's genres are used.
# A single title -> genres lookup replaces the previous per-title scans of both
# frames and the round trip through a comma-joined string.
# A title missing from valid_df maps to NaN, so clean_genres will still raise
# rather than silently produce an empty genre list.
genres_by_title = valid_df.drop_duplicates(subset='title').set_index('title')['genres']
emb['genres'] = emb['title'].map(genres_by_title).apply(clean_genres)