**IMPORT NECESSARY PACKAGES**

In [None]:
import pandas as pd

**MOUNTING THE DRIVE**

In [None]:
# Mount Google Drive at /content/drive so the files under MyDrive used below
# can be read and written like local files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**READING THE CSV FILE**

In [None]:
# Load the subtitle dataset from the mounted Drive into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/eng_movie_subtitles.csv')

In [None]:
# Preview the first five rows to check the columns that were loaded.
df.head(5)

Unnamed: 0.1,Unnamed: 0,name,file_content
0,0,Criminal Minds S10 E20 A Place At The Table,script info title default file scripttype v400...
1,1,Ghost In The Shell Stand Alone Complex S01 E09...,1 angels and demons were circling over my head...
2,2,Halo S01 E04 Homecoming,script info title default file scripttype v400...
3,3,Make My Day S01 E07 Episode 1 7,support us and become vip member to remove all...
4,4,Australian Survivor S04 E11 Episode 4 11,1 ericsson access services 2 and ben soon turn...


In [None]:
# Drop the extra 'Unnamed: 0.1' column shown by `df.head()` (it looks like a
# leftover row-number column) and give the remaining unnamed column a
# meaningful name. Building a new frame instead of mutating in place, and
# ignoring an already-missing column, keeps this cell safe to re-run.
df = (
    df
    .drop(columns=['Unnamed: 0.1'], errors='ignore')
    .rename(columns={'Unnamed: 0': 'index'})
)

In [None]:
# Show the column names of df after the clean-up above.
df.columns

Index(['index', 'name', 'file_content'], dtype='object')

**CHUNKING & SENTENCE TRANSFORMERS:**

To convert the text of the subtitle contents into vectors, I have used a BERT-based sentence transformer, since it contains an encoder that can understand the context and produce an embedding for each word.

Since the dataset is huge, there is a chance of missing a few words, so the data is split into multiple chunks and the embedding is performed on each chunk.

In [None]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m122.9/171.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [None]:
from sentence_transformers import SentenceTransformer

def generate_embeddings(texts, model=None, model_name='bert-base-nli-mean-tokens'):
    """Encode every text into an embedding vector.

    Parameters
    ----------
    texts : iterable of str
        The texts (for example document chunks) to embed.
    model : object with an ``encode`` method, optional
        An already loaded ``SentenceTransformer``. Passing one lets several
        calls share a single model instead of re-loading the weights each
        time. When omitted, the model named by ``model_name`` is loaded.
    model_name : str, optional
        Name of the sentence-transformers model to load when ``model`` is
        not supplied.

    Returns
    -------
    list
        One embedding per input text, in the same order as ``texts``.
    """
    texts = list(texts)
    if not texts:
        # Nothing to encode, so do not pay for loading the model.
        return []

    if model is None:
        model = SentenceTransformer(model_name)

    # Hand the whole list to a single encode() call so the model can batch the
    # work, then split the result back into one embedding per text.
    return list(model.encode(texts))

def document_chunker(data, chunk_size=500, overlap_size=50):
    """Split a sequence of strings into overlapping, space-joined chunks.

    Parameters
    ----------
    data : sequence of str
        The items to group. It must support ``len()`` and slicing.
    chunk_size : int, optional
        Maximum number of items joined into one chunk.
    overlap_size : int, optional
        Number of items shared between two consecutive chunks. It must be
        smaller than ``chunk_size`` so that every step moves forward.

    Returns
    -------
    list of str
        The chunks in order. Each chunk is the items of its window joined
        with a single space. Empty ``data`` yields an empty list.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive, or if ``overlap_size`` is not
        smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap_size >= chunk_size:
        raise ValueError("overlap_size must be smaller than chunk_size")

    step = chunk_size - overlap_size
    total = len(data)
    chunks = []
    for start_idx in range(0, total, step):
        end_idx = min(start_idx + chunk_size, total)
        chunks.append(' '.join(data[start_idx:end_idx]))
        if end_idx == total:
            # The window already reached the end of the data. Any further
            # chunk would only repeat items contained in this one.
            break
    return chunks

In [None]:
# One list entry per DataFrame row: the value of its 'file_content' column.
texts = df['file_content'].tolist()

In [None]:
# NOTE(review): `texts` holds one string per DataFrame row, so with the default
# chunk_size each chunk built here is up to 500 whole 'file_content' entries
# joined by spaces, not a 500-word window of a single subtitle file.
# Confirm this is the intended granularity.
chunked_texts = document_chunker(texts)

In [None]:
# Embed the chunks; generate_embeddings returns one embedding per chunk, in order.
chunk_embeddings = generate_embeddings(chunked_texts)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

**STORING THE VECTORS IN CHROMADB**

The embedding vectors are stored in ChromaDB, a database used for storing vectors.

In [None]:
pip install chromadb

Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl (525 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/525.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/525.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.110.2-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.9/91.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chr

**CREATING A COLLECTION IN CHROMADB AND ADDING THE EMBEDDED VECTORS INTO IT:**

In [None]:
import chromadb

In [None]:
# Create a ChromaDB client whose data is persisted under this Drive folder.
client = chromadb.PersistentClient(path="/content/drive/MyDrive/eng_subtitles")


In [None]:
# Fetch the "eng_subtitles" collection, creating it if it does not exist yet.
collection = client.get_or_create_collection(name="eng_subtitles")

In [None]:
# Embed every subtitle document itself so that the vector stored next to a
# document really describes that document. Picking
# `chunk_embeddings[index % len(chunk_embeddings)]` paired each row with the
# embedding of an unrelated chunk, which makes similarity search meaningless.
# NOTE(review): the embedding model presumably truncates inputs longer than its
# maximum sequence length, so long files are represented by their beginning
# only. Storing one entry per chunk of each file would cover the full text.
documents = df['file_content'].tolist()
document_embeddings = generate_embeddings(documents)
assert len(document_embeddings) == len(df), "expected exactly one embedding per row"

BATCH_SIZE = 500  # rows sent to ChromaDB per call; far fewer round trips than one row at a time

for start in range(0, len(df), BATCH_SIZE):
    stop = start + BATCH_SIZE
    batch = df.iloc[start:stop]

    # upsert (rather than add) overwrites entries with the same id, so
    # re-running this cell against the persistent collection replaces stale
    # vectors instead of keeping them.
    collection.upsert(
        ids=batch['index'].astype(str).tolist(),  # unique identifier for each document
        documents=batch['file_content'].tolist(),
        embeddings=[embedding.tolist() for embedding in document_embeddings[start:stop]],
        metadatas=[{'movie_name': movie_name} for movie_name in batch['name']],
    )

print("Insertion into ChromaDB collection complete")

Insertion into ChromaDB collection complete
