In [None]:



# Mount Google Drive at /content/drive so files under MyDrive can be read and
# written by the cells below (google.colab is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')
     
Mounted at /content/drive

import pandas as pd

# Load the cleaned subtitle dataset from the mounted Drive into a DataFrame.
# NOTE: `path` is rebound to the output directory in the saving cell further
# down, so it only refers to this input file until that cell runs.
path = r'/content/drive/MyDrive/cleaned_subtitles.parquet'
df = pd.read_parquet(path)
     

# Preview the first three rows (columns: num, name, content)
df.head(3)
     
num	name	content
0	9251120	maybe.this.time.(2014).eng.1cd	Watch any video online with OpenSUBTITLES Free...
1	9211589	down.the.shore.s01.e10.and.justice.for.all.(19...	Oh I know that its getting late but I dont wan...
2	9380845	uncontrollably.fond.s01.e07.heartache.(2016).e...	Timing and Subtitles by The Uncontrollable Lov...
Chunking Subtitle Text for Efficient Processing

To improve text processing and search optimization, we split long subtitle texts into smaller, manageable chunks. This helps in better indexing, semantic search, and retrieval of relevant information.

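For chunking, we use LangChain’s RecursiveCharacterTextSplitter, which splits on progressively smaller separators (paragraphs, then lines, then words) so that each chunk stays as coherent as possible. We use chunks of at most 512 characters with a 100-character overlap, so context carries across chunk boundaries.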


from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_documents_langchain(df, chunk_size=512, chunk_overlap=100):
    """Chunk each document with LangChain and return a DataFrame of chunks.

    Args:
        df: DataFrame with ``num`` (subtitle id), ``name`` (subtitle name)
            and ``content`` (subtitle text) columns.
        chunk_size: Maximum size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, forwarded to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        DataFrame with one row per chunk: ``chunk`` holds the chunk text and
        ``metadata`` holds a dict with ``original_index`` (index label of the
        source row in ``df``), ``subtitle_id`` and ``subtitle_name``. Rows
        whose ``content`` is not a string (e.g. None/NaN) produce no chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    chunks = []
    metadatas = []
    # Walk the needed columns in lockstep; this avoids df.iterrows(), which
    # builds a full Series object for every row.
    rows = zip(df.index, df["num"], df["name"], df["content"])
    for index, subtitle_id, subtitle_name, content in rows:
        if not isinstance(content, str):
            # Missing content (None/NaN) cannot be split; skip the row
            # instead of failing the whole run.
            continue
        doc_chunks = text_splitter.split_text(content)
        chunks.extend(doc_chunks)
        # Build a separate dict per chunk. `[meta] * n` would store the SAME
        # dict object n times, so editing one chunk's metadata later would
        # silently change every sibling chunk's metadata too.
        metadatas.extend(
            {
                "original_index": index,
                "subtitle_id": subtitle_id,
                "subtitle_name": subtitle_name,
            }
            for _ in doc_chunks
        )

    chunk_df = pd.DataFrame({"chunk": chunks, "metadata": metadatas})
    return chunk_df
     

# Example usage: `df` must provide the 'num', 'name' and 'content' columns
# that chunk_documents_langchain reads.
chunked_df = chunk_documents_langchain(df)

# Print the first three chunks together with their metadata:
print(chunked_df.head(3))
     
                                               chunk  \
0  Watch any video online with OpenSUBTITLES Free...   
1  on Keep dancing Whatever Im kidding Dont get m...   
2  And you Douche Handsome Conceited Just like yo...   

                                            metadata  
0  {'original_index': 0, 'subtitle_id': 9251120, ...  
1  {'original_index': 0, 'subtitle_id': 9251120, ...  
2  {'original_index': 0, 'subtitle_id': 9251120, ...  

# Display the first five chunk rows
chunked_df.head()
     
chunk	metadata
0	Watch any video online with OpenSUBTITLES Free...	{'original_index': 0, 'subtitle_id': 9251120, ...
1	on Keep dancing Whatever Im kidding Dont get m...	{'original_index': 0, 'subtitle_id': 9251120, ...
2	And you Douche Handsome Conceited Just like yo...	{'original_index': 0, 'subtitle_id': 9251120, ...
3	that so Yes How long will this program run If ...	{'original_index': 0, 'subtitle_id': 9251120, ...
4	her Gramps Uncle Erning Aunt Elma this is Tep ...	{'original_index': 0, 'subtitle_id': 9251120, ...
Saving Chunked Subtitle Data

After splitting the subtitle content into smaller chunks, we need to store the processed data efficiently for future use. To do this, we follow these steps:

1. Create a Directory for Storage

   - We define a directory path /content/drive/MyDrive/search_engine/files/ to store the processed files.
   - If the directory does not exist, it is created using os.makedirs().

2. Save the Chunked Data in Parquet Format

   - The chunked DataFrame is saved as subtitles_extracted.parquet using PyArrow for efficient storage.
   - The Parquet format ensures better compression and faster read/write speeds.

3. Handle Errors Gracefully

   - A try-except block ensures that any errors during saving are caught and displayed.

Once the file is successfully stored, it is ready for further processing, search indexing, and retrieval in the search engine pipeline.


import os

path = "/content/drive/MyDrive/search_engine/files/"

# Make sure the output directory is present, reporting which case applied
if os.path.exists(path):
    print(f"Directory already exists: {path}")
else:
    os.makedirs(path)
    print(f"Directory created: {path}")

# Write the chunked DataFrame to Parquet; a failure is reported, not raised
try:
    chunked_df.to_parquet(
        f"{path}subtitles_extracted.parquet",
        engine="pyarrow",
        index=False,
    )
    print("Successfully saved the dataset in:", path)
except Exception as e:
    print("Error:", e)
     
Directory already exists: /content/drive/MyDrive/search_engine/files/
Successfully saved the dataset in: /content/drive/MyDrive/search_engine/files/