In [None]:
import sqlite3
import pandas as pd

In [None]:
# Read the code below and write your observation in the next cell

# Open the subtitles SQLite database and print the tables it contains.
# `conn` and `cursor` are reused by the cells that follow.
DB_PATH = r"C:\Users\k.udayasagar\Downloads\eng_subtitles_database.db"
TABLES_QUERY = "SELECT name FROM sqlite_master WHERE type='table'"

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
table_rows = cursor.execute(TABLES_QUERY).fetchall()
print(table_rows)

In [None]:
# Print one line per column of the `zipfiles` table.
# PRAGMA table_info yields one row per column; field 1 of each row is printed.
cursor.execute("PRAGMA table_info('zipfiles')")
cols = cursor.fetchall()
for column_info in cols:
    column_name = column_info[1]
    print(column_name)

In [None]:
# Load the first 25,000 rows of `zipfiles` into a DataFrame and preview them.
sample_query = "SELECT * FROM zipfiles LIMIT 25000"
df = pd.read_sql_query(sample_query, conn)
df.head()

In [None]:
import zipfile
import io

# Running tally of how many blobs decode_method has been asked to decode.
count = 0


def decode_method(binary_data):
    """Extract the first member of an in-memory ZIP archive as text.

    Args:
        binary_data: Raw bytes of a ZIP archive.

    Returns:
        The contents of the first entry listed in the archive, decoded as
        latin-1.

    Raises:
        IndexError: If the archive lists no members.

    Side effects:
        Increments the module-level ``count`` on every call.
    """
    global count
    count = count + 1

    archive = zipfile.ZipFile(io.BytesIO(binary_data), 'r')
    try:
        # Only the first listed member is read; any further members are ignored.
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    finally:
        archive.close()

    # latin-1 maps every possible byte to a character, so this decode never raises.
    return raw_bytes.decode('latin-1')

In [None]:
# Decode every zipped blob in `content` into subtitle text, then preview the frame.
df['file_content'] = df['content'].map(decode_method)

df.head()

In [None]:
# Keep only the columns needed downstream. The explicit .copy() makes `df` an
# independent frame, so adding columns to it in later cells does not trigger
# pandas' SettingWithCopyWarning.
df = df[['name', 'file_content']].copy()
df


In [None]:
import re

In [None]:
def preprocessing_data(text, n, zx):
    """Turn raw SRT subtitle text into one lowercase, ASCII-only line.

    Steps, in order: lowercase the text, drop every cue header (a serial-number
    line followed by a ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` line), strip
    ``<...>`` markup, replace each carriage return / newline with a space, and
    delete all non-ASCII characters.

    Args:
        text: Subtitle file content as a string.
        n: Accepted for backward compatibility; has no effect on the result.
        zx: Accepted for backward compatibility; never read.

    Returns:
        The cleaned text as a single-line string.

    Note:
        An earlier version attempted to drop the first ``n`` non-empty lines
        after the first blank line. Its filter checked whether each matched
        newline character was whitespace (always true), so no line was ever
        selected and nothing was removed, while the text was re-sliced once
        per newline. That dead step is gone; the output is unchanged.
    """
    lowered = text.lower()

    # Serial number line + timestamp line that introduce each subtitle cue.
    cue_header = r'\d+\s*?\r?\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\s*?\r?\n'
    cleaned_text = re.sub(cue_header, '', lowered)

    # Formatting tags such as <i>...</i>.
    cleaned_text = re.sub(r'<[^>]*>', '', cleaned_text)

    # Flatten to a single line; each line-break character becomes one space.
    cleaned_text = cleaned_text.replace('\r', ' ').replace('\n', ' ')

    # Drop anything outside the 7-bit ASCII range.
    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', cleaned_text)

    return cleaned_text


In [None]:
# Clean every subtitle text with preprocessing_data(text, 2, "abc"), then
# print the file names next to their cleaned text.
cleaning_args = (2, "abc")
df['cleaned_content'] = df['file_content'].apply(
    lambda raw_text: preprocessing_data(raw_text, *cleaning_args)
)

print(df[['name', 'cleaned_content']])

In [None]:
import pandas as pd

def generate_overlapping_chunks(text, window_size, overlap_size):
    """Split text into word windows that share words with their neighbours.

    Args:
        text: The text to split; words are separated on whitespace.
        window_size: Maximum number of words per chunk. Must be positive.
        overlap_size: Number of words each chunk shares with the previous one.
            Must be smaller than ``window_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        or whitespace-only text yields an empty list. The final chunk may be
        shorter than ``window_size``.

    Raises:
        ValueError: If ``window_size`` is not positive, or if ``overlap_size``
            is not smaller than ``window_size`` (the window would never
            advance, which previously looped forever).
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive number of words")
    step = window_size - overlap_size
    if step <= 0:
        raise ValueError("overlap_size must be smaller than window_size")

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        end = start + window_size
        chunks.append(' '.join(words[start:end]))
        # Once a window reaches the last word, any further window would only
        # repeat words already contained in this chunk, so stop here.
        if end >= len(words):
            break
    return chunks

# Build one row per chunk from df['cleaned_content'], keeping the source file name.

# Chunking parameters
window_size = 500  # words per chunk
overlap_size = 50  # words shared between consecutive chunks; adjust as needed

# Collect all records first and build the DataFrame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
chunk_records = [
    {'chunk': chunk, 'name': name}
    for text, name in zip(df['cleaned_content'], df['name'])
    for chunk in generate_overlapping_chunks(text, window_size, overlap_size)
]
chunked_df = pd.DataFrame(chunk_records, columns=['chunk', 'name'])

# Display the chunked DataFrame
chunked_df.head()



In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
# Sentence-embedding model used to vectorise the subtitle chunks.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Embed all chunks with a single batched encode() call instead of one call
# per row. Each row still receives its own 1-D NumPy vector.
chunk_texts = chunked_df['chunk'].tolist()
chunked_df['bert_vector'] = list(model.encode(chunk_texts))

In [None]:
import numpy as np
import chromadb

import uuid  # Import the uuid module to generate unique IDs
# Store every chunk embedding in ChromaDB, tagged with its source file name.
client = chromadb.HttpClient(host='localhost', port=8000)
# create_collection raises if "embedding_data" already exists, so re-running
# this cell fails fast instead of inserting duplicate vectors.
collection = client.create_collection(name="embedding_data")

# Rows sent per request; batching avoids one HTTP round trip per chunk.
BATCH_SIZE = 1000

chunk_names = chunked_df['name'].tolist()
chunk_vectors = chunked_df['bert_vector'].tolist()

for batch_start in range(0, len(chunk_names), BATCH_SIZE):
    batch_names = chunk_names[batch_start:batch_start + BATCH_SIZE]
    batch_vectors = chunk_vectors[batch_start:batch_start + BATCH_SIZE]

    collection.add(
        # A fresh random UUID identifies each chunk.
        ids=[str(uuid.uuid4()) for _ in batch_names],
        # Each stored vector is a NumPy ndarray; convert it to a plain list.
        embeddings=[vector.tolist() for vector in batch_vectors],
        metadatas=[{"name": name} for name in batch_names],
    )
