### Import PDF Documents 

In [4]:
import os
import requests

import pypdfium2 # Needs to be at the top to avoid warnings
import argparse
import os
import json
import textwrap
import torch

import numpy as np 
import pandas as pd
from tqdm.auto import tqdm 
from nltk import sent_tokenize

from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.output import save_markdown

from sentence_transformers import util, SentenceTransformer

# Models for the Qdrant vector database.
from qdrant_client.models import PointStruct
from qdrant_client.models import Distance, VectorParams

# from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient


from langchain_community.chat_models import ChatOllama



In [5]:
# Pick the compute device: MPS when available, then CUDA, otherwise CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)


# Language codes of popular languages used in Africa.
languages_targets = [
    "en",  # English
    "fr",  # French
    "ar",  # Arabic
    "es",  # Spanish
    "zh",  # Chinese
    "sw",  # Swahili (widely spoken in East Africa)
    "ha",  # Hausa (widely spoken in West Africa)
    "am",  # Amharic (spoken in Ethiopia)
    "pt",  # Portuguese (spoken in Mozambique and Angola)
    "yo",  # Yoruba (spoken in Nigeria)
    "zu",  # Zulu (spoken in South Africa)
]

In [6]:
# Collect the paths of the files stored in the local PDF folder

# Paths
docs_path = "Pdf_path/"
output_path = "Structured_files/"

files_path = []

try:
    # Keep every regular file found directly inside docs_path
    files_path.extend(
        candidate
        for candidate in (os.path.join(docs_path, entry) for entry in os.listdir(docs_path))
        if os.path.isfile(candidate)
    )
except Exception as e:
    # Best effort: report the problem and leave files_path empty
    print(e)
    


In [7]:
# Show the file paths collected from `docs_path`.
print(files_path)

['Pdf_path/islamic_law_of_marriage_and_inheritance_in_kenya.pdf', 'Pdf_path/ndulo2011.pdf', 'Pdf_path/The_changing_philosophy_of_African_marri.pdf', 'Pdf_path/A+Reflection+on+the+African+Traditional+Values+of+Marriage+and+Sexuality.pdf', 'Pdf_path/fulltext.pdf', 'Pdf_path/taylor_and_francis_group_women_law_and_human_rights_in_southern_africa.pdf', 'Pdf_path/African Customary Law Customs and Womens Rights.pdf', 'Pdf_path/The_Future_of_African_Customary_Law_by_Fenrich_J_G.pdf', 'Pdf_path/harcoam,+1.+MJ+Mafela.pdf', 'Pdf_path/first-page-pdf.pdf']


In [8]:


# Load the marker models once; `model_lst` is passed to every convert_single_pdf call.
model_lst = load_all_models()



Loaded detection model vikp/surya_det3 on device mps with dtype torch.float16
Loaded detection model vikp/surya_layout3 on device mps with dtype torch.float16
Loaded reading order model vikp/surya_order on device mps with dtype torch.float16
Loaded recognition model vikp/surya_rec on device mps with dtype torch.float16
Loaded texify model to mps with torch.float16 dtype


In [6]:
# Disabled single-file conversion example, kept as a bare string literal so it
# is not executed (the cell only echoes the string).
""" 
fname = files_path[0]
full_text, images, out_meta = convert_single_pdf(fname, model_lst)

fname = os.path.basename(fname)
subfolder_path = save_markdown('marker-output', fname, full_text, images, out_meta)

print(f"Saved markdown to the {subfolder_path} folder")
"""

' \nfname = files_path[0]\nfull_text, images, out_meta = convert_single_pdf(fname, model_lst)\n\nfname = os.path.basename(fname)\nsubfolder_path = save_markdown(\'marker-output\', fname, full_text, images, out_meta)\n\nprint(f"Saved markdown to the {subfolder_path} folder")\n'

### Chunking our sentences together

In [9]:
def check_chunk_quality(structured_chunks, min_word_count=30, max_token_count=384):
    """
    Keeps only the chunks that satisfy both quality thresholds.

    Parameters:
    - structured_chunks: Iterable of chunk dictionaries exposing the
      'chunk_word_count' and 'chunk_token_count' keys.
    - min_word_count: Smallest word count a chunk may have (inclusive).
    - max_token_count: Largest token count a chunk may have (inclusive).

    Returns:
    - A new list holding the accepted chunks, in their original order.
    """
    def is_acceptable(candidate):
        # The word-count test runs first; the token count is only read when it passes.
        long_enough = candidate['chunk_word_count'] >= min_word_count
        return long_enough and candidate['chunk_token_count'] <= max_token_count

    return [candidate for candidate in structured_chunks if is_acceptable(candidate)]

In [10]:
def append_to_json(file_path, new_data):
    """
    Extends the JSON array stored at `file_path` with `new_data`.

    When the file is missing, it is created and holds `new_data` only.

    Parameters:
    - file_path: Location of the JSON file.
    - new_data: List of dictionaries to add after the stored entries.
    """
    # Start from what is already on disk, or from nothing
    stored_records = []
    if os.path.exists(file_path):
        with open(file_path, 'r') as source:
            stored_records = json.load(source)

    # Build the merged list before the file is reopened (and truncated) for writing
    merged_records = stored_records + new_data

    with open(file_path, 'w') as sink:
        json.dump(merged_records, sink, indent=4)

In [11]:
# Configure logging (optional)
configure_logging()

# Input folder with the PDFs and output folder for the structured files
docs_path, output_dir = "Pdf_path/", "Structured_files"

# Chunk size budget: token limit converted to characters
token_char_ratio = 4  # Approximate characters per token
max_token_limit = 384
max_chunk_size = token_char_ratio * max_token_limit

# Base name of the JSON file that stores the chunks
base_json = "tundah"

In [10]:
# Chunks kept for the PDF currently being processed (reassigned for every file)
filtered_chunks = []
# All kept chunks, accumulated across every PDF processed by the loop below
chunks_final_state = []

# Ensure the output directory exists; exist_ok avoids the check-then-create race
os.makedirs(output_dir, exist_ok=True)

# Function to split text into chunks
def split_into_chunks(text, max_chunk_size):
    """
    Wraps `text` into pieces of at most `max_chunk_size` characters.

    Wrapping follows textwrap's default rules, so whitespace characters such
    as newlines are turned into spaces and breaks fall between words when possible.

    Returns:
    - List of strings (empty when `text` holds no content).
    """
    wrapper = textwrap.TextWrapper(width=max_chunk_size)
    return wrapper.wrap(text)

# All PDFs share one JSON file of chunks; its path does not depend on the file
# being processed, so it is built once.
json_output_path = os.path.join(output_dir, f"{base_json}_chunks.json")

# Traverse the folder and process each PDF file.
# NOTE: append_to_json extends whatever is already stored on disk, so running
# this cell again appends the same chunks a second time.
for file in os.listdir(docs_path):
    file_path = os.path.join(docs_path, file)
    # Skip sub-directories and non-PDF files (the extension test ignores case)
    if not (os.path.isfile(file_path) and file.lower().endswith('.pdf')):
        continue

    try:
        # Perform PDF extraction
        print(f"Processing file: {file_path}")
        full_text, images, out_meta = convert_single_pdf(file_path, model_lst)

        # Split full text into chunks
        chunks = split_into_chunks(full_text, max_chunk_size)

        # Prepare chunks with metadata.
        # NOTE(review): the recorded output shows 'N/A' for page_number;
        # confirm that out_meta actually exposes a 'page_number' key.
        structured_chunks = [
            {
                'page_number': out_meta.get('page_number', 'N/A'),
                'sentence_chunk': chunk,
                'chunk_char_count': len(chunk),
                'chunk_word_count': len(chunk.split()),
                # Estimated token count (characters / characters-per-token)
                'chunk_token_count': len(chunk) / token_char_ratio
            }
            for chunk in chunks
        ]

        # Check chunk quality
        filtered_chunks = check_chunk_quality(
            structured_chunks, min_word_count=30, max_token_count=max_token_limit
        )
        chunks_final_state += filtered_chunks

        # Save extracted content. splitext strips the extension whatever its
        # case, matching the case-insensitive filter above.
        fname = os.path.splitext(os.path.basename(file_path))[0]
        subfolder_path = save_markdown(output_dir, fname, full_text, images, out_meta)

        # Append to JSON file
        append_to_json(json_output_path, filtered_chunks)

        print(f"Saved markdown and JSON to the {subfolder_path} folder")

    except Exception as e:
        # Best effort: report the failure and continue with the next file
        print(f"Error processing file {file_path}: {e}")

print("Processing completed.")


Processing file: Pdf_path/islamic_law_of_marriage_and_inheritance_in_kenya.pdf


Detecting bboxes: 100%|██████████| 7/7 [00:05<00:00,  1.28it/s]
Detecting bboxes: 100%|██████████| 5/5 [00:04<00:00,  1.23it/s]
Finding reading order: 100%|██████████| 5/5 [00:11<00:00,  2.29s/it]


Saved markdown and JSON to the Structured_files/islamic_law_of_marriage_and_inheritance_in_kenya folder
Processing file: Pdf_path/ndulo2011.pdf


Detecting bboxes: 100%|██████████| 9/9 [00:06<00:00,  1.32it/s]
Detecting bboxes: 100%|██████████| 6/6 [00:05<00:00,  1.03it/s]
Finding reading order: 100%|██████████| 6/6 [00:15<00:00,  2.62s/it]


Saved markdown and JSON to the Structured_files/ndulo2011 folder
Processing file: Pdf_path/The_changing_philosophy_of_African_marri.pdf


Detecting bboxes: 100%|██████████| 3/3 [00:02<00:00,  1.36it/s]
Detecting bboxes: 100%|██████████| 2/2 [00:02<00:00,  1.13s/it]
Finding reading order: 100%|██████████| 2/2 [00:09<00:00,  4.71s/it]


Saved markdown and JSON to the Structured_files/The_changing_philosophy_of_African_marri folder
Processing file: Pdf_path/A+Reflection+on+the+African+Traditional+Values+of+Marriage+and+Sexuality.pdf


Detecting bboxes: 100%|██████████| 2/2 [00:01<00:00,  1.32it/s]
Detecting bboxes: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
Finding reading order: 100%|██████████| 2/2 [00:05<00:00,  2.72s/it]

Saved markdown and JSON to the Structured_files/A+Reflection+on+the+African+Traditional+Values+of+Marriage+and+Sexuality folder
Processing file: Pdf_path/fulltext.pdf



Detecting bboxes: 100%|██████████| 7/7 [00:04<00:00,  1.44it/s]
Detecting bboxes: 100%|██████████| 5/5 [00:04<00:00,  1.04it/s]
Finding reading order: 100%|██████████| 5/5 [00:11<00:00,  2.36s/it]


Saved markdown and JSON to the Structured_files/fulltext folder
Processing file: Pdf_path/taylor_and_francis_group_women_law_and_human_rights_in_southern_africa.pdf


Detecting bboxes: 100%|██████████| 4/4 [00:03<00:00,  1.33it/s]
Detecting bboxes: 100%|██████████| 3/3 [00:03<00:00,  1.01s/it]
Finding reading order: 100%|██████████| 3/3 [00:10<00:00,  3.43s/it]


Saved markdown and JSON to the Structured_files/taylor_and_francis_group_women_law_and_human_rights_in_southern_africa folder
Processing file: Pdf_path/African Customary Law Customs and Womens Rights.pdf


Detecting bboxes: 100%|██████████| 12/12 [00:08<00:00,  1.35it/s]
Detecting bboxes: 100%|██████████| 8/8 [00:09<00:00,  1.18s/it]
Finding reading order: 100%|██████████| 8/8 [00:48<00:00,  6.11s/it]


Saved markdown and JSON to the Structured_files/African Customary Law Customs and Womens Rights folder
Processing file: Pdf_path/The_Future_of_African_Customary_Law_by_Fenrich_J_G.pdf


Detecting bboxes: 100%|██████████| 2/2 [00:01<00:00,  1.32it/s]
Detecting bboxes: 100%|██████████| 2/2 [00:01<00:00,  1.15it/s]
Finding reading order: 100%|██████████| 2/2 [00:04<00:00,  2.32s/it]


Saved markdown and JSON to the Structured_files/The_Future_of_African_Customary_Law_by_Fenrich_J_G folder
Processing file: Pdf_path/harcoam,+1.+MJ+Mafela.pdf


Detecting bboxes: 100%|██████████| 2/2 [00:01<00:00,  1.20it/s]
Detecting bboxes: 100%|██████████| 2/2 [00:01<00:00,  1.01it/s]
Finding reading order: 100%|██████████| 2/2 [00:06<00:00,  3.00s/it]


Saved markdown and JSON to the Structured_files/harcoam,+1 folder
Processing file: Pdf_path/first-page-pdf.pdf


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  4.09it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  6.49it/s]
Finding reading order: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]

Saved markdown and JSON to the Structured_files/first-page-pdf folder
Processing completed.





In [19]:
# Inspect the first kept chunk and its metadata.
chunks_final_state[0]

{'page_number': 'N/A',
 'sentence_chunk': 'Journal of African Law, 65, 3 (2021), 377–401 © The Author(s), 2021. Published by Cambridge University Press on behalf of SOAS University of London. This is an Open Access article, distributed under the terms of the Creative Commons Attribution licence (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted reuse, distribution, and reproduction in any medium, provided the original work is properly cited.  doi:10.1017/S0021855321000346 First published online 1 October 2021  # The Islamic Law Of Marriage And Inheritance In Kenya  Jamil Ddamulira Mujuzi* University of the Western Cape, Bellville, South Africa E-mail: jdmujuzi@uwc.ac.za  ## Abstract  Article 24(4) of the Constitution of Kenya qualifies the right to equality "to the extent strictly necessary for the application of" Islamic law "in matters relating to personal status, marriage, divorce and inheritance". Section 3 of the Marriage Act provides that, although spouses

In [181]:
def load_json_to_dataframe(json_path):
    """
    Loads a JSON file and returns it both as a DataFrame and as raw data.

    Parameters:
    - json_path: Path to the JSON file.

    Returns:
    - df: DataFrame built from the parsed JSON content.
    - data: The parsed JSON content itself (whatever json.load produced).
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Convert the JSON data into a Pandas DataFrame
    df = pd.DataFrame(data)

    return df, data


In [15]:
# Create a DataFrame to get stats.
# load_json_to_dataframe returns both the DataFrame and the raw list of chunks;
# the raw list is reused below when computing embeddings.
output_json = os.path.join(output_dir, f"{base_json}_chunks.json")
df, chunks_array_over_min_token_len = load_json_to_dataframe(output_json)
df.describe().round(2)

Unnamed: 0,chunk_char_count,chunk_word_count,chunk_token_count
count,276.0,276.0,276.0
mean,1512.38,244.01,378.1
std,124.22,26.84,31.05
min,533.0,82.0,133.25
25%,1531.0,236.0,382.75
50%,1533.0,247.5,383.25
75%,1535.0,258.0,383.75
max,1536.0,281.0,384.0


### Embedding our text chunks

In [12]:
# CSV file in which the chunks and their embedding vectors are saved
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"

In [13]:
# Load the sentence-embedding model onto the device selected earlier.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device) # choose the device to load the model to

In [31]:
# Create embeddings one by one on the device the model was loaded to.
# Each chunk dictionary is modified in place: an "embedding" key is added.
for item in tqdm(chunks_array_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])
    


  0%|          | 0/276 [00:00<?, ?it/s]

In [14]:
# Create embedding vectors for Qdrant
def create_embeddings(embedding_model, chunks):
    """
    Encodes the 'sentence_chunk' text of every chunk, one chunk at a time.

    Parameters:
    - embedding_model: Object exposing an `encode(text)` method.
    - chunks: Iterable of dictionaries holding a 'sentence_chunk' key.

    Returns:
    - List with one encoded vector per chunk, in input order.
    """
    encoded = []
    for entry in chunks:
        encoded.append(embedding_model.encode(entry["sentence_chunk"]))
    return encoded

In [14]:
# Embed every kept PDF chunk for Qdrant. Despite its name, `payload` holds the
# embedding vectors; the chunk dictionaries themselves are used as payloads.
# Reuses create_embeddings instead of repeating its comprehension.
payload = create_embeddings(embedding_model, chunks_final_state)

In [32]:
# Save embeddings to file.
# NOTE: in the CSV each embedding ends up as the text form of its array
# (space-separated numbers inside brackets, as the preview of the reloaded
# file shows), not as a list of numbers.
text_chunks_and_embeddings_df = pd.DataFrame(chunks_array_over_min_token_len)

text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [19]:
# Import saved file and view the first rows.
# NOTE(review): no converter is given for the `embedding` column, so it is
# presumably read back as text — confirm before using it as vectors.
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,,"Journal of African Law, 65, 3 (2021), 377–401 ...",1526,227,381.5,[ 3.58435959e-02 2.01791208e-02 -1.26003334e-...
1,,"inheritance, Kadhi's courts, freedom of testat...",1532,252,383.0,[ 1.19352154e-02 -1.17024928e-02 -6.72595855e-...
2,,"some human rights issues, especially the right...",1535,253,383.75,[ 3.26480567e-02 -1.76190734e-02 -4.02760692e-...
3,,jurisdiction of a Kadhi's court shall be limit...,1534,254,383.5,[ 2.04174574e-02 -5.64446859e-02 8.79021175e-...
4,,Muslim woman is completely prohibited from mar...,1530,257,382.5,[ 5.60534894e-02 3.24488827e-03 -9.06689372e-...


In [None]:
# # Get ours vectors
# import ast
# # embedding = text_chunks_and_embedding_df_load['embedding'].apply(lambda x: ast.literal_eval(x.replace('\n', '')))
# embedding = [ ast.literal_eval(elt) for elt in text_chunks_and_embedding_df_load["embedding"]]

# embedding

### Qdrant Vector DB

In [15]:
%%capture
%pip install --upgrade --quiet  qdrant-client langchain

In [20]:
# Initialize the Qdrant client (expects a Qdrant server on localhost:6333)
client = QdrantClient(url="http://localhost:6333")

# Name of the collection that stores the chunk vectors
collection_name="server_documents"

# Create or recreate the collection
# client.recreate_collection(
#     collection_name="server_documents",
#     vectors_config=VectorParams(size=768, distance=Distance.DOT),
# )

# Create the collection only when it does not exist yet.
# size=768 must match the embedding dimension — presumably that of
# all-mpnet-base-v2; TODO confirm.
if not client.collection_exists(collection_name=collection_name):
    print(f"Collection '{collection_name}' does not exist. Creating it now.")
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=768, distance=Distance.DOT),
    )


AttributeError: module 'httpx' has no attribute 'Limits'

In [15]:
def insert_to_qdriant(vectors, meta_data, collection_name="server_documents", batch_size=64):
    """
    Upserts embedding vectors and their metadata into a Qdrant collection.

    Every point receives a deterministic UUID derived from its payload.
    Positional ids restarted at 0 on every call, so a second batch (for
    example the video chunks) overwrote the points of the first one (the PDF
    chunks). With payload-derived ids, different chunks never collide and
    re-inserting the same chunk simply overwrites itself.

    Parameters:
    - vectors: Sequence of embedding vectors, aligned with `meta_data`.
    - meta_data: Sequence of payload dictionaries, one per vector.
    - collection_name: Target collection (defaults to "server_documents").
    - batch_size: Number of points sent per upsert request (must be >= 1).

    Raises:
    - ValueError: if `batch_size` is smaller than 1.

    Uses the module-level `client`.
    """
    import uuid

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    points = []
    for vector, payload in zip(vectors, meta_data):
        # Stable id: UUIDv5 of the canonical JSON form of the payload
        payload_key = json.dumps(payload, sort_keys=True, default=str)
        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, payload_key))
        # Arrays exposing tolist() are converted; plain lists pass through unchanged
        vector_values = vector.tolist() if hasattr(vector, "tolist") else vector
        points.append(PointStruct(id=point_id, vector=vector_values, payload=payload))

    # Send the points in batches instead of one request per point
    for start in range(0, len(points), batch_size):
        client.upsert(
            collection_name=collection_name,
            wait=True,
            points=points[start:start + batch_size],
        )

In [26]:
# Insert the PDF chunk vectors (`payload`) together with their metadata.
# Reuses insert_to_qdriant instead of repeating its upsert loop inline.
insert_to_qdriant(payload, chunks_final_state)



  client.recreate_collection(


### Similarity search / Semantic Search 

In [16]:
# Alternative example queries (uncomment one to try it):
# query = "What are the traditional steps involved in a Luo customary marriage?" 
# query = "What are the traditional steps involved in Zimbabwe customary marriage?"
query = "What is the role of the family in traditional marriage in Wassoulou?"
# Embedding of the query; search_documents below encodes the query again itself.
query_vector = embedding_model.encode(query)



In [17]:
def search_documents(client, query, embedding_model, collection_name="server_documents", limit=3):
    """
    Runs a vector similarity search for `query`.

    Parameters:
    - client: Object exposing a `search(...)` method (the Qdrant client here).
    - query: Text of the question to search for.
    - embedding_model: Object exposing `encode(text)`, used to embed the query.
    - collection_name: Collection to search (defaults to "server_documents").
    - limit: Maximum number of hits to return (defaults to 3).

    Returns:
    - Whatever `client.search` returns; hits are requested with both their
      vectors and their payloads.
    """
    query_vector = embedding_model.encode(query)
    return client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        with_vectors=True,
        with_payload=True,
        limit=limit,
    )

# Retrieve the hits most similar to `query`; `context` holds the raw search results.
context = search_documents(client, query, embedding_model)

NameError: name 'client' is not defined

In [253]:

def get_context(context: list) -> list[str]:
    """
    Pulls the chunk text out of each search hit.

    Parameters:
    - context: Search hits; each one exposes a `payload` mapping that holds
      a "sentence_chunk" entry.

    Returns:
    - The "sentence_chunk" values, in hit order.
    """
    texts = []
    for hit in context:
        texts.append(hit.payload["sentence_chunk"])
    return texts

# Display the chunk texts of the retrieved hits.
get_context(context)

["an affair between a man and a woman.  Marriage involves the whole extended family and to some African communities, the village. Both the man and the woman must obtain approval from the entire family. For example, a young man would not personally approach his prospective father-in-law and ask for the daughter's hand in marriage. The young man would instead make his intentions known to the elders in his family, and they would approach the family of the young woman. Weinrich (1983:48) argues that … [a]lthough young people today try to choose their own spouses, their marriage remains the concern of their respective families and hardly any spouse dares to marry without their families' approval. They may exchange love tokens without informing their elders, but before finalizing their choice, they seek the consent of their families. The families' involvement in a marriage is still one of the strongest factors making for stability.  Mathu (1971:1) bases the definition of marriage on the inte

### Augmenting our prompt with context items

In [172]:
def get_prompt(query: str, context: list[str]) -> str:
    """
    Builds the full prompt sent to the LLM.

    Parameters:
    - query: The user's question, inserted after "Question:".
    - context: Retrieved chunk texts, inserted after "Context:". The value is
      interpolated as-is, so a list appears in its Python list form (the
      newline-joined variant below is commented out).

    Returns:
    - The prompt string: system instructions, context, guidelines, question.
    """
    # context_str = '\n'.join(context)
    
    return f"""
    SYSTEM: You are an expert on African customary marriage laws. Answer the following questions about marriage practices, annulments, and implications of specific customs across different African tribes.

    Use the following pieces of context to answer the question at the end. Think step-by-step, and then answer. 

    Do not try to make up an answer:
    - If the context can help determine the answer, use it to form your response directly without introductory phrases like "I can determine the answer to that based on the provided context."
    - If the context can help determine the answer, use it to form your response. However, if the context is not useful, say "I cannot determine the answer to that."
    - If the context is empty, just say "I do not know the answer to that."
    - If explicitly requested or if you cannot provide a clear answer without explanation, then include the reasoning.

   
    ==================
    Context: {context}
    ==================

    ### Guidelines:
    1. **Condition Check**: Ensure that each response is accurate, culturally sensitive, and aligns with the specific customary practices of the tribe mentioned.
    2. **Step-by-Step Solution**: Before concluding, think through the cultural, legal, and social aspects of the question, and consider the potential impact on the families and communities involved. 

    Question: {query}
    Helpful Answer:"""


In [145]:
# def get_prompt(query: str, context: list[str]) -> str:
#     # context = '\n'.join(context)
    
#     # print("Display thr context :", context)
    

#     return f"""SYSTEM: You are an intelligent expert on African customary marriage helping the users with their questions.
            
#         Use the following pieces of context to answer the question at the end. Think step-by-step and then answer.
            
#         Do not try to make up an answer:
#          - If the context can help determine the answer, use it to form your response. However, if the context is not useful, say "I cannot determine the answer to that."
#          -If the answer to the question cannot be determined from the context alone, say "I cannot determine the answer to that."
#          - If the context is empty, just say "I do not know the answerQ to that.
#          "
        
#         ==================
#         Context: {context}
#         ==================
        
#         Question: {query}
#         Helpful Answer:"""

In [255]:
def infer():
    """
    Answers the module-level `query` with the local Ollama chat model.

    The prompt is built from `query` and the chunk texts of the module-level
    `context` search hits; the query and the model's answer are printed.
    """
    # The model tag must match a model available in the local Ollama install
    # (alternative: "mistral")
    model_name = "llama2:7b-chat-q4_0"
    sampling_temperature = 0.3
    chat_model = ChatOllama(model=model_name, temperature=sampling_temperature)

    # Single user message carrying the whole augmented prompt
    messages = [
        {"role": "user", "content": get_prompt(query=query, context=get_context(context))}
    ]

    # Make a prediction
    reply = chat_model.invoke(messages)

    # Separator widths are derived from the query length
    short_rule = "-" * (4 + len(query))
    long_rule = "-" * (4 + len(query) * 4)
    print("\n", short_rule, "\n")
    print(f"QUERY: {query}")
    print("\n", long_rule, "\n")
    print(f"Answer: {reply.content}")

In [256]:
# Run retrieval-augmented generation for the current `query` and `context`.
infer()


 ------------------------------------------------------------------------ 

QUERY: What is the role of the family in traditional marriage in Wassoulou?

 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 

Answer: The role of the family in traditional marriage in Wassoulou is significant and plays a crucial part in the process. In Wassoulou culture, marriage unites two families and not only two individuals. The woman is given to the groom's family, rather than to the groom himself. The ceremonies typically extend over three days, with the main rituals taking place on the second day. On the first evening, young girls and women dance to traditional sounds and music in the bride's family, surrounded by these tents. The next day, the rituals begin with the bride rec

In [176]:
# Run the pipeline again; the answer depends on the `query` and `context` currently defined.
infer()


 --------------------------------------------------------------------------- 

QUERY: What are the traditional steps involved in Zimbabwe customary marriage?

 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ 

Answer: 
Thank you for providing the context! Based on the information provided, here are the traditional steps involved in Zimbabwe customary marriage:

1. **Kukumbira**: The suitor approaches the father of the girl (tezvara) through an intermediator (munyai), who is usually a well-respected old man in the community. The suitor must agree to marry the girl with her informed consent.
2. **Lobola**: The bridegroom's family pays lobola (bride wealth) to the bride's family as a symbol of goodwill and to demonstrate their commitment to the marria

### Loading an LLM locally (**Ollama**)



In [174]:
# Define the model. Ensure this name matches a model you have locally.
local_model = "llama2:7b-chat-q4_0"  

# local_model = "mistral"
# `llm` is also the model used by the Streamlit conversation chain further down.
llm = ChatOllama(model=local_model)

# Example input in the format expected by the model
input_data =  [
    {"role": "user", "content": "What is the capital of France?"}
]

# Make a prediction and display the text of the answer
response = llm.invoke(input_data)
response.content

'\nThe capital of France is Paris.'

### Obtaining transcripts from YouTube videos

- Load the JSON file of video links
- Get the corresponding transcriptions
- Split them into chunks
- Embed the chunks
- Store them in Qdrant


In [243]:
from youtube_transcript_api import YouTubeTranscriptApi,  NoTranscriptFound
from deep_translator import GoogleTranslator
from pytube import YouTube

def get_transcription(video_id, target_language):
    """
    Fetches a video's transcript and translates it into `target_language`.

    Parameters:
    - video_id: YouTube video identifier.
    - target_language: Language code passed to GoogleTranslator as target.

    Returns:
    - The translated transcript as one string, or None when no transcript is
      found in any of the languages listed in `languages_targets`.
    """
    # GoogleTranslator limits the size of a single request, so longer texts
    # are translated piece by piece.
    max_chars = 4998
    translator = GoogleTranslator(source='auto', target=target_language)

    try:
        # Get the transcript using the YouTubeTranscriptApi
        entries = YouTubeTranscriptApi.get_transcript(video_id, languages=languages_targets)

        # Combine the transcript text
        transcript_text = " ".join(entry['text'] for entry in entries)

        # Short transcripts fit in a single translation request
        if len(transcript_text) < max_chars:
            return translator.translate(transcript_text)

        # Otherwise: split, translate each piece, and join the results
        translated_parts = []
        for part in split_into_chunks(transcript_text, max_chars):
            translated_parts.append(translator.translate(part))
        return " ".join(translated_parts)
    except NoTranscriptFound:
        print(f"No transcript found for video ID: {video_id}")
        return None

# Example usage: fetch one video's transcript, translated to English.
# Other video ids that can be tried: "dQw4w9WgXcQ", "UG6y-k59Hps"
translated_text =  get_transcription("UxaQRKVeWnY", 'en')
print(translated_text)


I don't say all in common except at the mit [Music] in the water everyone welcome to all on my youtube channel so we turn today for a new video in which he decided to explain to you the process of traditional marriage in Africa precisely in my country unless the democratic republic of congo in kinshasa does that in africa so I explained to you the dowry in relation to the ceremonies kg eu to attend I want to clarify I am not an expert on dowries I just want to share my experience with you my experience I attended several to several ceremonies doubts when I was very young which means that I still have enough notions in relation to that which is also it is a ceremony that will be special the region of my mother my mother who is from the handling more precisely who is from the mouret tribe guy that is who makes that I have more attend words the judge side blu ray gackou on the side of my father who is my baby but I still know some certain principles of dumas the traditional my baby that I

In [244]:
def _extract_video_id(video_url):
    """Returns the YouTube video id contained in `video_url`."""
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(video_url)

    # Standard watch URLs: take the `v` query parameter only, ignoring any
    # other parameter (&t=..., &list=...) that follows it.
    ids_in_query = parse_qs(parsed.query).get("v")
    if ids_in_query:
        return ids_in_query[0]

    segments = [segment for segment in parsed.path.split("/") if segment]
    # Short links: https://youtu.be/<id>
    if parsed.netloc.lower().endswith("youtu.be") and segments:
        return segments[0]
    # Path-based links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    # Fallback: previous behaviour (also covers a bare video id)
    return video_url.split("v=")[-1]


def transcriptions_shunk_list(json_video_path):
    """
    Builds the list of transcript chunks for the videos listed in a JSON file.

    The JSON file must hold a top-level "videos" list whose entries have a
    "url" and a "title". Each transcript is fetched and translated to English
    with get_transcription, then split with split_into_chunks using the
    module-level `max_chunk_size`. Videos without a transcript are skipped.

    Parameters:
    - json_video_path: Path to the JSON file.

    Returns:
    - transcription_chunks: List of chunk dictionaries with the keys
      'video_id', 'title', 'sentence_chunk', 'chunk_char_count',
      'chunk_word_count' and 'chunk_token_count'.
    """
    target_language = 'en'
    translator = GoogleTranslator(source='auto', target=target_language)

    # Get array of YouTube videos list
    _, data = load_json_to_dataframe(json_video_path)

    # Prepare chunks with metadata
    transcription_chunks = []

    # Get the transcript for each video
    for video in data['videos']:

        # Extract the video ID from the URL
        video_id = _extract_video_id(video['url'])

        # Get the full transcript for a single video
        translated_text = get_transcription(video_id, target_language=target_language)

        if translated_text is None:
            continue  # Skip this video if no transcript is found

        print(video['title'], translated_text)

        # Translate the title once per video rather than once per chunk
        translated_title = translator.translate(video['title'])

        # Chunk the transcription
        for chunk in split_into_chunks(translated_text, max_chunk_size):
            transcription_chunks.append({
                'video_id': video_id,
                'title': translated_title,
                'sentence_chunk': chunk,
                'chunk_char_count': len(chunk),
                'chunk_word_count': len(chunk.split()),
                # Estimated token count (characters / characters-per-token)
                'chunk_token_count': len(chunk) / token_char_ratio
            })

    return transcription_chunks

# Build the transcript chunks for every video listed in the JSON file.
json_video_path = "./Transcript_path/Videos.json"
videos_shunk_list = transcriptions_shunk_list(json_video_path)

Le mariage traditionnel dans le Wassoulou par le Programme 'Marketing Territorial Mali' the wasouou marriage is rich in Conie most of the traditional marriage rituals are intended for the woman to remember her great responsibility in the [Music] foyeribala one of the many villages of wasouou belongs to the commune of deorolé in the circle of yfolida the marriages they are generally celebrated in a traditional way for this specific marriage the groom chose to celebrate a civil marriage at the town hall of sucorolé a rarity in the locality [Music] [Music] after the town hall the traditional ritual finally begins in moribala as in many others villages marriage unites two families and not only two individuals the woman is given not to a man but to his family the ceremonies extend over 3 days the main rituals take place on the 2nd day the first evening is dedicated to music in the family of the bride a party where young girls and women dance to traditional sounds and music from the [Music] 

In [245]:
# Display the transcript chunks (this prints the whole list).
videos_shunk_list

[{'video_id': 'UG6y-k59Hps',
  'title': "Traditional marriage in Wassoulou by the 'Territorial Marketing Mali' Program",
  'sentence_chunk': "the wasouou marriage is rich in Conie most of the traditional marriage rituals are intended for the woman to remember her great responsibility in the [Music] foyeribala one of the many villages of wasouou belongs to the commune of deorolé in the circle of yfolida the marriages they are generally celebrated in a traditional way for this specific marriage the groom chose to celebrate a civil marriage at the town hall of sucorolé a rarity in the locality [Music] [Music] after the town hall the traditional ritual finally begins in moribala as in many others villages marriage unites two families and not only two individuals the woman is given not to a man but to his family the ceremonies extend over 3 days the main rituals take place on the 2nd day the first evening is dedicated to music in the family of the bride a party where young girls and women d

In [249]:
# Embed every transcript chunk.
videos_vectors = create_embeddings(embedding_model, videos_shunk_list)

In [250]:
# Store the transcript vectors in Qdrant, with the chunk dictionaries as payloads.
insert_to_qdriant(videos_vectors, videos_shunk_list)

### Using Streamlit for the chat interface

In [3]:
import streamlit as st
import streamlit.components.v1 as components

from typing import Literal

from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationSummaryMemory
from langchain.callbacks import get_openai_callback



class Message:
    """Class for keeping track of a chat message."""
    # Who wrote the message; the app creates and compares "human" / "ai".
    origin: Literal["human", "ai"]
    # Text displayed in the chat bubble.
    message: str

    def __init__(self, origin: Literal["human", "ai"], message: str) -> None:
        # Annotations alone do not create an initializer, so without this
        # method `Message("human", text)` raised TypeError.
        self.origin = origin
        self.message = message

    def __repr__(self) -> str:
        return f"Message(origin={self.origin!r}, message={self.message!r})"
    
def load_css():
    """Reads static/styles.css and injects it into the page inside a <style> element."""
    with open("static/styles.css", "r") as stylesheet:
        rules = stylesheet.read()
    st.markdown(f"<style>{rules}</style>", unsafe_allow_html=True)
        
def initialize_session_state():
    """
    Creates the session entries used by the chat UI when they are missing.

    - history: list of chat messages, starts empty.
    - token_count: running token total, starts at 0.
    - conversation: ConversationChain over the module-level `llm`, with a
      ConversationSummaryMemory built on the same `llm`.

    Entries that already exist are left untouched.
    """
    state = st.session_state

    if "history" not in state:
        state.history = []
    if "token_count" not in state:
        state.token_count = 0

    if "conversation" in state:
        return
    summary_memory = ConversationSummaryMemory(llm=llm)
    state.conversation = ConversationChain(llm=llm, memory=summary_memory)
        
def on_click_callback():
    """Submit handler: run the prompt through the chain and record the turn.

    Reads the text stored under the ``human_prompt`` session key, sends it to
    ``st.session_state.conversation``, appends the human and AI messages to
    ``history`` and adds the callback's ``total_tokens`` to ``token_count``.
    """
    state = st.session_state
    # NOTE(review): get_openai_callback is OpenAI-specific — confirm it reports
    # usage for whichever `llm` backs the conversation chain.
    with get_openai_callback() as usage:
        question = state.human_prompt
        answer = state.conversation.run(question)
        # Human message first, then the reply, so the transcript stays ordered.
        for origin, text in (("human", question), ("ai", answer)):
            state.history.append(Message(origin, text))
        state.token_count += usage.total_tokens

# Page setup: inject the custom CSS, then make sure the session-state keys
# exist before any code below reads them.
load_css()
initialize_session_state()

st.title("Hello Custom CSS Chatbot 🤖")

# Layout slots, created in this order: chat transcript container, input form,
# and an empty slot for the footer caption.
chat_placeholder = st.container()
prompt_placeholder = st.form("chat-form")
# Despite its name, this slot is only used for the token-usage / debug caption.
credit_card_placeholder = st.empty()

import html  # stdlib; imported here so this rendering step is self-contained

# Render the transcript: one HTML row per message, AI on the left and the
# human on the right (via the `row-reverse` class).
with chat_placeholder:
    for chat in st.session_state.history:
        from_ai = chat.origin == 'ai'
        row_class = '' if from_ai else 'row-reverse'
        icon_file = 'ai_icon.png' if from_ai else 'user_icon.png'
        bubble_class = 'ai-bubble' if from_ai else 'human-bubble'
        # The markup is rendered with unsafe_allow_html=True, so the message
        # text (typed by the user or produced by the model) must be escaped;
        # otherwise any `<`, `>` or `&` in it is interpreted as HTML.
        safe_message = html.escape(str(chat.message))
        div = f"""
<div class="chat-row 
    {row_class}">
    <img class="chat-icon" src="app/static/{icon_file}"
         width=32 height=32>
    <div class="chat-bubble
    {bubble_class}">
        &#8203;{safe_message}
    </div>
</div>
        """
        st.markdown(div, unsafe_allow_html=True)

    # Three empty markdown elements act as vertical spacing below the transcript.
    for _ in range(3):
        st.markdown("")

# Input form: a wide text box next to a narrow submit button (6:1 columns).
with prompt_placeholder:
    st.markdown("**Chat**")
    cols = st.columns((6, 1))
    # The typed text is stored in session state under "human_prompt", which is
    # the key on_click_callback reads. The label is hidden but still supplied.
    cols[0].text_input(
        "Chat",
        value="Hello bot",
        label_visibility="collapsed",
        key="human_prompt",
    )
    # Submitting the form triggers on_click_callback.
    cols[1].form_submit_button(
        "Submit", 
        type="primary", 
        on_click=on_click_callback, 
    )

# Footer caption: cumulative token count plus the conversation memory buffer,
# shown for debugging.
credit_card_placeholder.caption(f"""
Used {st.session_state.token_count} tokens \n
Debug Langchain conversation: 
{st.session_state.conversation.memory.buffer}
""")

# Zero-size HTML component whose script reaches into the parent document,
# collects the elements matching '.stButton > button', picks the one whose
# text is 'Submit', and clicks it whenever the Enter key is pressed.
# NOTE(review): the button is looked up once when the script runs; if no match
# is found, `submitButton` is undefined and the click call would throw — verify
# the selector against the Streamlit version in use.
# NOTE(review): presumably each rerun of the script registers another keydown
# listener on the parent document — confirm whether that causes repeat clicks.
components.html("""
<script>
const streamlitDoc = window.parent.document;

const buttons = Array.from(
    streamlitDoc.querySelectorAll('.stButton > button')
);
const submitButton = buttons.find(
    el => el.innerText === 'Submit'
);

streamlitDoc.addEventListener('keydown', function(e) {
    switch (e.key) {
        case 'Enter':
            submitButton.click();
            break;
    }
});
</script>
""", 
    height=0,
    width=0,
)

2024-08-19 14:43:26.719 
  command:

    streamlit run /Users/omer/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-08-19 14:43:26.719 Session state does not function when running a script without `streamlit run`


NameError: name 'llm' is not defined

### Chunk manager class


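- **Parent-Child Chunking**: In this approach, the text is first split into larger parent chunks, and then each parent chunk is further split into smaller child chunks. In the general technique, metadata is maintained to preserve the hierarchical relationship; note that `ChunkHandler.parent_child_chunking` below returns only a flat list of child chunks.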

In [None]:
class ChunkHandler:
    """Stateless collection of sentence-based text-chunking helpers.

    Every method is a ``@staticmethod``; call them on the class directly.
    Sentence splitting is delegated to ``sent_tokenize``.
    """

    @staticmethod
    def recursive_structure_aware_chunking(text, max_chunk_size):
        """
        Greedily packs consecutive sentences into chunks of bounded length.

        Despite the name, this is a single iterative pass: sentences are
        appended to the current chunk (joined by one space) until adding the
        next one would push the chunk past ``max_chunk_size`` characters.

        Args:
            text: The text to split.
            max_chunk_size: Maximum chunk length in characters, joining
                spaces included.

        Returns:
            A list of non-empty chunk strings. A single sentence longer than
            ``max_chunk_size`` is kept whole as its own (oversized) chunk.
        """
        chunks = []
        pending = []      # sentences of the chunk under construction
        pending_len = 0   # == len(" ".join(pending))

        for sentence in sent_tokenize(text):
            # Length the chunk would have after adding this sentence,
            # counting the separating space when the chunk is not empty.
            if pending:
                joined_len = pending_len + 1 + len(sentence)
            else:
                joined_len = len(sentence)

            # Only close a chunk that actually holds something; this avoids
            # emitting an empty chunk when the first sentence is oversized.
            if pending and joined_len > max_chunk_size:
                chunks.append(" ".join(pending).strip())
                pending = [sentence]
                pending_len = len(sentence)
            else:
                pending.append(sentence)
                pending_len = joined_len

        # Add the last chunk
        if pending:
            chunks.append(" ".join(pending).strip())

        # Drop anything that collapsed to an empty string after stripping.
        return [chunk for chunk in chunks if chunk]

    @staticmethod
    def sentence_window_parsing(text, window_size=3):
        """
        Splits text into overlapping windows of consecutive sentences.

        Each window holds ``window_size`` sentences joined by a space and
        starts one sentence after the previous window.

        Args:
            text: The text to split.
            window_size: Number of sentences per window; must be >= 1.

        Returns:
            A list of window strings. If the text has fewer sentences than
            ``window_size``, all of them are returned as one chunk rather
            than being dropped. Empty input yields an empty list.

        Raises:
            ValueError: If ``window_size`` is smaller than 1.
        """
        if window_size < 1:
            raise ValueError("window_size must be at least 1")

        sentences = sent_tokenize(text)
        if not sentences:
            return []
        if len(sentences) < window_size:
            # Too short for a full window: keep the text instead of losing it.
            return [" ".join(sentences)]

        last_start = len(sentences) - window_size
        return [
            " ".join(sentences[start:start + window_size])
            for start in range(last_start + 1)
        ]

    @staticmethod
    def parent_child_chunking(text, max_parent_size, max_child_size):
        """
        Splits text into parent chunks, then each parent into child chunks.

        Both levels use ``recursive_structure_aware_chunking``.

        Args:
            text: The text to split.
            max_parent_size: Character limit for the parent chunks.
            max_child_size: Character limit for the child chunks.

        Returns:
            A flat list of child chunk strings in document order. The parent
            chunks themselves, and which parent each child came from, are
            not returned.
        """
        split = ChunkHandler.recursive_structure_aware_chunking
        return [
            child
            for parent in split(text, max_parent_size)
            for child in split(parent, max_child_size)
        ]

    @staticmethod
    def check_chunk_quality(structured_chunks, min_word_count=30, max_token_count=384):
        """
        Filters out chunks that do not meet the quality criteria.

        Args:
            structured_chunks: Iterable of dicts, each carrying the keys
                ``'chunk_word_count'`` and ``'chunk_token_count'``.
            min_word_count: Minimum word count (inclusive) for a chunk to be kept.
            max_token_count: Maximum token count (inclusive) for a chunk to be kept.

        Returns:
            A new list with the chunks that satisfy both bounds, in input order.
        """
        return [
            chunk
            for chunk in structured_chunks
            if chunk['chunk_word_count'] >= min_word_count
            and chunk['chunk_token_count'] <= max_token_count
        ]

# # Example usage
# text = "Your long text document here..."
# max_chunk_size = 4999

# # Using the advanced chunking methods directly from the class without instantiation
# structured_chunks = ChunkHandler.recursive_structure_aware_chunking(text, max_chunk_size)
# windowed_chunks = ChunkHandler.sentence_window_parsing(text, window_size=3)
# final_chunks = ChunkHandler.parent_child_chunking(text, max_parent_size=8000, max_child_size=4999)

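# # Apply quality check function to filter chunks.
# # check_chunk_quality expects dicts carrying 'chunk_word_count' and
# # 'chunk_token_count', not plain strings, so wrap each chunk with its
# # statistics first (the token count below is only a rough chars / 4 estimate).
# chunk_records = [
#     {"sentence_chunk": c, "chunk_word_count": len(c.split()), "chunk_token_count": len(c) / 4}
#     for c in final_chunks
# ]
# filtered_chunks = ChunkHandler.check_chunk_quality(chunk_records)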