In [1]:
import os
from dotenv import find_dotenv, load_dotenv # type: ignore

# Load variables from the .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv())

# Indexing os.environ raises KeyError when the Groq key has not been configured.
groq_api_key = os.environ["GROQ_API_KEY"]

In [2]:
import os

# Later cells open "data/..." relative to the working directory, so the working
# directory must be the one that contains "data/". Only step up to the parent
# when "data/" is not already visible: an unconditional chdir("..") would climb
# one level higher on every re-run of this cell.
if not os.path.isdir("data"):
    os.chdir("..")

# Absolute path of the "data" directory under the current working directory.
data_dir = os.path.join(os.getcwd(), "data")
# print(data_dir)

In [3]:
from langchain_community.document_loaders import TextLoader # type: ignore

# Load the article into `text` (the result of loader.load()) and print it.
#
# The loader may surface file errors wrapped in another exception (chained via
# `raise ... from`), in which case `except UnicodeDecodeError` /
# `except FileNotFoundError` clauses would never match. The handler therefore
# looks at the chained cause first and falls back to the exception itself, so
# the specific messages are shown whether or not the error was wrapped.
try:
    loader = TextLoader("data/Sachin_wikipedia.txt", encoding='utf-8')
    text = loader.load()
    print(text)
except Exception as e:
    root_cause = e.__cause__ or e
    if isinstance(root_cause, UnicodeDecodeError):
        print(f"Error reading the file: {root_cause}")
    elif isinstance(root_cause, FileNotFoundError):
        print(f"File not found: {root_cause}")
    else:
        print(f"An unexpected error occurred: {e}")
    # Re-raise so execution stops here: every later cell reads `text`, and
    # continuing would only resurface this failure as an unrelated NameError.
    raise

# text

[Document(metadata={'source': 'data/Sachin_wikipedia.txt'}, page_content='Sachin Ramesh Tendulkar (/ˌsʌtʃɪn tɛnˈduːlkər/ ⓘ; pronounced [sətɕin teːɳɖulkəɾ]; born 24 April 1973) is an Indian former international cricketer who captained the Indian national team. He is widely regarded as one of the greatest batsmen in the history of cricket.[5] Hailed as the world\'s most prolific batsman of all time, he is the all-time highest run-scorer in both ODI and Test cricket with more than 18,000 runs and 15,000 runs, respectively.[6] He also holds the record for receiving the most player of the match awards in international cricket.[7] Tendulkar was a Member of Parliament, Rajya Sabha by presidential nomination from 2012 to 2018.[8][9]\n\nTendulkar took up cricket at the age of eleven, made his Test match debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for over 24 years.[10] In 2002, halfway throug

In [4]:
# Display the page_content (raw text) of the first loaded document.
text[0].page_content

'Sachin Ramesh Tendulkar (/ˌsʌtʃɪn tɛnˈduːlkər/ ⓘ; pronounced [sətɕin teːɳɖulkəɾ]; born 24 April 1973) is an Indian former international cricketer who captained the Indian national team. He is widely regarded as one of the greatest batsmen in the history of cricket.[5] Hailed as the world\'s most prolific batsman of all time, he is the all-time highest run-scorer in both ODI and Test cricket with more than 18,000 runs and 15,000 runs, respectively.[6] He also holds the record for receiving the most player of the match awards in international cricket.[7] Tendulkar was a Member of Parliament, Rajya Sabha by presidential nomination from 2012 to 2018.[8][9]\n\nTendulkar took up cricket at the age of eleven, made his Test match debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for over 24 years.[10] In 2002, halfway through his career, Wisden ranked him the second-greatest Test batsman of all t

In [None]:
from langchain_text_splitters import CharacterTextSplitter # type: ignore

# Splitter configuration:
#   - split on the literal newline character (not interpreted as a regex)
#   - sizes are measured with len(); target chunk size 1000, overlap 200
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separator="\n",
    is_separator_regex=False,
    length_function=len,
)

In [6]:
# Split the first loaded document's text with `text_splitter` and display the
# resulting list of chunk Documents.
texts = text_splitter.create_documents([text[0].page_content])
texts

Created a chunk of size 1190, which is longer than the specified 1000
Created a chunk of size 1365, which is longer than the specified 1000


[Document(page_content="Sachin Ramesh Tendulkar (/ˌsʌtʃɪn tɛnˈduːlkər/ ⓘ; pronounced [sətɕin teːɳɖulkəɾ]; born 24 April 1973) is an Indian former international cricketer who captained the Indian national team. He is widely regarded as one of the greatest batsmen in the history of cricket.[5] Hailed as the world's most prolific batsman of all time, he is the all-time highest run-scorer in both ODI and Test cricket with more than 18,000 runs and 15,000 runs, respectively.[6] He also holds the record for receiving the most player of the match awards in international cricket.[7] Tendulkar was a Member of Parliament, Rajya Sabha by presidential nomination from 2012 to 2018.[8][9]"),
 Document(page_content='Tendulkar took up cricket at the age of eleven, made his Test match debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for over 24 years.[10] In 2002, halfway through his career, Wisden ranked

In [7]:
# Number of chunks produced by `text_splitter`.
len(texts)

22

In [8]:
# Inspect the second chunk (index 1).
texts[1]

Document(page_content='Tendulkar took up cricket at the age of eleven, made his Test match debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for over 24 years.[10] In 2002, halfway through his career, Wisden ranked him the second-greatest Test batsman of all time, behind Don Bradman, and the second-greatest ODI batsman of all time, behind Viv Richards.[11] The same year, Tendulkar was a part of the team that was one of the joint-winners of the 2002 ICC Champions Trophy. Later in his career, Tendulkar was part of the Indian team that won the 2011 Cricket World Cup, his first win in six World Cup appearances for India.[12] He had previously been named "Player of the Tournament" at the 2003 World Cup.')

In [None]:
# Recursive CharacterTextSplitter

from langchain_text_splitters import RecursiveCharacterTextSplitter # type: ignore

# Same target chunk size as the splitter above (1000), but with an overlap of 20.
recursive_text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=1000)

# Split the first loaded document's text and display the resulting chunks.
recursive_text = recursive_text_splitter.create_documents([text[0].page_content])
recursive_text

[Document(page_content="Sachin Ramesh Tendulkar (/ˌsʌtʃɪn tɛnˈduːlkər/ ⓘ; pronounced [sətɕin teːɳɖulkəɾ]; born 24 April 1973) is an Indian former international cricketer who captained the Indian national team. He is widely regarded as one of the greatest batsmen in the history of cricket.[5] Hailed as the world's most prolific batsman of all time, he is the all-time highest run-scorer in both ODI and Test cricket with more than 18,000 runs and 15,000 runs, respectively.[6] He also holds the record for receiving the most player of the match awards in international cricket.[7] Tendulkar was a Member of Parliament, Rajya Sabha by presidential nomination from 2012 to 2018.[8][9]"),
 Document(page_content='Tendulkar took up cricket at the age of eleven, made his Test match debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for over 24 years.[10] In 2002, halfway through his career, Wisden ranked

In [10]:
# Number of chunks produced by `recursive_text_splitter`.
len(recursive_text)

23

In [20]:
from dotenv import find_dotenv, load_dotenv # type: ignore
from openai import OpenAI # type: ignore

# Load variables from the .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv())

# Indexing os.environ raises KeyError when the OpenAI key has not been configured.
# `os` is imported in an earlier cell of this notebook.
openai_api_key = os.environ["OPENAI_API_KEY"]


In [21]:
# Create the OpenAI client using the API key read from the environment.
model = OpenAI(api_key=openai_api_key)

### Embeddings

In [22]:
from langchain_openai import OpenAIEmbeddings

# Embedding client built with default settings; no API key is passed explicitly.
# NOTE(review): presumably it picks up OPENAI_API_KEY from the environment — confirm.
embeddings_model = OpenAIEmbeddings()

In [23]:
# Three short sample sentences used as input for the embedding call.
chunks_text = [
    "Hi there, how are you?",
    "I am doing well, thank you for asking.",
    "What's your name?",
]

In [24]:
# Embed every sample sentence and keep the vectors under their own name.
# Assigning the result back to `embeddings_model` would replace the embedding
# client with the returned vectors, so a second run of this cell would fail
# because the result has no `.embed_documents` method.
chunk_embeddings = embeddings_model.embed_documents(chunks_text)

# Show (number of vectors, length of the first vector) instead of dumping the
# raw floats.
len(chunk_embeddings), len(chunk_embeddings[0])

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Display whatever `embeddings_model` is currently bound to.
embeddings_model