In [None]:
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
from ctransformers import  AutoModelForCausalLM

In [None]:
# Build a reader over the ./docs folder, then load its contents.
docs_reader = SimpleDirectoryReader("./docs")
documents = docs_reader.load_data()
# Bare expression: the notebook displays the loaded value.
documents

In [None]:
import json
# Read the prompt file and keep the entry stored under the "Default" key as
# the system prompt. The encoding is pinned to UTF-8 so the file is decoded
# the same way regardless of the platform's default locale encoding.
with open('../prompts.json', 'r', encoding='utf-8') as f:
    loader = json.load(f)
system_prompt = loader['Default']

# Disabled alternative wrapper using [INST] / <<SYS>> markers.
# NOTE(review): {query_str} and {system_prompt} look swapped relative to the
# usual Llama-2 chat layout -- confirm before re-enabling.
#query_wrapper_prompt = SimpleInputPrompt("[INST] <<SYS>>{query_str}<</SYS>> \n {system_prompt}[/INST]")

# Active wrapper: the user query is placed between <|USER|> and <|ASSISTANT|>.
query_wrapper_prompt=SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

In [None]:
import os
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
load_dotenv()

# Hugging Face token read from HF_TOKEN; None when the variable is not set.
HuggingFace_Api = os.getenv('HF_TOKEN')

In [None]:
import torch
# Print the name of the current CUDA device.
print(torch.cuda.get_device_name())

# Wrap the checkpoint stored in ../meta (model and tokenizer) as a
# llama-index LLM placed on the GPU.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    # Greedy decoding. Sampling with temperature=0.0 is rejected by
    # transformers (temperature must be strictly positive); do_sample=False
    # gives the deterministic output that a zero temperature was aiming for.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="../meta",
    model_name="../meta",
    device_map="cuda",
    # float16 dtype plus 8-bit loading, used on CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
)

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext
from llama_index.embeddings.langchain import LangchainEmbedding

# Instantiate the sentence-transformers embedder through LangChain, then wrap
# it in the llama-index adapter.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
embed_model = LangchainEmbedding(hf_embeddings)

In [None]:
# Combine the LLM, the embedding model and a chunk size of 1024 into one
# ServiceContext built from the library defaults.
service_context_options = {
    "chunk_size": 1024,
    "llm": llm,
    "embed_model": embed_model,
}
service_context = ServiceContext.from_defaults(**service_context_options)

In [None]:
# Bare expression: the notebook displays the service context object.
service_context

In [None]:
# Build a vector store index over the loaded documents, using the service
# context for the LLM, embedding model and chunk size.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)
# Bare expression: the notebook displays the index object.
index

In [None]:
# Obtain a query engine from the index and run one question through it.
query_engine = index.as_query_engine()
question = "what is this PDF tells about?"
response = query_engine.query(question)
# Second name for the same response object; the text-to-speech cell reads it.
out = response
print(response)

In [None]:
# Hard-coded summary text describing the PDF.
gg = (
    "This PDF provides an introduction to Python programming language, "
    "covering basic concepts, syntax, performance, and application areas. "
    "It also provides examples to illustrate the concepts and encourage "
    "students to learn more."
)

In [None]:
from elevenlabs.client import ElevenLabs
# ElevenLabs client; the API key is read from ELEVENLABS_TOKEN in the
# environment (None when the variable is not set).
elevenlabs_api_key = os.getenv('ELEVENLABS_TOKEN')
client = ElevenLabs(api_key=elevenlabs_api_key)

In [None]:
from elevenlabs import generate, play
# Turn the query answer into speech and play it.
# `out` is the query engine's response object, not a string, so it is
# converted with str() -- the same text that print(response) shows -- before
# being passed as the text to synthesise.
# NOTE(review): the `client` created from ELEVENLABS_TOKEN is not used by this
# module-level generate() call; confirm how the API key reaches it.
audio = generate(
    text = str(out),
    voice = 'Rachel',
    model = 'eleven_multilingual_v2'
)

play(audio)

In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Point a PDF loader at the tutorial file, then load and split it.
pdf_path = './docs/python_tutorial.pdf'
loader = PyPDFLoader(pdf_path)
# Bare expression: the notebook displays the resulting documents.
loader.load_and_split()

[Document(page_content="Python Tutorial  \n        \n    i \n \nAbout the Tutorial  \nToday, Python is one of the most popular programming languages. Although it is a general -\npurpose language, it is used in various areas of applications such as Machine Learning, \nArtificial Intelligence, web development, IoT, and more. This Python tutorial  is designed \nto be a self -learning guide for beginners, students looking for a career in software \ndevelopment and Data science. This tutorial shall also be useful for experienced software \nprofessionals to enhance their skills.  \nThis Python tutorial is based o n the latest Python 3.11.2 version.   \nWhat is Python?  \nPython  is a very popular general -purpose interpreted, interactive, object -oriented, and \nhigh-level programming language. Python is dynamically -typed and garbage -collected \nprogramming language. It was cre ated by Guido van Rossum during 1985 - 1990. Like \nPerl, Python source code is also available under the GNU Gene

In [1]:
# Load model directly
from transformers import AutoTokenizer, BitsAndBytesConfig, LlamaForCausalLM
import torch

# Directory holding both the tokenizer files and the model checkpoint.
model_dir = "../meta"

# 4-bit loading: "nf4" quant type, double quantization, float16 compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the tokenizer and the quantized causal LM from the same directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = LlamaForCausalLM.from_pretrained(
    model_dir,
    quantization_config=quantization_config,
)

  from .autonotebook import tqdm as notebook_tqdm
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:57<00:00, 28.73s/it]
