In [2]:
# importing libraries 

import os

import pandas as pd 
import PyPDF2
import transformers
#from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoConfig
#from Chatting_with_docs_via_LLMs.utils import *



In [3]:
#Reading a PDF: It defines a function read_pdf_with_pypdf2 that opens a PDF file and extracts its text using PyPDF2. This text is concatenated into a single string, returned at the end of the function.

#Processing the text: After reading the PDF ("The-Field-Guide-to-Data-Science.pdf"), it splits the extracted text into smaller chunks using RecursiveCharacterTextSplitter, which breaks the text based on a specified chunk size and overlap.

#Document wrapping: Each chunk is then converted into a Document object, preparing it for further processing.

#Embedding generation: Using LangChain's HuggingFaceEmbeddings wrapper, it loads a pre-trained model (sentence-transformers/all-MiniLM-L6-v2) to generate normalized embeddings for the chunks of text.

#Database creation: Constructs a Chroma database from the documents with their embeddings. This database is used to store and manage the text data in a structured format.

#Retriever initialization: Initializes a retriever on the created database to facilitate similarity search, allowing the retrieval of the top k (5) most similar documents based on the embeddings.


def read_pdf_with_pypdf2(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        The text of all pages in document order, each page followed by a
        newline. A page for which no text could be extracted contributes
        only its newline.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # `or ''` guards against a page whose extraction yields None, which
        # would otherwise make the string concatenation raise TypeError.
        # Extraction happens inside the `with` block because the reader
        # reads from the still-open file handle.
        page_texts = [(page.extract_text() or '') + '\n' for page in pdf_reader.pages]

    # One join instead of repeated `+=` on an ever-growing string.
    return ''.join(page_texts)

# Source PDF, given as a relative path (resolved against the current working directory).
pdf_path = "The-Field-Guide-to-Data-Science.pdf"
text = read_pdf_with_pypdf2(pdf_path)

# length_function=len means chunk_size and chunk_overlap are measured in
# characters: the splitter targets 1024-character chunks with 64 characters
# shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
    length_function=len)

chunks = text_splitter.split_text(text=text)
# Wrap every string chunk in a Document; no metadata is attached.
new_chunks = [Document(page_content=chunk) for chunk in chunks]


# NOTE: `model_name` is the *embedding* model here; later cells reassign the
# same global name to the LLM checkpoints.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# NOTE(review): the device is hard-coded to 'cuda'; presumably this cell needs a GPU — confirm.
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)



# Build the vector store from the chunk Documents and the embedding function.
# No persist_directory is passed here.
db = Chroma.from_documents(
    documents=new_chunks, 
    embedding=embeddings
)


# Similarity-search retriever returning the 5 best matches per query.
# chain_and_QA_process reads this module-level `retriever` directly.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 5}
)

In [4]:
# Configuration loading: Loads the configuration for the specified model_name using AutoConfig.from_pretrained and hands it to the model loader.

# Tokenizer setup: Initializes the tokenizer with AutoTokenizer.from_pretrained, setting it to trust remote code. The pad token is then set to be the same as the end-of-sentence (EOS) token, ensuring consistent token padding during text generation.

# Model instantiation: Loads a pre-trained causal language model (for generating text) using AutoModelForCausalLM.from_pretrained and moves it to the CUDA device for GPU-accelerated computation.

# Pipeline creation: Establishes a text-generation pipeline with the loaded model and tokenizer, setting various parameters like temperature, repetition_penalty, and max_new_tokens to control the generation process. do_sample is set to True for probabilistic token sampling, and the pipeline device is taken from the model so that inputs are placed on the same device as the weights.

# Wrapper object creation: Wraps the configured pipeline into a HuggingFacePipeline object, which provides a convenient interface for interacting with the model.

# Return the model wrapper: Returns the llm_model, which is the HuggingFacePipeline object ready for generating text based on the input model and configuration.

def create_model(model_name,hf_token=None,token_size = 6144):
    """Load a causal LM on the GPU and wrap it as a LangChain LLM.

    Args:
        model_name: Hugging Face Hub id (or local path) of the checkpoint.
        hf_token: Optional Hub access token. When given, it is forwarded to
            every ``from_pretrained`` call; when omitted, the calls are made
            without a ``token`` argument.
        token_size: Value passed to the pipeline as ``max_new_tokens``.

    Returns:
        A ``HuggingFacePipeline`` wrapping the text-generation pipeline.
    """
    # Only forward the token when one was supplied, so the default call is
    # made exactly as before.
    auth_kwargs = {"token": hf_token} if hf_token else {}

    model_config = AutoConfig.from_pretrained(model_name, **auth_kwargs)

    # SECURITY: trust_remote_code=True allows the checkpoint repository to run
    # its own Python code while loading the tokenizer; use trusted repos only.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, **auth_kwargs)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(model_name, config=model_config, **auth_kwargs)
    model = model.to('cuda')

    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=0.1,
        repetition_penalty=1.1,
        return_full_text=True,
        max_new_tokens=token_size,
        do_sample=True,
        # Use the device the model actually lives on. The previous device=-1
        # (CPU) contradicted the model.to('cuda') call above.
        device=model.device,
    )

    llm_model = HuggingFacePipeline(pipeline=text_generation_pipeline)

    return llm_model

In [5]:
# Template setup: Defines a template for the QA task instructing the assistant to use provided context to answer questions concisely within five sentences.

# Prompt initialization: Creates a ChatPromptTemplate object from the defined template, which formats incoming questions and context into a consistent structure for processing.

# Retrieval-augmented generation (RAG) chain: Sets up a processing chain starting with context retrieval (retriever), followed by question parsing (RunnablePassthrough()), text generation using the llm_model, and finally parsing the output into a structured format (StrOutputParser()).

# Question definition: Lists specific questions related to Data Science concepts as outlined in a guide, which will be used to test the QA process.

# Answer generation: Iterates through the defined questions, invoking the RAG chain for each question to generate answers, which are then stored in an answers list.

# Response compilation: Creates a dictionary llm_response containing both the questions and their corresponding answers.

# Return the result: Returns llm_response, which includes the questions posed and the answers generated by the language model, demonstrating the QA capability of the system.

def chain_and_QA_process(llm_model, questions=None):
    """Run a set of questions through a retrieval-augmented QA chain.

    The chain retrieves context with the module-level ``retriever``, fills
    the prompt template, calls ``llm_model`` and parses the output to a
    string.

    Args:
        llm_model: LangChain-compatible model placed after the prompt in the
            chain.
        questions: Optional iterable of question strings. When omitted, the
            five built-in questions about the Data Science field guide are
            asked.

    Returns:
        A dict with two equally long lists: ``"Questions"`` (the questions
        asked) and ``"Answers"`` (one generated answer per question).
    """
    template = """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. 
    Use five sentences maximum and keep the answer concise.
    Question: {question} 
    Context: {context} 
    Answer:
    """
    prompt = ChatPromptTemplate.from_template(template)

    def _format_docs(docs):
        # Put only the chunk text into the prompt; interpolating the list of
        # Document objects directly would insert their repr instead.
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain = (
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm_model
        | StrOutputParser()
    )

    if questions is None:
        questions = [
            "What are the fundamental differences between deductive and inductive reasoning in the context of Data Science, as outlined in the guide?",
            "How does the guide describe the transformation of data into actionable insights through the creation of data products?",
            "According to the guide, what role does the Data Lake play in the preparation and analysis of data for Data Science endeavors?",
            "Can you explain the concept of 'Data Science Maturity' within an organization as presented in the guide, and how does it impact the organization's analytical capabilities?",
            "The guide mentions a 'Data Science Venn Diagram' that includes domain expertise, computer science, and mathematics. How does this diagram illustrate the interdisciplinary nature of Data Science, and why are these areas critical?",
        ]
    else:
        # Materialise once so the returned questions line up with the answers
        # even if a one-shot iterator was passed in.
        questions = list(questions)

    answers = [rag_chain.invoke(question) for question in questions]

    llm_response = {"Questions": questions,
                    "Answers": answers}

    return llm_response

# Now we are going to use 4 different models for RAG.

## *Model Llama-2-13b-chat*

In [8]:
from huggingface_hub import login

# Read the access token from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. When the variable is unset, None is
# passed and login() falls back to asking for the token interactively.
huggingface_token = os.environ.get("HF_TOKEN")

login(huggingface_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /opt/app-root/src/PV/cache/token
Login successful


In [9]:
# Hub id of the Llama-2 13B chat checkpoint.
model_name = "meta-llama/Llama-2-13b-chat-hf"

# Build the LangChain-wrapped generation pipeline with create_model's defaults.
llama_2_13b = create_model(model_name)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Ask the built-in questions with the Llama-2 model.
llm_response = chain_and_QA_process(llama_2_13b)

# Print every question next to its generated answer.
for question, answer in zip(llm_response["Questions"], llm_response["Answers"]):
    print(f"Question: {question}\nAnswer: {answer}\n")

  next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)


Question: What are the fundamental differences between deductive and inductive reasoning in the context of Data Science, as outlined in the guide?
Answer:  The fundamental differences between deductive and inductive reasoning in the context of Data Science are:
     * Deductive reasoning involves reasoning from known premises to a certain conclusion, while inductive reasoning involves drawing uncertain inferences based on probabilistic reasoning.
     * Data Science combines both deductive and inductive reasoning to create an environment where models of reality are constantly tested, updated, and improved until better models are found.
     * Data Science emphasizes the use of inductive reasoning to discover new relationships and insights from the data, while deductive reasoning is used to formulate hypotheses and carry out experiments to test those hypotheses.

Question: How does the guide describe the transformation of data into actionable insights through the creation of data produc

In [13]:
# Tabulate the answers, tag them with the model name, and put that tag first.
df_results = (
    pd.DataFrame(llm_response)
    .assign(**{"llm-model": "Llama-2-13b-chat-hf"})
    [["llm-model", "Questions", "Answers"]]
)
df_results

Unnamed: 0,llm-model,Questions,Answers
0,Llama-2-13b-chat-hf,What are the fundamental differences between d...,The fundamental differences between deductive...
1,Llama-2-13b-chat-hf,How does the guide describe the transformation...,"Based on the provided context, the guide desc..."
2,Llama-2-13b-chat-hf,"According to the guide, what role does the Dat...",The Data Lake plays a crucial role in the pre...
3,Llama-2-13b-chat-hf,Can you explain the concept of 'Data Science M...,"The concept of ""Data Science Maturity"" refers..."
4,Llama-2-13b-chat-hf,The guide mentions a 'Data Science Venn Diagra...,The Data Science Venn Diagram illustrates the...


## *Model Mistral-7B-Instruct-v0.2*

In [11]:
# Hub id of the Mistral 7B instruct checkpoint.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Build the LangChain-wrapped generation pipeline with create_model's defaults.
mistral_llm = create_model(model_name)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [7]:
# Ask the built-in questions with the Mistral model.
llm_response = chain_and_QA_process(mistral_llm)

# Print every question next to its generated answer.
for question, answer in zip(llm_response["Questions"], llm_response["Answers"]):
    print(f"Question: {question}\nAnswer: {answer}\n")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Question: What are the fundamental differences between deductive and inductive reasoning in the context of Data Science, as outlined in the guide?
Answer: 1. Deductive reasoning is commonly associated with formal logic, involving reasoning from known premises to a certain conclusion, while inductive reasoning is commonly known as informal logic or everyday argument, involving drawing uncertain inferences based on probabilistic reasoning.
    2. In Data Science, both deductive and inductive reasoning play important roles. Deductive reasoning is used to formulate hypotheses about relationships and underlying models, while inductive reasoning is used for exploratory data analysis to discover or refine hypotheses and discover new relationships, insights, and analytic paths from the data.
    3. Data Science creates an environment where models of reality are constantly tested, updated, and improved until better models are found, unlike traditional analytic approaches where models are static

In [14]:
# Tabulate the answers and tag them with the model that produced them.
# (The former leading `pd.DataFrame(llm_response)` was a discarded expression:
# it was not the last line of the cell, so it was never displayed.)
df_results = pd.DataFrame(llm_response)
df_results["llm-model"] = "Mistral-7B-Instruct-v0.2"
# Put the model tag first for readability.
df_results = df_results[["llm-model","Questions","Answers"]]
df_results

Unnamed: 0,llm-model,Questions,Answers
0,Mistral-7B-Instruct-v0.2,What are the fundamental differences between d...,1. Deductive reasoning is commonly associated ...
1,Mistral-7B-Instruct-v0.2,How does the guide describe the transformation...,1. Data Science is described as the art of tur...
2,Mistral-7B-Instruct-v0.2,"According to the guide, what role does the Dat...",1. The Data Lake is a tool used by Data Scient...
3,Mistral-7B-Instruct-v0.2,Can you explain the concept of 'Data Science M...,1. Data Science Maturity refers to the progres...
4,Mistral-7B-Instruct-v0.2,The guide mentions a 'Data Science Venn Diagra...,1. The Data Science Venn Diagram represents th...


## *Model LargeWorldModel-LWM-Text-Chat-256K*

In [6]:
# Hub id of the LargeWorldModel 256K-context text chat checkpoint.
model_name = "LargeWorldModel/LWM-Text-Chat-256K"

# Build the LangChain-wrapped generation pipeline with create_model's defaults.
lwm_llm = create_model(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Ask the built-in questions with the LWM model.
llm_response = chain_and_QA_process(lwm_llm)

# Print every question next to its generated answer.
for question, answer in zip(llm_response["Questions"], llm_response["Answers"]):
    print(f"Question: {question}\nAnswer: {answer}\n")

  next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)


Question: What are the fundamental differences between deductive and inductive reasoning in the context of Data Science, as outlined in the guide?
Answer: 1. The fundamental difference between deductive and inductive reasoning in the context of Data Science is that deductive reasoning involves reasoning from known premises to a certain conclusion, while inductive reasoning involves drawing uncertain inferences based on probabilistic reasoning.
    2. Data Science combines both deductive and inductive reasoning to create a more comprehensive understanding of data and develop better models of reality.
    3. The role of deductive reasoning in Data Science is to formulate hypotheses about relationships and underlying models, carry out experiments with the data to test hypotheses and models, and refine existing models based on new insights gained from the data.
    4. The role of inductive reasoning in Data Science is to explore the data, discover new relationships and insights, and create

In [8]:
# Tabulate the answers and tag them with the model that produced them.
# (The former leading `pd.DataFrame(llm_response)` was a discarded expression:
# it was not the last line of the cell, so it was never displayed.)
df_results = pd.DataFrame(llm_response)
df_results["llm-model"] = "LargeWorldModel-LWM-Text-Chat-256K"
# Put the model tag first for readability.
df_results = df_results[["llm-model","Questions","Answers"]]
df_results

Unnamed: 0,llm-model,Questions,Answers
0,LargeWorldModel-LWM-Text-Chat-256K,What are the fundamental differences between d...,1. The fundamental difference between deductiv...
1,LargeWorldModel-LWM-Text-Chat-256K,How does the guide describe the transformation...,1. The guide describes Data Science as the art...
2,LargeWorldModel-LWM-Text-Chat-256K,"According to the guide, what role does the Dat...",The Data Lake serves as a central repository ...
3,LargeWorldModel-LWM-Text-Chat-256K,Can you explain the concept of 'Data Science M...,1. The concept of 'Data Science Maturity' refe...
4,LargeWorldModel-LWM-Text-Chat-256K,The guide mentions a 'Data Science Venn Diagra...,1. The Data Science Venn Diagram illustrates t...


## *Model AzureOpenai-GPT-35-turbo-16k*

In [19]:
# AzureChatOpenAI is configured through OPENAI_* environment variables.
# The API type is not a secret and must be 'azure' for this client.
os.environ['OPENAI_API_TYPE'] = 'azure'
# Default API version; a version already exported in the environment wins.
os.environ.setdefault('OPENAI_API_VERSION', '2023-03-15-preview')

# The key and endpoint must come from the environment instead of being written
# into the notebook (which would also overwrite real values with placeholders).
missing_vars = [
    var_name
    for var_name in ('OPENAI_API_KEY', 'OPENAI_API_BASE')
    if not os.environ.get(var_name)
]
if missing_vars:
    raise EnvironmentError(
        "Set these environment variables before creating the Azure client: "
        + ", ".join(missing_vars)
    )

llm = AzureChatOpenAI(
      # The deployment name can be supplied via AZURE_OPENAI_DEPLOYMENT; the
      # original placeholder is kept as the fallback value.
      deployment_name=os.environ.get("AZURE_OPENAI_DEPLOYMENT", "deployment_name"),
      model_name="gpt-35-turbo-16k")



In [17]:
# Ask the built-in questions with the Azure OpenAI chat model.
llm_response = chain_and_QA_process(llm)

# Print every question next to its generated answer.
for question, answer in zip(llm_response["Questions"], llm_response["Answers"]):
    print(f"Question: {question}\nAnswer: {answer}\n")

Question: What are the fundamental differences between deductive and inductive reasoning in the context of Data Science, as outlined in the guide?
Answer: The fundamental differences between deductive and inductive reasoning in the context of Data Science, as outlined in the guide, are that deductive reasoning involves reasoning from known premises to a certain conclusion, while inductive reasoning involves drawing uncertain inferences based on probabilistic reasoning. Deductive reasoning is commonly associated with "formal logic" and produces certain and inevitable conclusions, while inductive reasoning is commonly known as "informal logic" and produces probable and reasonable conclusions. Data Science encourages shifting between deductive and inductive reasoning, allowing for the formulation and testing of hypotheses as well as exploratory data analysis to discover new insights and relationships. Data Science also supports the creation of constantly tested, updated, and improved mode

In [18]:
# Tabulate the answers and tag them with the model that produced them.
# (The former leading `pd.DataFrame(llm_response)` was a discarded expression:
# it was not the last line of the cell, so it was never displayed.)
df_results = pd.DataFrame(llm_response)
df_results["llm-model"] = "AzureOpenai-GPT-35-turbo-16k"
# Put the model tag first for readability.
df_results = df_results[["llm-model","Questions","Answers"]]
df_results

Unnamed: 0,llm-model,Questions,Answers
0,AzureOpenai-GPT-35-turbo-16k,What are the fundamental differences between d...,The fundamental differences between deductive ...
1,AzureOpenai-GPT-35-turbo-16k,How does the guide describe the transformation...,The guide describes the transformation of data...
2,AzureOpenai-GPT-35-turbo-16k,"According to the guide, what role does the Dat...",The Data Lake plays a role in the preparation ...
3,AzureOpenai-GPT-35-turbo-16k,Can you explain the concept of 'Data Science M...,The concept of 'Data Science Maturity' within ...
4,AzureOpenai-GPT-35-turbo-16k,The guide mentions a 'Data Science Venn Diagra...,The Data Science Venn Diagram illustrates the ...
