In [16]:
%pip install PyPDF2
%pip install nltk
%pip install sklearn
%pip install openai
%pip install scikit-learn
%pip install streamlit python-dotenv PyPDF2 langchain-openai langchain-community 
%pip install transformers torch
%pip install nltk rouge-score

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[15 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  [31m   [0m - if the 'sklearn' package is used by one of your dependencies,
 

: 

In [None]:
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_openai import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFaceHub
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')

# Load environment variables
load_dotenv()

# Path to the PDF file
pdf_path = "path/Sample.pdf"
# Load API key from environment variables
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

In [11]:


def get_pdf_text(pdf_path):
    """Extract text from a PDF file."""
    text = ""
    pdf_reader = PdfReader(pdf_path)
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

def get_text_chunks(text):
    """Split text into manageable chunks."""
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=3000,  # Increase chunk size
        chunk_overlap=1000,  # Increase overlap
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

def get_vectorstore(text_chunks):
    """Create a vector store from text chunks using embeddings."""
    embeddings = OpenAIEmbeddings()
    # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore

def get_conversation_chain(vectorstore):
    """Set up the conversational chain with memory."""
    llm = ChatOpenAI(temperature=0)
    # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})

    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory
    )
    return conversation_chain

def get_embedding(text, tokenizer, model):
    """Get BERT embedding for a given text."""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

def evaluate_model(conversation_chain, questions, true_answers):
    """Evaluate the model's performance using semantic similarity."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    predicted_answers = []
    for question in questions:
        response = conversation_chain({'question': question})
        print(f"Question: {question}")
        print(f"Response: {response}\n")
        if 'answer' in response:
            predicted_answers.append(response['answer'])
        elif 'text' in response:
            predicted_answers.append(response['text'])
        else:
            predicted_answers.append("")

    # Calculate cosine similarity between true and predicted answers
    similarities = []
    for true, pred in zip(true_answers, predicted_answers):
        true_embedding = get_embedding(true, tokenizer, model)
        pred_embedding = get_embedding(pred, tokenizer, model)
        similarity = cosine_similarity(true_embedding, pred_embedding)[0][0]
        similarities.append(similarity)

    # Print average similarity
    avg_similarity = sum(similarities) / len(similarities)
    print("Evaluation Metrics:")
    print(f"Average Cosine Similarity: {avg_similarity:.2f}")

# Extract text from the PDF
raw_text = get_pdf_text(pdf_path)

# Split text into chunks
text_chunks = get_text_chunks(raw_text)

# Create vector store from text chunks
vectorstore = get_vectorstore(text_chunks)

# Create conversation chain
conversation_chain = get_conversation_chain(vectorstore)

# Ask a question
question = "What is the purpose of this document?"
response = conversation_chain({'question': question})
print(f"Question: {question}")
print(f"Response: {response}\n")

# Example evaluation (replace with actual questions and answers)
sample_questions = ["What is the purpose of this document?"]
true_answers = ["The purpose of this document is to present BloombergGPT, a large language model for finance, and validate its performance on various benchmarks."]  # Replace with actual expected answer
evaluate_model(conversation_chain, sample_questions, true_answers)


Question: What is the purpose of this document?
Response: {'question': 'What is the purpose of this document?', 'chat_history': [HumanMessage(content='What is the purpose of this document?'), AIMessage(content='The purpose of the document is to provide information about the training corpus used for language models, specifically focusing on financial documents like company filings, press releases, and Bloomberg news. It also discusses the evaluation process for language models and the importance of domain-specific evaluations.')], 'answer': 'The purpose of the document is to provide information about the training corpus used for language models, specifically focusing on financial documents like company filings, press releases, and Bloomberg news. It also discusses the evaluation process for language models and the importance of domain-specific evaluations.'}

Question: What is the purpose of this document?
Response: {'question': 'What is the purpose of this document?', 'chat_history': [

In [12]:

def get_pdf_text(pdf_path):
    """Extract text from a PDF file."""
    text = ""
    pdf_reader = PdfReader(pdf_path)
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

def get_text_chunks(text):
    """Split text into manageable chunks."""
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=3000,  # Increase chunk size
        chunk_overlap=1000,  # Increase overlap
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

def get_vectorstore(text_chunks):
    """Create a vector store from text chunks using embeddings."""
    embeddings = OpenAIEmbeddings()
    # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore

def get_conversation_chain(vectorstore):
    """Set up the conversational chain with memory."""
    llm = ChatOpenAI(temperature=0)
    # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})

    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory
    )
    return conversation_chain

def get_embedding(text, tokenizer, model):
    """Get BERT embedding for a given text."""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

def evaluate_model(conversation_chain, questions, true_answers):
    """Evaluate the model's performance using semantic similarity, BLEU, ROUGE, and METEOR."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    predicted_answers = []
    for question in questions:
        response = conversation_chain({'question': question})
        print(f"Question: {question}")
        print(f"Response: {response}\n")
        if 'answer' in response:
            predicted_answers.append(response['answer'])
        elif 'text' in response:
            predicted_answers.append(response['text'])
        else:
            predicted_answers.append("")

    # Calculate cosine similarity between true and predicted answers
    similarities = []
    for true, pred in zip(true_answers, predicted_answers):
        true_embedding = get_embedding(true, tokenizer, model)
        pred_embedding = get_embedding(pred, tokenizer, model)
        similarity = cosine_similarity(true_embedding, pred_embedding)[0][0]
        similarities.append(similarity)

    # Calculate average cosine similarity
    avg_similarity = sum(similarities) / len(similarities)

    # Initialize metrics
    bleu_scores = []
    meteor_scores = []
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    for true, pred in zip(true_answers, predicted_answers):
        # Tokenize true and predicted answers
        true_tokens = word_tokenize(true)
        pred_tokens = word_tokenize(pred)
        
        # BLEU score
        bleu_scores.append(sentence_bleu([true_tokens], pred_tokens))
        
        # METEOR score
        meteor_scores.append(meteor_score([true_tokens], pred_tokens))
        
        # ROUGE score
        rouge = rouge_scorer_obj.score(true, pred)
        for key in rouge_scores:
            rouge_scores[key].append(rouge[key].fmeasure)

    # Calculate average scores
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    avg_rouge = {key: sum(values) / len(values) for key, values in rouge_scores.items()}

    # Print evaluation metrics
    print("Evaluation Metrics:")
    print(f"Average Cosine Similarity: {avg_similarity:.2f}")
    print(f"Average METEOR Score: {avg_meteor:.2f}")
    print(f"Average ROUGE Scores: {avg_rouge}")


# Extract text from the PDF
raw_text = get_pdf_text(pdf_path)

# Split text into chunks
text_chunks = get_text_chunks(raw_text)

# Create vector store from text chunks
vectorstore = get_vectorstore(text_chunks)

# Create conversation chain
conversation_chain = get_conversation_chain(vectorstore)

# Ask a question
question = "What is the purpose of this document?"
response = conversation_chain({'question': question})
print(f"Question: {question}")
print(f"Response: {response}\n")

# Example evaluation (replace with actual questions and answers)
sample_questions = ["What is the purpose of this document?"]
true_answers = ["The purpose of this document is to present BloombergGPT, a large language model for finance, and validate its performance on various benchmarks."]  # Replace with actual expected answer
evaluate_model(conversation_chain, sample_questions, true_answers)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/reeyadav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Question: What is the purpose of this document?
Response: {'question': 'What is the purpose of this document?', 'chat_history': [HumanMessage(content='What is the purpose of this document?'), AIMessage(content='The purpose of the document is to provide information about the training corpus used for language models, specifically focusing on financial documents like company filings, press releases, and Bloomberg news. It also discusses the evaluation process for language models and the importance of domain-specific evaluations.')], 'answer': 'The purpose of the document is to provide information about the training corpus used for language models, specifically focusing on financial documents like company filings, press releases, and Bloomberg news. It also discusses the evaluation process for language models and the importance of domain-specific evaluations.'}

Question: What is the purpose of this document?
Response: {'question': 'What is the purpose of this document?', 'chat_history': [

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
