In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import TextSplitter, CharacterTextSplitter


## Data Ingestion

In [None]:
class data_ingestion:
  """Load a document from disk and split it into chunks.

  Args:
    path: Path of the file to load. The file extension (case-insensitive)
      selects the loader: ``pdf``, ``txt`` or ``docx``.
    split_tech: Splitting technique, ``'recursive'`` or ``'char'``
      (case-insensitive).
    chunk_size: Forwarded as ``chunk_size`` to the selected text splitter.
    chunk_overlap: Forwarded as ``chunk_overlap`` to the selected text splitter.
  """

  def __init__(self, path:str, split_tech, chunk_size, chunk_overlap):
    self.path = path
    self.split_tech = split_tech
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap

  def data_load(self):
    """Load ``self.path`` with the loader that matches its extension.

    Returns:
      Whatever the selected loader's ``load()`` returns.

    Raises:
      ValueError: If the extension is not ``pdf``, ``txt`` or ``docx``.
    """
    import os  # local import keeps this cell independent of other cells

    # Use the real extension (text after the LAST dot of the file name).
    # Splitting on the first dot breaks for paths such as "./data/x.pdf"
    # or "report.v2.pdf".
    extension = os.path.splitext(self.path)[1].lower().lstrip('.')

    if extension == 'pdf':
      print('pdf')
      loader = PyPDFLoader(self.path)
    elif extension == 'txt':
      print('txt')
      loader = TextLoader(self.path)
    elif extension == 'docx':
      print('docs')
      loader = Docx2txtLoader(self.path)
    else:
      raise ValueError(
          f"Unsupported file type {extension!r} for path {self.path!r}; "
          "expected one of: pdf, txt, docx")

    return loader.load()

  def splitter(self, texts):
    """Split already-loaded documents with the configured technique.

    Args:
      texts: Documents to split, passed to ``split_documents``.

    Returns:
      The result of the splitter's ``split_documents(texts)``.

    Raises:
      ValueError: If ``split_tech`` is neither ``'recursive'`` nor ``'char'``.
    """
    technique = self.split_tech.lower()

    if technique == 'recursive':
      print('rec')
      splitter_cls = RecursiveCharacterTextSplitter
    elif technique == 'char':
      print('char')
      splitter_cls = CharacterTextSplitter
    else:
      raise ValueError(
          f"Unsupported split technique {self.split_tech!r}; "
          "expected 'recursive' or 'char'")

    text_splitter = splitter_cls(chunk_size = self.chunk_size, chunk_overlap = self.chunk_overlap)
    return text_splitter.split_documents(texts)

In [None]:
# Ingestion settings for this run.
path = "/content/CTGAN.pdf"
split_tech = 'recursive'  # data_ingestion.splitter handles 'recursive' and 'char'
# split_tech = 'char'
chunk_size = 1000
chunk_overlap = 100

obj = data_ingestion(path, split_tech , chunk_size, chunk_overlap)

# Load the file, then split the loaded documents into chunks.
text = obj.data_load()
doc = obj.splitter(text)

pdf
rec


In [None]:
# Number of chunks returned by the splitter.
len(doc)

52

## Embedding and Vector Store

In [None]:
from langchain_community.embeddings import HuggingFaceHubEmbeddings
# from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma, FAISS


class embading_db:
  """Create an embedding model and a vector store built from documents.

  Args:
    embedding: Embedding backend name, ``'hugging'`` or ``'openai'``
      (case-insensitive).
    vector_db: Vector store name, ``'chroma'`` or ``'faiss'``
      (case-insensitive).
    emb_model: Model identifier passed to ``HuggingFaceHubEmbeddings`` when
      the ``'hugging'`` backend is selected. It is not used by the
      ``'openai'`` branch.
  """

  def __init__(self, embedding, vector_db, emb_model):
    self.embedding = embedding
    self.vector_db = vector_db
    self.embd_model = emb_model

  def create_embedding(self):
    """Instantiate the configured embedding model.

    Returns:
      A ``HuggingFaceHubEmbeddings`` or ``OpenAIEmbeddings`` instance.

    Raises:
      ValueError: If ``embedding`` is neither ``'hugging'`` nor ``'openai'``.
    """
    backend = str(self.embedding).lower()

    if backend == 'hugging':
      print('hugging')
      return HuggingFaceHubEmbeddings(model= self.embd_model)

    if backend == 'openai':
      print('openai')
      return OpenAIEmbeddings()

    # Previously an unknown name fell through to `return embedding` and
    # failed with an unhelpful UnboundLocalError.
    raise ValueError(
        f"Unsupported embedding backend {self.embedding!r}; "
        "expected 'hugging' or 'openai'")

  def create_vectordb(self, document, embedding):
    """Build the configured vector store from ``document``.

    Args:
      document: Documents passed to the store's ``from_documents``.
      embedding: Embedding model passed to the store's ``from_documents``.

    Returns:
      The created ``Chroma`` or ``FAISS`` store. The Chroma store is created
      with ``persist_directory="/content/"``; the FAISS store is additionally
      saved locally under ``"faiss_index"``.

    Raises:
      ValueError: If ``vector_db`` is neither ``'chroma'`` nor ``'faiss'``.
    """
    store = str(self.vector_db).lower()

    if store == 'chroma':
      print('chroma')
      return Chroma.from_documents(document, embedding, persist_directory="/content/")

    if store == 'faiss':
      print('faiss')
      db = FAISS.from_documents(document, embedding)
      db.save_local("faiss_index")
      print('done')
      return db

    raise ValueError(
        f"Unsupported vector store {self.vector_db!r}; "
        "expected 'chroma' or 'faiss'")


In [None]:
# Embedding / vector-store settings for this run.
embedding = 'openai'
vector_db = 'chroma'
# Only read by the 'hugging' branch of embading_db.create_embedding.
emb_model = "sentence-transformers/all-MiniLM-L6-v2"

# Build the embedding model, then index the chunks in `doc` with it.
emb_obj = embading_db(embedding, vector_db, emb_model)
emb = emb_obj.create_embedding()
db = emb_obj.create_vectordb(doc, emb)

openai
chroma


In [None]:
# Smoke-test the embedding model on a short string. A dedicated name is used
# so the `text` documents loaded by the ingestion cell are not overwritten
# (re-running the splitter cell would otherwise receive a plain string).
test_query = "This is a test query."
query_result = emb.embed_query(test_query)
# Show the vector length and a short prefix instead of dumping every value.
len(query_result), query_result[:5]

In [None]:

# Query the vector store directly and print the text of the first hit.
query = "What is ctgan"
retireved_results=db.similarity_search(query)
print(retireved_results[0].page_content)


To address these challenges, in this paper, we propose conditional tabular GAN ( CTGAN )1, a method
which introduces several new techniques: augmenting the training procedure with mode-speciﬁc
normalization , architectural changes, and addressing data imbalance by employing a conditional
generator andtraining-by-sampling (described in section 4). When applied to the same datasets
with the benchmarking suite, CTGAN performs signiﬁcantly better than both the Bayesian network
baselines and the other GANs tested, as shown in Table 1.
The contributions of this paper are as follows:
(1) Conditional GANs for synthetic data generation . We propose CTGAN as a synthetic tabular
data generator to address several issues mentioned above. CTGAN outperforms all methods to date
and surpasses Bayesian networks on at least 87.5% of our datasets. To further challenge CTGAN , we
adapt a variational autoencoder (V AE) [ 15] for mixed-type tabular data generation. We call this TVAE .


## RAG Pipeline

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain


In [None]:

# Chat prompt with two placeholders: {context} and {input}.
# The RAG cells below pass the question under the "input" key.
prompt = ChatPromptTemplate.from_template("""
            Answer the following question based only on the provided context.
            Think step by step before providing a detailed answer.
            I will tip you $1000 if the user finds the answer helpful.
            <context>
            {context}
            </context>
            Question: {input}""")

In [None]:
class RAG:
  """Wire a vector store, a prompt and a chat model into a retrieval chain.

  Args:
    db: Vector store; ``db.as_retriever()`` is called once at construction.
    prompt: Prompt passed to ``create_stuff_documents_chain``.
    model_name: Chat model backend. Only ``'openai'`` is implemented;
      ``'llama'`` is reserved but not implemented.
    chain_type: ``'retriver'`` (original spelling) or ``'retriever'``.
    temprature: Passed as ``temperature`` to ``ChatOpenAI``.
  """

  def __init__(self, db, prompt, model_name, chain_type, temprature):
    self.retriever = db.as_retriever()
    self.prompt = prompt
    self.model_name = model_name
    self.chain_type = chain_type
    self.temprature = temprature

  def create_model(self):
    """Instantiate the configured chat model.

    Returns:
      A ``ChatOpenAI`` instance when ``model_name == 'openai'``.

    Raises:
      NotImplementedError: If ``model_name == 'llama'`` (placeholder branch).
      ValueError: For any other model name.
    """
    if self.model_name == 'openai':
      print('openai')
      return ChatOpenAI(temperature = self.temprature)

    if self.model_name == 'llama':
      # The original branch was `pass`, which then crashed on `return model`
      # with UnboundLocalError. Fail with an explicit message instead.
      raise NotImplementedError("The 'llama' model backend is not implemented yet")

    raise ValueError(
        f"Unsupported model name {self.model_name!r}; expected 'openai'")


  def doc_stuff_chain(self, model):
    """Build the retrieval chain for ``model``.

    Args:
      model: Chat model passed to ``create_stuff_documents_chain``.

    Returns:
      A ``(retrieval_chain, retriever)`` tuple.

    Raises:
      ValueError: If ``chain_type`` is not ``'retriver'`` / ``'retriever'``.
    """
    # Accept the correctly spelled name as well as the original spelling.
    if self.chain_type not in ('retriver', 'retriever'):
      raise ValueError(
          f"Unsupported chain type {self.chain_type!r}; expected 'retriver'")

    print('doc_stuff_chain')
    document_chain = create_stuff_documents_chain(model, self.prompt)
    retrieval_chain = create_retrieval_chain(self.retriever, document_chain)

    return retrieval_chain, self.retriever


In [None]:
# Build the chat model and the retrieval chain.
# `context` receives the retriever returned as the second tuple element.
rag_obj = RAG(db, prompt, model_name='openai', chain_type='retriver', temprature=0.6)
model = rag_obj.create_model()
chain, context = rag_obj.doc_stuff_chain(model)

openai
doc_stuff_chain


In [None]:
# print(query)
# context.invoke('provide me details report on 7 wonders')[0].

In [None]:
# Ask a question unrelated to the indexed PDF and show the chain's answer.
response = chain.invoke({"input":"provide me details report on 7 wonders"})
response['answer']

'Based on the provided context, the information pertains to datasets, deep generative models, and evaluation mechanisms. There is no mention of 7 wonders in the context provided. Would you like me to provide information on the 7 wonders of the world instead?'

In [None]:
# Second question unrelated to the indexed PDF; show the chain's answer.
response = chain.invoke({"input":"you are my health assistent, provide me a medicine for fiver"})
response['answer']

'Based on the provided context, there is no specific information or guidance for providing medical advice or prescribing medication. It is important to consult a healthcare professional or doctor for accurate diagnosis and treatment recommendations for a fever or any other health condition. Please seek medical help from a qualified healthcare provider for appropriate medical assistance.'

In [None]:
# Display the full object returned by the last `chain.invoke` call.
response

## Evaluation

In [None]:
!pip install -q datasets ragas

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m256.0/542.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [9

In [None]:
from datasets import Dataset
from ragas.metrics import faithfulness
from ragas import evaluate

# Hand-written two-row sample used to try the `faithfulness` metric.
# The i-th question, answer and contexts entry belong together; each
# `contexts` entry is a list of strings.
data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
}

dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[faithfulness])
score.to_pandas()

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Test-set generator backed by OpenAI models: one model generates,
# a second one acts as critic, plus an embedding model.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-3.5-turbo-0125")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Generate 10 test samples from the chunks in `doc`, with the requested mix
# of evolution types (50% simple, 25% reasoning, 25% multi_context).
testset = generator.generate_with_langchain_docs(doc, test_size=10, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

In [None]:
# Convert the generated test set to a pandas DataFrame.
eval_dataset = testset.to_pandas()

In [None]:
# Preview the first rows of the generated test set.
eval_dataset.head()

In [None]:
# Pull the generated questions and their reference answers out as plain lists.
test_questions = eval_dataset["question"].values.tolist()
test_groundtruths = eval_dataset["ground_truth"].values.tolist()

In [None]:
# Run one on-topic question through the chain.
response = chain.invoke({"input" : 'what is ctgan'})


In [None]:
# Display the generated answer of the last chain call.
response["answer"]

In [None]:
# Print the text of every retrieved chunk behind the last answer.
# The loop variable is deliberately not called `context`: that global name
# holds the retriever returned by `rag_obj.doc_stuff_chain(model)` and a
# `for context in ...` loop would silently overwrite it.
for retrieved_doc in response["context"]:
  print(retrieved_doc.page_content)

In [None]:
# Run every generated test question through the chain and collect, in the
# same order as `test_questions`, the answer and the retrieved chunk texts.
answers = []
contexts = []

for question in test_questions:
  response = chain.invoke({"input" : question})
  answers.append(response["answer"])
  # One list of chunk texts per question.
  contexts.append([context.page_content for context in response["context"]])

In [None]:
# Inspect the first generated test row.
eval_dataset.head(1)

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the significance of auto-encoding vari...,[Improved training of wasserstein gans. In Adv...,,simple,"[{'source': '/content/CTGAN.pdf', 'page': 9}]",True


In [None]:
from datasets import Dataset

# Assemble the evaluation dataset: one row per test question, holding the
# chain's answer, the retrieved chunk texts and the reference answer.
response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [None]:
from datasets import Dataset
import pandas as pd

# Build a pandas DataFrame from `response_dataset` (a Dataset, not a list)
# so the rows can be inspected as a table.
dff = pd.DataFrame(response_dataset)


In [None]:
# Preview questions, answers, retrieved contexts and reference answers.
dff.head()

Unnamed: 0,question,answer,contexts,ground_truth
0,What is the significance of auto-encoding vari...,Auto-encoding variational bayes (VAE) is a neu...,[get as good a result as Bayesian networks. Wi...,
1,What is the significance of preventing mode co...,Preventing mode collapse in the PacGAN framewo...,"[[21] Akash Srivastava, Lazar Valkov, Chris Ru...",The significance of preventing mode collapse i...
2,What are some examples of real datasets used i...,Some examples of real datasets used in the ben...,[7 Dataset Details\nThe statistical informatio...,The real datasets used in the benchmark includ...
3,What are the properties that make the task of ...,The properties that make the task of learning ...,[Mode-speciﬁc Normalization Generater Network ...,We observe that none of the existing deep gene...
4,What are some of the deep learning methods use...,Some of the deep learning methods used in the ...,[V AEs directly use data to build the generato...,The benchmarking system for synthetic data gen...


In [None]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from ragas import evaluate

In [None]:
# Score the collected responses with four ragas metrics and show the
# result as a DataFrame.
score = evaluate(response_dataset, metrics=[faithfulness, answer_relevancy, context_recall, context_precision])
score.to_pandas()

In [None]:
# Display the evaluation result object itself.
score

{'faithfulness': 0.9625, 'answer_relevancy': 0.9742, 'context_recall': 0.7957, 'context_precision': 0.7639}

In [None]:
# Summary statistics of the evaluation result table.
res = score.to_pandas()
res.describe()

In [None]:
from ragas.metrics import (
    answer_relevancy,
    answer_correctness,
    answer_similarity,
    # context_entities_recall,
    context_precision,
    context_recall,
    context_relevancy,
    faithfulness
  )

In [None]:
# Re-score the same responses with the extended set of seven metrics and
# show the result as a DataFrame.
score = evaluate(response_dataset, metrics=[faithfulness, context_relevancy, answer_relevancy, answer_correctness, answer_similarity, context_precision, context_recall])
score.to_pandas()

In [None]:
# Display the evaluation result object for the seven-metric run.
score

{'faithfulness': 0.9857, 'context_relevancy': 0.0425, 'answer_relevancy': 0.9720, 'answer_correctness': 0.5660, 'answer_similarity': 0.8930, 'context_precision': 0.7639, 'context_recall': 0.8457}

In [None]:
from ragas.metrics import (
    answer_relevancy,
    answer_correctness,
    answer_similarity,
    # context_entities_recall,
    context_precision,
    context_recall,
    context_relevancy,
    faithfulness
  )

from datasets import Dataset
from ragas import evaluate
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


In [None]:
class Evaluation_RAG:
  """Evaluate a retrieval chain with ragas metrics.

  Args:
    test_size: Number of synthetic test samples requested from the test-set
      generator in ``document_evatuation``.
    chain: Optional chain object exposing ``invoke({"input": ...})`` and
      returning a mapping with ``"answer"`` and ``"context"`` entries. When
      omitted, the notebook-level global ``chain`` is looked up at call time,
      which is what the class did before this argument existed.
  """

  def __init__(self, test_size, chain=None):
    self.test_size = test_size
    self.chain = chain

  def _resolve_chain(self):
    """Return the injected chain, or the notebook-level ``chain`` global."""
    if self.chain is not None:
      return self.chain
    return chain

  def single_evatuation(self, query):
    """Answer one query and score that single answer.

    Args:
      query: Question text sent to the chain.

    Returns:
      A ``(score, answer)`` tuple: the object returned by ``evaluate`` for
      the faithfulness, context_relevancy and answer_relevancy metrics, and
      the chain's answer.
    """
    response = self._resolve_chain().invoke({"input" : query})
    contexts = [context.page_content for context in response["context"]]

    # Score against ALL retrieved chunks. The answer was produced from every
    # chunk in response["context"], so judging it against only the first one
    # misreports the metrics (and indexing [0] crashed on empty retrieval).
    response_dataset = Dataset.from_dict({
          "question" : [query],
          "answer" : [response['answer']],
          "contexts" : [contexts]
      })

    score = evaluate(response_dataset, metrics=[faithfulness, context_relevancy, answer_relevancy])

    return score, response['answer']



  def document_evatuation(self, document):
    """Generate a synthetic test set from ``document`` and score the chain on it.

    Args:
      document: Documents passed to the test-set generator.

    Returns:
      The evaluation result converted with ``to_pandas()``.
    """
    rag_chain = self._resolve_chain()

    # Generator backed by OpenAI models: a generator model, a critic model
    # and an embedding model.
    generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
    critic_llm = ChatOpenAI(model="gpt-3.5-turbo-0125")
    embeddings = OpenAIEmbeddings()

    generator = TestsetGenerator.from_langchain(
        generator_llm,
        critic_llm,
        embeddings
    )

    # Generate `test_size` samples with the requested mix of evolution types.
    testset = generator.generate_with_langchain_docs(document, test_size=self.test_size, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})
    eval_dataset = testset.to_pandas()
    test_questions = eval_dataset["question"].values.tolist()
    test_groundtruths = eval_dataset["ground_truth"].values.tolist()

    # Collect the chain's answer and retrieved chunk texts for each question,
    # in the same order as `test_questions`.
    answers = []
    contexts = []

    for question in test_questions:
      response = rag_chain.invoke({"input" : question})
      answers.append(response["answer"])
      contexts.append([context.page_content for context in response["context"]])

    response_dataset = Dataset.from_dict({
          "question" : test_questions,
          "answer" : answers,
          "contexts" : contexts,
          "ground_truth" : test_groundtruths
      })

    score = evaluate(response_dataset, metrics=[faithfulness, context_relevancy, answer_relevancy, answer_correctness, answer_similarity, context_precision, context_recall])
    return score.to_pandas()


In [None]:
# Evaluate a single query. The constructor argument (5) is the test size
# used only by `document_evatuation`, which is left disabled here.
ev_obj = Evaluation_RAG(5)
# result = ev_obj.document_evatuation(doc)
query = 'what is ctgan'
score, answer = ev_obj.single_evatuation(query)

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Print the question, the generated answer and its metric scores together.
print(f"Question \n {query} \nAnswer \n {answer} \nScores\n  {score}")

Question 
 what is ctgan 
Answer 
 CTGAN stands for Conditional Tabular Generative Adversarial Network. It is a method proposed in a research paper for modeling the probability distribution of rows in tabular data and generating realistic synthetic data. CTGAN introduces several new techniques such as mode-specific normalization, architectural changes, and addressing data imbalance through a conditional generator and training-by-sampling. It outperforms Bayesian network baselines and other GANs tested in the study. 
Scores
  {'faithfulness': 1.0000, 'context_relevancy': 0.1875, 'answer_relevancy': 0.8541}
