In [18]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import google.generativeai as genai
import json

In [19]:
# Read the credentials mapping (JSON) and the raw corpus text; both files are
# looked up relative to the current working directory.
with open("key.json") as key_file:
    api_keys = json.load(key_file)
with open("docs.txt") as docs_file:
    documents = docs_file.read()

In [20]:
# Extract the Google API key from the mapping loaded from key.json
# (raises KeyError if the entry is missing).
GOOGLE_API_KEY = api_keys["GOOGLE_API_KEY"]

In [21]:
# Pass the key explicitly. With no api_key argument, configure() falls back to
# an environment variable, so the key read from key.json would never be used.
genai.configure(api_key=GOOGLE_API_KEY)
# Model handle shared by every generation call below.
llm = genai.GenerativeModel("gemini-pro")

In [22]:
def indexing(documents,chunk_size=100,chunk_overlap=10,max_features=110):
    """Split a text into overlapping word chunks and fit a TF-IDF index on them.

    The text is split on whitespace. Chunk ``j`` starts at word
    ``j * chunk_size`` and holds up to ``chunk_size`` words; every chunk after
    the first is additionally prefixed with the ``chunk_overlap`` words that
    precede its start, so neighbouring chunks share context.

    Args:
        documents: The full corpus as a single string.
        chunk_size: Number of new words contributed by each chunk (> 0).
        chunk_overlap: Number of preceding words repeated at the start of
            each chunk after the first (>= 0).
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns:
        A tuple ``(vectorizer, tfidf_matrix, documents_chunk)``: the fitted
        vectorizer, the result of ``fit_transform`` on the chunks, and the
        list of chunk strings in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, ``chunk_overlap`` is
            negative, or ``documents`` contains no words.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap < 0:
        raise ValueError("chunk_overlap must be non-negative")

    words = documents.split()
    if not words:
        raise ValueError("documents contains no words to index")

    documents_chunk = []
    # Iterate only over real window starts. Running one window past the end
    # and then deleting an empty string fails whenever that extra window is
    # not empty (e.g. when len(words) is an exact multiple of chunk_size).
    for start in range(0, len(words), chunk_size):
        # Clamp so a large overlap never becomes a negative (from-the-end) index.
        window_start = max(0, start - chunk_overlap)
        documents_chunk.append(' '.join(words[window_start:start + chunk_size]))

    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(documents_chunk)
    return vectorizer,tfidf_matrix,documents_chunk

In [23]:
# Chunk the corpus (chunk_size=100 words, chunk_overlap=10 words) and fit the TF-IDF index.
vectorizer, tfidf_matrix, documents_chunk = indexing(
    documents,
    chunk_size=100,
    chunk_overlap=10,
)

In [24]:
def retrieval(vectorizer,tfidf_matrix,input_text,documents_chunk,k):
    """Return the ``k`` chunks most similar to ``input_text``.

    The query is vectorized with ``vectorizer.transform`` and compared with
    every row of ``tfidf_matrix`` by cosine similarity. Each selected chunk is
    also printed with its index and score, best match first.

    Args:
        vectorizer: Object exposing ``transform`` for a list of strings.
        tfidf_matrix: Matrix of chunk vectors, one row per chunk.
        input_text: The query string.
        documents_chunk: Chunk strings, indexed like the rows of ``tfidf_matrix``.
        k: Maximum number of chunks to return.

    Returns:
        The selected chunk strings, ordered by descending similarity.
    """
    query_vector = vectorizer.transform([input_text])
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Ascending argsort reversed gives best-first order; keep the first k.
    ranked = scores.argsort()[::-1]
    top_indices = ranked[:k]

    for chunk_idx in top_indices:
        print(f"Document {chunk_idx}:")
        print(f"Text: {documents_chunk[chunk_idx]}")
        print(f"Cosine similarity score: {scores[chunk_idx]:.4f}\n")

    return [documents_chunk[chunk_idx] for chunk_idx in top_indices]

In [25]:
def generation(prompt,llm):
    """Send ``prompt`` to ``llm`` and return the text of its reply.

    Args:
        prompt: The prompt passed to ``llm.generate_content``.
        llm: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        The ``text`` attribute of the generated response.
    """
    return llm.generate_content(prompt).text

In [26]:
def ragChain(input_text, vectorizer,tfidf_matrix,documents_chunk,llm, k):
    """Answer a question using the retrieved document chunks as context.

    Retrieves the ``k`` chunks most similar to ``input_text``, embeds them in
    an instruction prompt, and returns the model's reply.

    Args:
        input_text: The user's question.
        vectorizer: Passed through to ``retrieval``.
        tfidf_matrix: Passed through to ``retrieval``.
        documents_chunk: Chunk strings, passed through to ``retrieval``.
        llm: Model handle passed to ``generation``.
        k: Number of chunks to retrieve.

    Returns:
        The value returned by ``generation`` for the assembled prompt.
    """
    relevant_chunks = retrieval(vectorizer,tfidf_matrix,input_text,documents_chunk,k)

    # Join the chunks as plain text. Interpolating the list itself would put
    # its Python repr (brackets, quote delimiters, escapes) into the prompt.
    context = "\n\n".join(relevant_chunks)

    prompt = f"""
    You are a helpful Document Assistant that can answer questions based on the provided document.

    Answer the following question: `{input_text}`
    By searching the following document content: `{context}`

    Only use factual information from the document content to answer the question.

    If you feel like you don`t have enough information to answer the question, say directly "I don`t know".
    Your answer should be detailed but easy to understand.
    """

    return generation(prompt,llm)

In [27]:
# Run the RAG chain on a sample question, retrieving the top 2 chunks as context.
input_text = "What is Machine Learning"
print(f"Input: {input_text}")
response = ragChain(
    input_text,
    vectorizer,
    tfidf_matrix,
    documents_chunk,
    llm,
    k=2,
)
print(f"Output: {response}")

Input: What is Machine Learning
Document 0:
Text: Machine learning algorithms are computational models that allow computers to understand patterns and forecast or make judgments based on data without explicit programming. These algorithms form the foundation of modern artificial intelligence and are used in various applications, including image and speech recognition, natural language processing, recommendation systems, fraud detection, autonomous cars, etc. This Machine learning Algorithms article will cover all the essential algorithms of machine learning like Support vector machine, decision-making, logistics regression, naive bayees classifier, random forest, k-mean clustering, reinforcement learning, vector, hierarchical clustering, xgboost, adaboost, logistics, etc. Types of Machine Learning Algorithms There are four types of
Cosine similarity score: 0.5041

Document 1:
Text: Types of Machine Learning Algorithms There are four types of machine learning algorithms 1. Supervised Le

In [28]:
# Same pipeline with a second question, again using the top 2 chunks.
input_text = "Types of Machine Learning"
print(f"Input: {input_text}")
response = ragChain(
    input_text,
    vectorizer,
    tfidf_matrix,
    documents_chunk,
    llm,
    k=2,
)
print(f"Output: {response}")

Input: Types of Machine Learning
Document 0:
Text: Machine learning algorithms are computational models that allow computers to understand patterns and forecast or make judgments based on data without explicit programming. These algorithms form the foundation of modern artificial intelligence and are used in various applications, including image and speech recognition, natural language processing, recommendation systems, fraud detection, autonomous cars, etc. This Machine learning Algorithms article will cover all the essential algorithms of machine learning like Support vector machine, decision-making, logistics regression, naive bayees classifier, random forest, k-mean clustering, reinforcement learning, vector, hierarchical clustering, xgboost, adaboost, logistics, etc. Types of Machine Learning Algorithms There are four types of
Cosine similarity score: 0.6230

Document 1:
Text: Types of Machine Learning Algorithms There are four types of machine learning algorithms 1. Supervised L

In [29]:
# Third sample question, again using the top 2 chunks as context.
input_text = "What is Supervised Learning"
print(f"Input: {input_text}")
response = ragChain(
    input_text,
    vectorizer,
    tfidf_matrix,
    documents_chunk,
    llm,
    k=2,
)
print(f"Output: {response}")

Input: What is Supervised Learning
Document 2:
Text: Linear Discriminant Analysis (LDA) Independent Component Analysis (ICA) UMAP (Uniform Manifold Approximation and Projection) C. Association Apriori Algorithm Eclat Algorithm 3. Reinforcement Learning A. Model-Free Methods Q-Learning Deep Q-Network (DQN) SARSA (State-Action-Reward-State-Action) Policy Gradient Methods (e.g., REINFORCE) B. Model-Based Methods Deep Deterministic Policy Gradient (DDPG) Proximal Policy Optimization (PPO) Trust Region Policy Optimization (TRPO) C. Value-Based Methods Monte Carlo Methods Temporal Difference (TD) Learning 4. Ensemble Learning Bagging (e.g., Random Forest) Boosting (e.g., AdaBoost, Gradient Boosting) Stacking 1. Supervised Learning Supervised learning involves training a model on labeled data, where the desired output is known. The model learns to map inputs to outputs based on the provided examples. A. Classification
Cosine similarity score: 0.2477

Document 17:
Text: ones. A. Model-Free Met