In [60]:
!pip install transformers pdfplumber scikit-learn



In [74]:
import pdfplumber
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import numpy as np

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# Step 2: Chunk the text
def chunk_text(text, chunk_size=1000):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Step 3: Create TF-IDF Vectorizer
def create_vectorizer(chunks):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(chunks)
    return vectorizer, tfidf_matrix

# Step 4: Initialize the language model
generator = pipeline('text-generation', model='gpt2')

# Step 5: Retrieve relevant chunks
def retrieve(query, vectorizer, tfidf_matrix, top_n=3):
    query_vector = vectorizer.transform([query])
    cosine_similarities = (query_vector * tfidf_matrix.T).toarray()
    most_similar_indices = np.argsort(cosine_similarities[0])[-top_n:][::-1]
    return most_similar_indices, cosine_similarities

# Step 6: Generate concise and relevant response
def generate_response(query, pdf_path):
    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text)
    vectorizer, tfidf_matrix = create_vectorizer(chunks)

    indices, cosine_similarities = retrieve(query, vectorizer, tfidf_matrix)
    contexts = [chunks[i] for i in indices]

    # Combine contexts for better relevance
    combined_context = " ".join(contexts)

    # Check if the combined context is relevant
    if len(combined_context) < 50 or all(cosine_similarities[0][i] < 0.1 for i in indices):  # Arbitrary threshold for context length and relevance
        return "I do not know."

    # Generate a concise answer based on the relevant context
    response = generator(
        f"Please provide a concise answer to the question: '{query}' based on the following context: {combined_context}",
        max_length=50,  # Set this to a value greater than the input length
        max_new_tokens=20,  # Specify how many new tokens to generate
        truncation=True
    )
    generated_text = response[0]['generated_text'].strip()

    # Clean up the generated text to make it more readable
    if "Answer the question:" in generated_text:
        generated_text = generated_text.split("Answer the question:")[1].strip()

    return generated_text if generated_text else "I do not know."

# Example usage
if __name__ == "__main__":
    pdf_path = "MLMaterial.pdf"  # Ensure to provide a valid PDF file path
    user_query = input("Please enter your query: ")  # Allow user to input their query
    response = generate_response(user_query, pdf_path)

    # Display only the query and the response
    print(response)  # Only print the response without any additional context or information

Please enter your query: what is overfitting?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=20) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Please provide a concise answer to the question: 'what is overfitting?' based on the following context: of overfitting increase as much we provide training to our
▪ model.
▪ It means the more we train our model, the more chances of occurring the overfitted
▪ model.
▪ Overfitting is the main problem that occurs in supervised learning.
▪ Avoid the Overfitting in Model
▪ Cross-Validation
▪ Training with more data
▪ Removing features
▪ Early stopping the training
▪ Regularization
▪ Ensembling
• Goodness of Fit
▪ The "Goodness of fit" term is taken from the statistics, and the goal of the machine learning
models to achieve the goodness of fit.
▪ In statistics modelling, it defines how closely the result or predicted values match the true
values of the dataset.
▪ The model with a good fit is between the underfitted and overfitted model, and ideally, it
makes predictions with 0 errors, but in practice, it is difficult to achieve it.
▪ As when we train our model for a time, the errors in the t