In [2]:
# TODO: remove the output stars (*)
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from dotenv import load_dotenv
import os

print("successful imported packages")


# Set up the environment for the API key.
# The key is kept out of the notebook in a local dotenv file.
ENV_FILE = "google_api_key.env"

# load_dotenv() returns False when it did not set any variable (for example
# because the file is missing or empty). Report that here, otherwise the
# problem only shows up much later as an authentication error from the model.
# NOTE(review): GOOGLE_API_KEY is presumably the variable defined in the file;
# confirm against the actual .env contents.
if not load_dotenv(ENV_FILE) and not os.environ.get("GOOGLE_API_KEY"):
    print(
        f"WARNING: no variables were loaded from {ENV_FILE!r} and GOOGLE_API_KEY "
        "is not set; the Gemini model call will likely fail."
    )



# 1. Load Documents (PDF, DOCX, TXT)

# Source PDF. This is a raw string with single backslashes: the previous literal
# combined the r-prefix with "\\" escapes, which put doubled separators into the
# actual path.
# NOTE(review): machine-specific absolute path; change it before running the
# notebook on another machine.
PDF_PATH = r"D:\1. study material college\8th sem\ML and AI\ML & AI Notes.pdf"

documents = []
documents.extend(PyPDFLoader(PDF_PATH).load())
# Other formats can be appended to the same list, e.g.:
#   documents.extend(Docx2txtLoader("data/sample.docx").load())
#   documents.extend(TextLoader("data/sample.txt").load())

# PyPDFLoader returns one Document per PDF page, so this count is a page count.
print(f"Loaded {len(documents)} documents")

print("seccessfully completed load document in document variable/list")



# 2. Split Documents into Chunks

# Splitter settings: each chunk holds at most CHUNK_SIZE characters and
# neighbouring chunks share CHUNK_OVERLAP characters.
CHUNK_SIZE = 550
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
chunks = text_splitter.split_documents(documents)

print(f"Created {len(chunks)} text chunks")
print("\nchunking step is over\n")



# Embedding model: a Hugging Face model is used to embed the chunks; the
# resulting vectors are what the vector store below holds.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

print("\nHuggingFace Embeddings loaded successfully\n")






# Vector store: reuse the FAISS index saved on disk when there is one,
# otherwise build it from the chunks and save it for the next run.
VECTOR_DIR = "rag_vector_index"

# The saved index is NOT refreshed automatically. Set this to True for one run
# after changing the source documents, the chunking settings or the embedding
# model, otherwise the retriever keeps answering from the old index.
FORCE_REBUILD = False

# save_local() writes "index.faiss" and "index.pkl" (default index name).
# Checking for both files, rather than only for the directory, avoids trying to
# load an empty or half-written directory.
index_files_present = all(
    os.path.isfile(os.path.join(VECTOR_DIR, file_name))
    for file_name in ("index.faiss", "index.pkl")
)

if index_files_present and not FORCE_REBUILD:
    print("Loading existing vector store...")
    # SECURITY: load_local() unpickles index.pkl, which can execute arbitrary
    # code. Only keep allow_dangerous_deserialization=True for an index that
    # this notebook created itself.
    vectorstore = FAISS.load_local(
        VECTOR_DIR,
        embeddings,
        allow_dangerous_deserialization=True
    )
else:
    print("Creating new vector store...")
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(VECTOR_DIR)
    print("Vector store created and saved")


print("\n Vectors Stored Successfully\n")

# Prompt sent to the model.
# {context} receives the retrieved chunks and {question} the user's problem
# statement.
#
# The output rules only ask for things plain text can express. The earlier
# "bold text format and size=20" rule contradicted the "no special formatting"
# rule; that styling is applied by file_save() when the text is exported to
# .docx, so the prompt only has to keep each heading on its own line.
#
# NOTE(review): the DOCUMENT STRUCTURE section is written specifically for PCA
# even though the problem statement is free text - confirm this is intended.
document_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""


STRICT OUTPUT RULES:
- Do NOT use stars (*), markdown, or special formatting.
- Use plain text headings.
- Write each section heading on its own line, exactly as named in DOCUMENT STRUCTURE, without the trailing colon.
- Under each heading, use bullet points starting with '•'.
- Keep language formal, clear, and professional.

Use ONLY the information provided below.

====================
REFERENCE CONTEXT
====================
{context}

====================
PROBLEM STATEMENT
====================
{question}

====================
DOCUMENT STRUCTURE
====================

Problem Overview:
- Explain the problem clearly
- Describe why dimensionality reduction is required

Background:
- Define Principal Component Analysis (PCA)
- Explain its purpose in data analysis

Requirements:
- List technical and functional requirements as bullet points

Proposed Solution:
- Explain how PCA solves the problem
- Describe the approach conceptually

Technical Architecture:
- Explain PCA computation steps
- Mention eigenvectors, eigenvalues, and variance

Implementation Steps:
- List step-by-step PCA implementation workflow

Risks and Mitigation:
- List possible risks
- Explain mitigation strategies

Benefits:
- List advantages of PCA

Conclusion:
- Summarize the solution clearly

Generate a clean, well-structured document following the rules exactly.
"""
)


# 10. Initialize Gemini LLM
# Prerequisite: choose the Gemini model to use and create an API key for it.
GEMINI_MODEL = "gemini-3-flash-preview"
GEMINI_TEMPERATURE = 0.3

llm = ChatGoogleGenerativeAI(temperature=GEMINI_TEMPERATURE, model=GEMINI_MODEL)

print("\n Gemini model imported \n")
print("*" * 40)


# 11. Create RAG Chain

# Number of chunks fetched from the vector store for every question.
TOP_K = 2
retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

# chain_type "stuff" concatenates all retrieved chunks into ONE prompt, and the
# custom document_prompt is injected in place of the chain's default prompt.
qa_chain_options = {"prompt": document_prompt}

rag_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    chain_type_kwargs=qa_chain_options,
    llm=llm,
    retriever=retriever,
)


# 12. Ask Question

# Surrounding whitespace carries no meaning, and an empty statement would only
# produce a model call with nothing to answer, so stop early instead.
problem_statement = input("enter your problem statement : ").strip()
if not problem_statement:
    raise ValueError("The problem statement is empty; enter a description to generate a document.")


# 13. Generate Answer

# Chain.run() is deprecated in LangChain; invoke() is its replacement.
# RetrievalQA takes the question under the "query" key and returns the
# generated text under the "result" key.
final_document = rag_chain.invoke({"query": problem_statement})["result"]

print("\n========== GENERATED DOCUMENT ==========\n")
print(final_document)

# Saving to disk is handled by file_save() in the next cell.

successful imported packages
Loaded 81 documents
seccessfully completed load document in document variable/list
Created 395 text chunks

chunking step is over



Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



HuggingFace Embeddings loaded successfully

Loading existing vector store...

 Vectors Stored Successfully


 Gemini model imported 

****************************************


enter your problem statement generate the PCA report


  warn_deprecated(
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised DeadlineExceeded: 504 Deadline expired before operation could complete..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised DeadlineExceeded: 504 Deadline expired before operation could complete..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 8.0 seconds as it raised DeadlineExceeded: 504 Deadline expired before operation could complete..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 16.0 seconds as it raised DeadlineExceeded: 504 Deadline expired before operation could complete..




Problem Overview
• High-dimensional datasets often contain redundant or noisy information that can obscure meaningful patterns.
• Dimensionality reduction is required to simplify the data into a lower-dimensional representation while retaining essential characteristics.
• This process is necessary for effectively visualizing complex data and improving computational efficiency.

Background
• Principal Component Analysis (PCA) is a statistical procedure used to reduce the dimensionality of a dataset.
• Its primary purpose in data analysis is to visualize high-dimensional data and remove redundant or noisy components.

Requirements
• The original features of the dataset must be standardized to ensure they are mean-centered and scaled.
• The system must be capable of computing a covariance matrix from the standardized features.
• The process requires the ability to perform eigenvalue decomposition on the covariance matrix.
• Functional selection criteria must be applied to retain eigenve

In [None]:
#document saving 
from docx import Document
from docx.shared import Pt

def file_save(text=None, output_path=r"C:\Users\91957\Downloads\PCA.docx"):
    """Write the generated document to a Word (.docx) file.

    Every non-empty line becomes its own paragraph and every empty line becomes
    an empty paragraph. Lines that are section headings are written bold at
    20 pt; all other lines are written at 11 pt.

    Heading detection ignores case and a trailing colon. The previous exact,
    upper-case-only comparison never matched headings such as
    "Problem Overview", so no heading was ever styled.

    Args:
        text: Document text to export. When None, the notebook-level variable
            ``final_document`` is used, which keeps a bare ``file_save()``
            call working as before.
        output_path: Destination .docx path. The default is the original
            machine-specific location; pass another path to save elsewhere.
    """
    if text is None:
        # Produced by the RAG cell; raises NameError if that cell has not run.
        text = final_document

    # Known section headings, stored upper-case for case-insensitive matching.
    section_titles = {
        "PROBLEM OVERVIEW",
        "BACKGROUND",
        "REQUIREMENTS",
        "PROPOSED SOLUTION",
        "TECHNICAL ARCHITECTURE",
        "IMPLEMENTATION STEPS",
        "RISKS AND MITIGATION",
        "BENEFITS",
        "CONCLUSION",
    }

    doc = Document()

    for raw_line in text.split("\n"):
        line = raw_line.strip()

        if not line:
            # Keep blank lines so the spacing between sections survives.
            doc.add_paragraph("")
            continue

        run = doc.add_paragraph().add_run(line)

        # Normalize only for the comparison; the text written to the document
        # stays exactly as generated.
        heading_key = line.rstrip(":").strip().upper()
        if heading_key in section_titles:
            run.bold = True
            run.font.size = Pt(20)
        else:
            run.font.size = Pt(11)

    doc.save(output_path)
    print("successfully save in you file manager, pls check it")

# Export the generated document to a .docx file.
file_save() 