In [8]:
import tensorflow as tf
# Set the TensorFlow v1-compat logger threshold to ERROR so the notebook output
# is not flooded with lower-severity log messages.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)


from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF into one string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        A single string in which each page's extracted text is followed by a
        newline character. A page that yields no text contributes only its
        trailing newline. A PDF with no pages yields an empty string.
    """
    reader = PdfReader(pdf_path)
    # A page without a text layer may yield None instead of a string; treating
    # that as "" avoids a TypeError on concatenation. Joining once also avoids
    # rebuilding the growing string on every page.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

# Read the source PDF and show only the beginning of the extracted text.
PDF_PATH = "data.pdf"
PREVIEW_CHARS = 500  # number of leading characters to print

pdf_text = extract_text_from_pdf(PDF_PATH)
print(pdf_text[:PREVIEW_CHARS])



What
 
degree
 
is
 
Tejas
 
Gaikwad
 
currently
 
pursuing?
 
Master
 
of
 
Science
 
in
 
Computer
 
Science
 
at
 
the
 
State
 
University
 
of
 
New
 
York
 
at
 
Buffalo.
 
Which
 
courses
 
has
 
Tejas
 
completed
 
during
 
his
 
master's
 
program?
 
Machine
 
Learning,
 
Analysis
 
of
 
Algorithms,
 
Data
 
Intensive
 
Computing,
 
and
 
Computer
 
Security.
 
Where
 
did
 
Tejas
 
complete
 
his
 
undergraduate
 
studies?
 
Vishwakarma
 
Institute
 
of
 
Technology
 
in
 
Pune,
 
Indi


In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(text, chunk_size=500, chunk_overlap=100):
    """Break ``text`` into chunks using ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 500).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``
            (default 100).

    Returns:
        Whatever the splitter's ``split_text`` method returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

# Chunk the extracted PDF text using split_text's default size and overlap,
# then report the chunk count and a short preview.
chunks = split_text(pdf_text)
chunk_preview = chunks[:2]  # first two chunks only, to keep the output short
print(f"Total Chunks: {len(chunks)}")
print(chunk_preview)


Total Chunks: 55
["What\n \ndegree\n \nis\n \nTejas\n \nGaikwad\n \ncurrently\n \npursuing?\n \nMaster\n \nof\n \nScience\n \nin\n \nComputer\n \nScience\n \nat\n \nthe\n \nState\n \nUniversity\n \nof\n \nNew\n \nYork\n \nat\n \nBuffalo.\n \nWhich\n \ncourses\n \nhas\n \nTejas\n \ncompleted\n \nduring\n \nhis\n \nmaster's\n \nprogram?\n \nMachine\n \nLearning,\n \nAnalysis\n \nof\n \nAlgorithms,\n \nData\n \nIntensive\n \nComputing,\n \nand\n \nComputer\n \nSecurity.\n \nWhere\n \ndid\n \nTejas\n \ncomplete\n \nhis\n \nundergraduate\n \nstudies?\n \nVishwakarma\n \nInstitute\n \nof\n \nTechnology\n \nin\n \nPune,", "his\n \nundergraduate\n \nstudies?\n \nVishwakarma\n \nInstitute\n \nof\n \nTechnology\n \nin\n \nPune,\n \nIndia.\n \nWhat\n \nwas\n \nTejas's\n \nmajor\n \nduring\n \nhis\n \nbachelor's\n \ndegree?\n \nElectronics\n \nand\n \nTelecommunication\n \nEngineering.\n \nWhich\n \nprogramming\n \nlanguages\n \nis\n \nTejas\n \nproficient\n \nin?\n \nPython,\n \nR,\n \nC++,\n \nG

In [11]:
!pip install -U langchain-huggingface
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"



Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2


In [12]:
from langchain_huggingface import HuggingFaceEmbeddings
import faiss
import numpy as np

# Load the BAAI/bge-small-en sentence-embedding model (this is a BGE model,
# not a DeepSeek one).
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

# Fail early with a clear message: an empty chunk list would otherwise surface
# as an opaque IndexError on ``embeddings.shape[1]`` below.
if not chunks:
    raise ValueError("No text chunks to embed; check the PDF extraction and splitting steps.")

# Embed all chunks in one batched call instead of one model call per chunk.
# FAISS operates on float32 vectors, so request that dtype explicitly rather
# than relying on NumPy's float64 default.
embeddings = np.asarray(embedding_model.embed_documents(chunks), dtype="float32")

# Build an exact (brute-force) L2 index sized to the embedding dimension.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Persist the index so it can be reloaded without re-embedding.
faiss.write_index(index, "faiss_index.bin")


In [13]:
def retrieve_similar_text(query, k=3):
    """Return the text chunks nearest to ``query`` in the FAISS index.

    Relies on the module-level ``embedding_model``, ``index`` and ``chunks``;
    ``chunks[i]`` must be the text whose vector was added at position ``i``.

    Args:
        query: Natural-language query string to embed and search with.
        k: Maximum number of neighbours to request from the index (default 3).

    Returns:
        A list of at most ``k`` chunk strings, in the order the index returned
        them. Fewer than ``k`` are returned when the index holds fewer vectors.
    """
    # FAISS searches float32 vectors; be explicit instead of passing float64.
    query_embedding = np.asarray([embedding_model.embed_query(query)], dtype="float32")
    _, neighbor_ids = index.search(query_embedding, k)  # distances are not needed
    # When the index has fewer than k vectors FAISS fills the remaining slots
    # with -1. Used as a Python index, -1 would silently return the LAST chunk,
    # so those padding entries are dropped.
    return [chunks[i] for i in neighbor_ids[0] if i >= 0]

# Smoke-test retrieval with a question that appears in the PDF's own Q&A text,
# using the default k=3, and print the chunks that come back.
query = "Where did Tejas complete his undergraduate studies?"
retrieved_texts = retrieve_similar_text(query)
print("Retrieved Text:", retrieved_texts)


Retrieved Text: ["his\n \nundergraduate\n \nstudies?\n \nVishwakarma\n \nInstitute\n \nof\n \nTechnology\n \nin\n \nPune,\n \nIndia.\n \nWhat\n \nwas\n \nTejas's\n \nmajor\n \nduring\n \nhis\n \nbachelor's\n \ndegree?\n \nElectronics\n \nand\n \nTelecommunication\n \nEngineering.\n \nWhich\n \nprogramming\n \nlanguages\n \nis\n \nTejas\n \nproficient\n \nin?\n \nPython,\n \nR,\n \nC++,\n \nGo,\n \nSQL\n \n(Postgres),\n \nJavaScript,\n \nand\n \nHTML/CSS.\n \nWhat\n \nframeworks\n \nhas\n \nTejas\n \nworked\n \nwith?\n \nReact,\n \nNode.js,\n \nFlask,\n \nFastAPI,\n \nSpring\n \nBoot,\n \nSpark,\n \nHadoop,", "What\n \ndegree\n \nis\n \nTejas\n \nGaikwad\n \ncurrently\n \npursuing?\n \nMaster\n \nof\n \nScience\n \nin\n \nComputer\n \nScience\n \nat\n \nthe\n \nState\n \nUniversity\n \nof\n \nNew\n \nYork\n \nat\n \nBuffalo.\n \nWhich\n \ncourses\n \nhas\n \nTejas\n \ncompleted\n \nduring\n \nhis\n \nmaster's\n \nprogram?\n \nMachine\n \nLearning,\n \nAnalysis\n \nof\n \nAlgorithms,\n \

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the tokenizer and causal-LM weights for the DeepSeek-R1 checkpoint.
# NOTE(review): this names the full-size R1 checkpoint - confirm it fits the
# available memory/hardware before running this cell.
model_name = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # load weights in half precision
    device_map="auto",  # leave device placement to the library
)

def generate_response(question):
    """Answer ``question`` with the language model, grounded in retrieved chunks.

    Relies on the module-level ``retrieve_similar_text``, ``tokenizer`` and
    ``model``.

    Args:
        question: The user's question as a string.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed. NOTE(review): for a causal LM, ``generate`` normally returns
        the prompt tokens followed by the continuation, so the returned string
        is expected to start with the prompt - confirm if only the answer is
        wanted.
    """
    # Retrieve the chunks most similar to the question to use as context.
    context = retrieve_similar_text(question)
    # Join outside the f-string: a backslash inside an f-string expression is a
    # SyntaxError on Python versions before 3.12.
    context_text = "\n".join(context)
    prompt = f"Context:\n{context_text}\n\nQuestion: {question}\nAnswer:"

    # Tokenize (truncating over-long prompts) and move the tensors to wherever
    # the model actually lives. The model is loaded with device_map="auto", so
    # a hard-coded "cuda"/"cpu" choice may not match its placement.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device)

    # temperature/top_p only take effect when sampling is enabled; without
    # do_sample=True they are ignored and decoding is greedy.
    output = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Ask one question end-to-end (retrieval followed by generation) and print the
# text returned by generate_response.
question = "What degree is Tejas Gaikwad currently pursuing?"
response = generate_response(question)
print(response)
