In [1]:
import os
import google.generativeai as genai

# Configure Gemini API
genai.configure(api_key="AIzaSyCcwlM1dfH8YGO_mIJCv2GshH5ItdwgODE")

# Function to chunk text into smaller pieces
def chunk_text(text, chunk_size=1000):
    words = text.split()
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# Read and process each file in the dataset folder
dataset_folder = "dataset"
chunks = {}

for file in os.listdir(dataset_folder):
    if file.endswith(".txt"):
        with open(os.path.join(dataset_folder, file), "r", encoding="utf-8") as f:
            text = f.read()
        
        chunks[file] = chunk_text(text)

print("✅ Preprocessing Complete! Text chunks created successfully.")

# Save preprocessed chunks (optional)
import json

with open("dataset/preprocessed_chunks.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f, ensure_ascii=False, indent=4)

print("✅ Preprocessed data saved as 'preprocessed_chunks.json'")

✅ Preprocessing Complete! Text chunks created successfully.
✅ Preprocessed data saved as 'preprocessed_chunks.json'


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

# Load preprocessed data
with open("dataset/preprocessed_chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Clean data
cleaned_chunks = {}
for file, text_chunks in chunks.items():
    unique_chunks = list(set(chunk.strip() for chunk in text_chunks if len(chunk.strip()) > 10))  # Remove empty/short
    cleaned_chunks[file] = unique_chunks

# Save cleaned data
with open("dataset/cleaned_chunks.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_chunks, f, ensure_ascii=False, indent=4)

print("✅ Preprocessed data cleaned and saved as 'cleaned_chunks.json'")

✅ Preprocessed data cleaned and saved as 'cleaned_chunks.json'


In [3]:
pip install faiss-cpu sentence-transformers google-generativeai

Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp311-cp311-win_amd64.whl.metadata (4.5 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.6.0-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.15.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting Pillow (from sentence-transformers)
  Using cached pillow-11.1.0-cp311-cp311-win_amd64.whl.metadata (9.3 kB)
Collecting filelock (from hug

In [4]:
import os
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load cleaned data
with open("dataset/cleaned_chunks.json", "r", encoding="utf-8") as f:
    cleaned_chunks = json.load(f)

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Prepare data for FAISS
text_list = []
vector_list = []

for file, text_chunks in cleaned_chunks.items():
    for chunk in text_chunks:
        text_list.append(chunk)
        vector_list.append(model.encode(chunk))  # Convert text to embedding

# Convert to numpy array
vector_array = np.array(vector_list, dtype=np.float32)

# Initialize FAISS index
index = faiss.IndexFlatL2(vector_array.shape[1])
index.add(vector_array)  # Add vectors to FAISS

# Save FAISS index and text mappings
faiss.write_index(index, "dataset/faiss_index_cleaned.bin")

with open("dataset/text_mappings_cleaned.json", "w", encoding="utf-8") as f:
    json.dump(text_list, f, ensure_ascii=False, indent=4)

print("✅ FAISS index and cleaned text mappings saved!")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


✅ FAISS index and cleaned text mappings saved!


In [5]:
import faiss
import numpy as np
import json
import os
from sentence_transformers import SentenceTransformer
import google.generativeai as genai

# ✅ Configure Gemini API with error handling
API_KEY = "AIzaSyCcwlM1dfH8YGO_mIJCv2GshH5ItdwgODE"  # Replace with your actual API key
if not API_KEY or "AIza" not in API_KEY:
    raise ValueError("❌ Invalid API Key! Please provide a valid Gemini API key.")

genai.configure(api_key=API_KEY)

# ✅ Load FAISS index & text mappings
FAISS_INDEX_PATH = "dataset/faiss_index_cleaned.bin"
TEXT_MAPPING_PATH = "dataset/text_mappings_cleaned.json"

if not os.path.exists(FAISS_INDEX_PATH):
    raise FileNotFoundError(f"❌ FAISS index not found at {FAISS_INDEX_PATH}")

if not os.path.exists(TEXT_MAPPING_PATH):
    raise FileNotFoundError(f"❌ Text mapping file not found at {TEXT_MAPPING_PATH}")

index = faiss.read_index(FAISS_INDEX_PATH)

with open(TEXT_MAPPING_PATH, "r", encoding="utf-8") as f:
    text_list = json.load(f)

# ✅ Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ Function to retrieve relevant text using FAISS
def retrieve_relevant_chunks(query, top_k=3):
    query_vector = np.array([model.encode(query)], dtype=np.float32)
    distances, indices = index.search(query_vector, top_k)
    
    return [text_list[i] for i in indices[0] if i < len(text_list)]  # Ensure valid indices

# ✅ Function to query Gemini API
def query_gemini(user_query):
    context = "\n\n".join(retrieve_relevant_chunks(user_query))  # Get relevant text
    
    prompt = f"Use the following context to answer:\n\n{context}\n\nUser Question: {user_query}"
    
    model = genai.GenerativeModel("gemini-1.5-pro-latest")
    response = model.generate_content(prompt)
    
    return response.text if response and hasattr(response, "text") else "❌ No response from Gemini API."

# ✅ Example query
user_question = "What are the courses offered at KSSEM?"
answer = query_gemini(user_question)

print("\nChatbot Response:", answer)


Chatbot Response: The provided text doesn't list the specific courses offered at KSSEM. It mentions programs like BE/B.Tech, MBA, and MCA in the context of checking VTU results, implying that these programs are likely offered.  It also mentions research opportunities in AI, IoT, Robotics, VLSI, and renewable energy, suggesting related courses might exist.  However, without a course catalog or departmental information, it's impossible to provide a definite list of all courses.



In [6]:
import google.generativeai as genai

genai.configure(api_key="AIzaSyB917x1rn4U9QaRj_T_N6g07T9khb89reM")

models = genai.list_models()
for model in models:
    print(model.name)

models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01-21
models/gemini-2.0-flash-thinking-exp
models/gemini-2.0-flash-thinking-exp-1219
models/learnlm-1.5-pro-experim

In [7]:
print("FAISS Index Size:", index.ntotal)
print("Sample Text Chunk:", text_list[:2])  # Print first 2 chunks

FAISS Index Size: 18
Sample Text Chunk: ["must do a dissertation/project work in their final semester. 15. Are professors available for research with students? Response: Yes, KSSEM encourages student participation in research. Some faculty members guide students in AI, IoT, Robotics, VLSI, and renewable energy projects. There are opportunities to publish papers and participate in hackathons, conferences, and project exhibitions. 16. Is it popular to study abroad? Response: Studying abroad is not very common, but some students pursue higher studies (MS, MBA) in the USA, Canada, and Germany after graduation. KSSEM provides guidance for GRE, TOEFL, and IELTS preparations through its career development programs. CAMPUS LIFE:- Life as a First-Year Student at KSSEM: 1. What's it like to be a first-year student here? Response: Being a first-year student at KSSEM is an exciting yet challenging experience. The college provides orientation programs to help new students adjust. Faculty members ar

In [8]:
import faiss
import numpy as np
import json
from sentence_transformers import SentenceTransformer

# Load FAISS index & text mappings
index = faiss.read_index("dataset/faiss_index_cleaned.bin")
with open("dataset/text_mappings_cleaned.json", "r", encoding="utf-8") as f:
    text_list = json.load(f)

# Load embedding model correctly
model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to retrieve relevant chunks using FAISS
def retrieve_relevant_chunks(query, top_k=3):
    query_vector = np.array([model.encode(query)], dtype=np.float32)
    distances, indices = index.search(query_vector, top_k)
    return [text_list[i] for i in indices[0]]  # Return top-k chunks

# Function to query Gemini API (Ensure this is defined)
def query_gemini(user_query):
    context = "\n\n".join(retrieve_relevant_chunks(user_query))  # Get relevant text
    
    prompt = f"""You are an AI chatbot providing accurate information about an educational institution.  
    Use the context below to answer the user's question:  

    ### Context:  
    {context}  

    ### User Question:  
    {user_query}  

    ### Answer:"""

    model = genai.GenerativeModel("gemini-1.5-pro")
    response = model.generate_content(prompt)
    
    return response.text if response else "❌ I couldn't find an answer. Try rephrasing!"

# Test the function
user_question = "What are the courses offered at KSSEM?"
retrieved_chunks = retrieve_relevant_chunks(user_question)

print("🔹 Retrieved Chunks:")
for i, chunk in enumerate(retrieved_chunks, 1):
    print(f"{i}. {chunk}")

answer = query_gemini(user_question)
print("\n🤖 Chatbot Response:", answer)

🔹 Retrieved Chunks:
1. 1.The official working hours for K.S. School of Engineering and Management (KSSEM) are not explicitly stated in the available sources. However, the library operates from Monday to Saturday, 8:40 a.m. to 4:00 p.m., extending to 5:00 p.m. during regular exams. Additionally, an AI & ML offline training program was scheduled from 9:00 a.m. to 4:00 p.m. These timings suggest that the college's operational hours likely align with a typical academic schedule, starting in the morning and concluding in the late afternoon. 2.The lunch hours at K.S. School of Engineering and Management (KSSEM) are from 12:35 PM to 1:20 PM. 3.The canteen at K.S. School of Engineering and Management (KSSEM) offers a diverse menu catering to various culinary preferences. While specific dishes are not listed in the available sources, the canteen provides a wide variety of food options, ranging from Chinese to South Indian cuisines. The facility emphasizes cleanliness and hygiene, ensuring that 

In [9]:
import faiss
import numpy as np
import json
from sentence_transformers import SentenceTransformer

# Load FAISS index & text mappings
index = faiss.read_index("dataset/faiss_index_cleaned.bin")
with open("dataset/text_mappings_cleaned.json", "r", encoding="utf-8") as f:
    text_list = json.load(f)

# Load embedding model correctly
model = SentenceTransformer("all-MiniLM-L6-v2")

# Improved retrieval function
def retrieve_relevant_chunks(query, top_k=3):
    query_vector = np.array([model.encode(query)], dtype=np.float32)
    distances, indices = index.search(query_vector, top_k)
    
    retrieved_texts = [text_list[i] for i in indices[0]]

    # Keyword filtering to improve relevance
    keywords = ["course", "program", "degree", "B.E", "B.Tech", "M.Tech", "MBA"]
    filtered_texts = [text for text in retrieved_texts if any(kw in text.lower() for kw in keywords)]

    # If keyword filtering removes all, return original retrieved texts
    return filtered_texts if filtered_texts else retrieved_texts  

# Function to query Gemini API
def query_gemini(user_query):
    context = "\n\n".join(retrieve_relevant_chunks(user_query))  # Get relevant text
    
    prompt = f"""You are an AI chatbot providing accurate information about an educational institution.  
    Use the context below to answer the user's question:  

    ### Context:  
    {context}  

    ### User Question:  
    {user_query}  

    ### Answer:"""

    model = genai.GenerativeModel("gemini-1.5-pro")
    response = model.generate_content(prompt)
    
    return response.text if response else "❌ I couldn't find an answer. Try rephrasing!"

# Test the function
user_question = "What are the courses offered at KSSEM?"
retrieved_chunks = retrieve_relevant_chunks(user_question)

print("🔹 Retrieved Chunks:")
for i, chunk in enumerate(retrieved_chunks, 1):
    print(f"{i}. {chunk}")

answer = query_gemini(user_question)
print("\n🤖 Chatbot Response:", answer)

🔹 Retrieved Chunks:
1. 1.The official working hours for K.S. School of Engineering and Management (KSSEM) are not explicitly stated in the available sources. However, the library operates from Monday to Saturday, 8:40 a.m. to 4:00 p.m., extending to 5:00 p.m. during regular exams. Additionally, an AI & ML offline training program was scheduled from 9:00 a.m. to 4:00 p.m. These timings suggest that the college's operational hours likely align with a typical academic schedule, starting in the morning and concluding in the late afternoon. 2.The lunch hours at K.S. School of Engineering and Management (KSSEM) are from 12:35 PM to 1:20 PM. 3.The canteen at K.S. School of Engineering and Management (KSSEM) offers a diverse menu catering to various culinary preferences. While specific dishes are not listed in the available sources, the canteen provides a wide variety of food options, ranging from Chinese to South Indian cuisines. The facility emphasizes cleanliness and hygiene, ensuring that 

In [13]:
def query_gemini(user_query):
    retrieved_chunks = retrieve_relevant_chunks(user_query)  # Get relevant text
    context = "\n\n".join(retrieved_chunks)  # Convert list to string
    
    if not context:  # If no relevant context is found, handle it
        context = "No relevant information was found in the knowledge base."
    
    prompt = f"""You are an AI chatbot providing accurate information about an educational institution.  
    Use the context below to answer the user's question:  

    ### Context:  
    {context}  

    ### User Question:  
    {user_query}  

    ### Answer:"""
    
    model = genai.GenerativeModel("gemini-1.5-pro-latest")  # Choose a working model
    response = model.generate_content(prompt)
    
    return response.text if response and hasattr(response, "text") else "❌ No response from Gemini API."

In [14]:
user_question = "What are the courses offered at KSSEM?"
answer = query_gemini(user_question)
print("\nChatbot Response:", answer)


Chatbot Response: This question cannot be answered from the given context. While the context provides information about campus life, facilities, and student activities at KSSEM, it does not list the specific courses offered.

