In [None]:
# %pip installs into the environment of the running kernel. The "torch" extra
# pulls in PyTorch and accelerate, which the Trainer used below depends on.
%pip install "transformers[torch]" datasets scikit-learn


In [None]:
import json
import random

data = []

# 🎯 GREETING INTENTS
# One training utterance per element. Every element needs its own comma:
# Python silently concatenates adjacent string literals, which would merge
# two utterances into a single bogus example.
greeting_examples = [
    "Hi there",
    "Hello!",
    "Hey!",
    "Good morning",
    "Good evening",
    "How are you?",
    "Hi, can you help me?",
    "Hey, is this Mahindra University's assistant?",
    "Yo!",
    "Hi assistant",
    "Hello, anyone there?",
    "Greetings!",
    "Hey, good to see you",
    "Hi, nice to meet you",
    "hello", "hi", "hey", "yo", "hello there", "hi there", "sup", "heya"
]

# 📘 FAQ INTENTS
# Training utterances for the "faq" intent. The same 20 questions are stored
# together with their answers in the `faq_data` list defined in a later cell.
faq_examples = [
    "What is the eligibility for B.Tech admission?",
    "What are the hostel facilities available?",
    "When do the classes for the Fall semester begin?",
    "Is there a placement cell at Mahindra University?",
    "How much is the tuition fee for M.Tech?",
    "Are there scholarships for undergraduate students?",
    "Can I apply without a JEE Main score?",
    "Is there a management quota for admissions?",
    "How can I pay my fee online?",
    "What is the last date to apply for PhD?",
    "What companies come for placements?",
    "How many seats are available in CSE?",
    "Are the hostels air-conditioned?",
    "Do you offer a Data Science program?",
    "Can I change my branch after 1st year?",
    "What is the refund policy for admissions?",
    "How is the ECE department at Mahindra University?",
    "Do you have international exchange programs?",
    "What is the minimum CGPA required for placements?",
    "How do I contact the admissions office?"
]

# Helper to format
def add_examples(intent, examples):
    """Label every utterance in ``examples`` with the same ``intent``.

    Returns a new list of ``{"text": utterance, "intent": intent}`` dicts, in
    the same order as ``examples``.
    """
    labelled = []
    for utterance in examples:
        labelled.append({"text": utterance, "intent": intent})
    return labelled

# Build training dataset (greeting + faq only)
data += add_examples("greeting", greeting_examples)
data += add_examples("faq", faq_examples)

# Shuffle with a dedicated, seeded generator so the saved file is identical on
# every "Restart & Run All" and the global `random` state is left untouched.
random.Random(42).shuffle(data)

# Save to JSON file (explicit encoding so the result does not depend on the
# platform's default text encoding).
with open("mu_intents.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)

print(f"✅ Dataset saved with {len(data)} examples to mu_intents.json")


In [None]:
import json
import csv

# Load the intent records written by the dataset-building cell.
with open("mu_intents.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Save as CSV with a header row and the two columns the training cell reads.
# The file is written as UTF-8 explicitly so that non-ASCII utterances added
# later are not mangled by a platform-specific default encoding.
with open("mu_intents.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["text", "intent"])
    writer.writeheader()
    writer.writerows(data)

print("✅ Converted mu_intents.json to mu_intents.csv")


In [None]:
from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder

# Step 1: Read the CSV produced by the conversion cell as a single "train" split
dataset = load_dataset("csv", data_files={"train": "mu_intents.csv"}, split="train")

# Step 2: Fit the encoder on every intent string present in the dataset
label_encoder = LabelEncoder()
labels = [example['intent'] for example in dataset]
label_encoder.fit(labels)

# Step 3: Attach the encoder's integer id to each row under the "label" column
def _with_label(example):
    """Return the extra ``label`` column value for one dataset row."""
    return {"label": label_encoder.transform([example["intent"]])[0]}

dataset = dataset.map(_with_label)

# Step 4: Keep both directions of the id <-> intent-name mapping for the model config
label_list = list(label_encoder.classes_)
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

# Step 5: Tokenization
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
def tokenize(batch):
    """Tokenize the ``text`` column of a batch of dataset rows.

    Pads only up to the longest utterance in the batch instead of the
    tokenizer's maximum length: the utterances here are a few words long, so
    padding each one to the model maximum only adds wasted computation.
    Over-long inputs are still truncated.
    """
    # NOTE(review): rows from different `map` batches may end up with different
    # lengths; the Trainer below receives this tokenizer, so its default
    # collator is expected to re-pad each training batch — confirm if the
    # Trainer arguments change.
    return tokenizer(batch["text"], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenize, batched=True)

# Step 6: Load model
# The classification head is sized from the number of distinct intents, and the
# id <-> name mappings are stored in the model config so predictions can be
# reported by intent name.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


# Training configuration: 4 epochs, batch size 8 per device, logging and a
# checkpoint once per epoch, and no external experiment tracker.
training_args = TrainingArguments(
    output_dir="./mu_intent_model",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    logging_dir="./logs",
    logging_strategy="epoch",
    save_strategy="epoch",
    report_to="none"
)


# Trainer: fine-tunes the classifier on the whole tokenized dataset
# (no evaluation split is configured).
trainer = Trainer(
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    args=training_args,
    model=model
)

# Train
trainer.train()

# Save the model and its tokenizer into the same directory so that the
# directory can be loaded as a unit later.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./mu_intent_model")
print("✅ Model trained and saved to ./mu_intent_model")


In [None]:
from transformers import pipeline

# Load the trained model
# "./mu_intent_model" is the directory the training cell saved the fine-tuned
# model and tokenizer to.
classifier = pipeline("text-classification", model="./mu_intent_model")

# Intent inference function
def classify_intent(text, threshold=0.80):
    """Classify ``text`` and return ``(intent, confidence)``.

    The intent is the classifier's top label, unless its confidence is below
    ``threshold``, in which case ``"open_query"`` is returned instead. The
    confidence is returned unchanged in both cases.
    """
    top = classifier(text)[0]
    predicted, confidence = top["label"], top["score"]
    return ("open_query" if confidence < threshold else predicted), confidence

# Sample test queries
# Every element needs its own comma: adjacent string literals are silently
# concatenated, which would turn two queries into one meaningless query.
test_inputs = [
    "Hi there!",
    "Good morning, how are you?",
    "What is the eligibility for M.Tech?",
    "Can you explain deep learning?",
    "How to apply for hostel?",
    "Yo!",
    "Do you offer data science courses?",
    "What are quantum computers?",
    "Hey, nice to meet you",
    "Can I apply without JEE?",
    "hello",
    "hi",
    "hello", "hi", "hey", "yo", "hello there", "hi there", "sup", "heya"
]
# Paraphrases of the FAQ training questions plus related questions that do not
# appear in the training set.
test_inputs += [
    "Can you tell me the admission criteria for B.Tech?",
    "What are the hostel rooms like?",
    "When does the academic year start?",
    "Is there any placement assistance available?",
    "How much do I have to pay for the M.Tech course?",
    "Are UG students eligible for any scholarships?",
    "Can I get into the college without JEE?",
    "Is there a special quota for management students?",
    "Where do I go to pay my fees online?",
    "By when should I apply for PhD programs?",
    "Which companies come for campus hiring?",
    "How many CSE seats do you offer?",
    "Do hostels come with AC?",
    "Is Data Science part of your course offerings?",
    "Can I switch my stream after one year?",
    "If I cancel, will I get a refund?",
    "How good is the ECE branch at MU?",
    "Any opportunities for studying abroad?",
    "What's the CGPA needed to sit for placements?",
    "Who do I contact for admission queries?",
    "Are late fee charges applicable?",
    "How do I apply for scholarships again?",
    "What facilities are there in the library?",
    "Do you have clubs or societies for students?",
    "Is attendance mandatory for classes?",
    "How many years does it take to complete B.Tech?",
    "What are the timings of the admission office?",
    "Is there transportation available to campus?",
    "Do you offer M.Tech in AI or ML?",
    "Can I visit the college before admission?",
    "What is the total intake for all programs?"
]


# Run test: classify each sample query in order and print the predicted intent
# with its confidence (two decimals).
print("📊 Testing intent classification:\n")
for query, (intent, score) in zip(test_inputs, map(classify_intent, test_inputs)):
    print(f"Query: {query}\n→ Predicted Intent: {intent} (Confidence: {score:.2f})\n")


In [None]:
import json

# Question/answer pairs for FAQ retrieval. The chatbot cells load these from
# faq_dataset.json, embed the "question" strings, and return the "answer" of
# the nearest question as context for the reply.
faq_data = [
    {
        "question": "What is the eligibility for B.Tech admission?",
        "answer": "To be eligible for B.Tech at Mahindra University, candidates must have completed 10+2 with Physics, Chemistry, and Mathematics, and have a valid score in JEE Mains or SAT."
    },
    {
        "question": "What are the hostel facilities available?",
        "answer": "Mahindra University offers separate hostel facilities for boys and girls with options for single, double, and triple occupancy rooms. Hostels are equipped with Wi-Fi, mess, laundry, and recreational areas."
    },
    {
        "question": "When do the classes for the Fall semester begin?",
        "answer": "The Fall semester at Mahindra University typically begins in the first week of August. The exact dates are announced in the academic calendar on the university website."
    },
    {
        "question": "Is there a placement cell at Mahindra University?",
        "answer": "Yes, Mahindra University has a dedicated Career Services Office that assists students with internships, placements, resume building, and interview preparation."
    },
    {
        "question": "How much is the tuition fee for M.Tech?",
        "answer": "The tuition fee for M.Tech programs at Mahindra University is approximately ₹2,50,000 per year, but it may vary by department. Please check the admissions portal for updated details."
    },
    {
        "question": "Are there scholarships for undergraduate students?",
        "answer": "Yes, Mahindra University offers merit-based and need-based scholarships for undergraduate students. Eligibility criteria and application details are available on the scholarship section of the website."
    },
    {
        "question": "Can I apply without a JEE Main score?",
        "answer": "Yes, students can also apply through SAT scores if they do not have JEE Main scores. Both are accepted for B.Tech admissions."
    },
    {
        "question": "Is there a management quota for admissions?",
        "answer": "No, Mahindra University does not have a management quota. All admissions are purely based on merit through JEE/SAT scores and academic performance."
    },
    {
        "question": "How can I pay my fee online?",
        "answer": "Fees can be paid online through the university's payment portal using debit/credit cards, net banking, or UPI. Login credentials will be provided after the admission process."
    },
    {
        "question": "What is the last date to apply for PhD?",
        "answer": "PhD applications are generally open twice a year, and the last date for submission is typically announced on the official website. Check the 'Admissions → PhD' section regularly."
    },
    {
        "question": "What companies come for placements?",
        "answer": "Companies like Microsoft, Infosys, TCS, Deloitte, Bosch, and L&T are among the frequent recruiters at Mahindra University. Placement reports are published annually on the website."
    },
    {
        "question": "How many seats are available in CSE?",
        "answer": "The Computer Science and Engineering (CSE) program at Mahindra University typically offers around 240 seats. This may vary slightly year to year."
    },
    {
        "question": "Are the hostels air-conditioned?",
        "answer": "Yes, selected hostel rooms are air-conditioned. Students can choose AC or non-AC options during hostel allotment."
    },
    {
        "question": "Do you offer a Data Science program?",
        "answer": "Yes, Mahindra University offers B.Tech and M.Tech programs with specialization in Data Science and Artificial Intelligence."
    },
    {
        "question": "Can I change my branch after 1st year?",
        "answer": "Yes, branch change is permitted after the first year based on academic performance and availability of seats in the desired department."
    },
    {
        "question": "What is the refund policy for admissions?",
        "answer": "Refunds are processed as per UGC guidelines. The percentage refunded depends on how early the withdrawal is requested before the start of classes."
    },
    {
        "question": "How is the ECE department at Mahindra University?",
        "answer": "The ECE department has experienced faculty, modern labs, and strong industry ties. It also supports research in VLSI, IoT, and communication systems."
    },
    {
        "question": "Do you have international exchange programs?",
        "answer": "Yes, Mahindra University has international collaborations with universities in the US, Europe, and Asia for semester exchange and research internships."
    },
    {
        "question": "What is the minimum CGPA required for placements?",
        "answer": "The minimum CGPA required varies by company, but generally a CGPA of 6.0 or above is required to be eligible for most placements."
    },
    {
        "question": "How do I contact the admissions office?",
        "answer": "You can contact the admissions office via email at admissions@mahindrauniversity.edu.in or call the official helpline number listed on the university website."
    }
]

# Save as JSON: serialise first, then write the finished text in one go.
serialized = json.dumps(faq_data, indent=4)
with open("faq_dataset.json", "w") as f:
    f.write(serialized)

print("✅ FAQ dataset saved to faq_dataset.json")


In [None]:
# %pip installs into the environment of the running kernel. spaCy and its small
# English model are required by the admission-flow cell further down
# (`import spacy`, `spacy.load("en_core_web_sm")`).
%pip install transformers sentence-transformers faiss-cpu langchain spacy
import sys
!{sys.executable} -m spacy download en_core_web_sm


In [None]:
## only slm and classifier!!
# ------------------ Imports ------------------
import json
import numpy as np
import faiss
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from langchain.memory import ConversationBufferMemory

# ------------------ Load Intent Classifier ------------------
# "./mu_intent_model" is the directory the training cell saved the fine-tuned
# model and tokenizer to.
classifier = pipeline("text-classification", model="./mu_intent_model")

def classify_intent(text, threshold=0.8):
    """Classify ``text`` and return ``(intent, confidence)``.

    The text is lower-cased and stripped before it is passed to the classifier.
    When the top prediction's confidence is below ``threshold`` the intent is
    reported as ``"open_query"``; the confidence is returned either way.
    """
    normalized = text.lower().strip()
    top = classifier(normalized)[0]
    confidence = top["score"]
    if confidence >= threshold:
        intent = top["label"]
    else:
        intent = "open_query"
    return intent, confidence

# ------------------ Load FAQ Dataset ------------------
# faq_dataset.json is written by the FAQ-dataset cell above.
with open("faq_dataset.json", "r") as f:
    faq_data = json.load(f)

# Parallel lists: answers[i] is the answer to questions[i].
questions = [item["question"] for item in faq_data]
answers = [item["answer"] for item in faq_data]

# ------------------ Embed + Store in FAISS ------------------
# Only the questions are embedded; row i of the index corresponds to questions[i].
embedder = SentenceTransformer("all-MiniLM-L6-v2")
question_embeddings = embedder.encode(questions, convert_to_numpy=True)

# Flat L2 index sized to the embedding width.
dimension = question_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(question_embeddings)

# ------------------ Load FLAN-T5-XL ------------------
# NOTE: `tokenizer` and `model` reuse the names the training cell gave to the
# DistilBERT tokenizer and model; after this cell they refer to FLAN-T5.
flan_model_name = "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(flan_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(flan_model_name)

# ------------------ LangChain Memory ------------------
# Conversation turns are saved under the "query" input key by the chatbot handler.
memory = ConversationBufferMemory(memory_key="chat_history", input_key="query")

# ------------------ Prompt Builder ------------------
def build_prompt(intent, query, context=None):
    """Assemble the text prompt sent to the language model.

    Args:
        intent: ``"greeting"`` or ``"faq"`` select a dedicated template; any
            other value (or ``"faq"`` without ``context``) uses the bare
            history + user-turn fallback.
        query: The user's message.
        context: FAQ answer text to ground the reply; only used for ``"faq"``.

    Returns:
        The prompt string, ending in ``"Assistant:"``.
    """
    chat_history = memory.buffer or ""

    # Keep the previous turns and the new "User:" line on separate lines.
    # Without a separator the last history line and "User:" run together.
    if chat_history and not chat_history.endswith("\n"):
        history_block = chat_history + "\n"
    else:
        history_block = chat_history

    if intent == "greeting":
        return (
            "You are Mahindra University's friendly virtual assistant.\n"
            f"{history_block}"
            f"User: {query}\n"
            "Reply with a warm and respectful greeting. Offer help related to admissions, courses, or hostel facilities.\n"
            "Assistant:"
        )

    elif intent == "faq" and context:
        return (
            "You are Mahindra University's helpful assistant.\n"
            f"{history_block}"
            f"User: {query}\n"
            f"University Info: {context}\n"
            "Answer the user's question clearly, respectfully, and helpfully in your own words.\n"
            "Assistant:"
        )

    # Fallback template (unchanged): it already separates history and user turn.
    return f"{chat_history}\nUser: {query}\nAssistant:"

# ------------------ FLAN Response Generator ------------------
def generate_flan_response(prompt):
    """Generate one sampled reply for ``prompt`` and return it as clean text.

    The prompt is truncated to 512 tokens before generation. The decoded reply
    has surrounding whitespace and straight/curly double quotes removed.
    """
    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Sampling settings: up to 200 new tokens, no beam search.
    sampling = dict(
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_k=5,
        top_p=0.95,
        num_beams=1,
    )
    generated = model.generate(**encoded, **sampling)

    reply = tokenizer.decode(generated[0], skip_special_tokens=True)
    return reply.strip().strip('"“”').strip()

# ------------------ Chatbot Handler ------------------
def handle_query_with_memory(query):
    """Answer one user message and record the exchange in ``memory``.

    Returns ``(intent, reply)``. A blank message gets a fixed reply with intent
    ``"open_query"`` and is not saved to memory. Greetings are answered by the
    language model, FAQ questions by the language model grounded on the nearest
    stored FAQ answer, and everything else by a fixed hand-off message.
    """
    query = query.strip()
    if not query:
        return "open_query", "I'm here if you need anything related to Mahindra University."

    intent, _ = classify_intent(query)

    # Start from the hand-off message and replace it for the intents we can serve.
    bot_reply = "That sounds like an open question. Let me connect you to someone who can assist further."

    if intent == "greeting":
        bot_reply = generate_flan_response(build_prompt("greeting", query))

    elif intent == "faq":
        # Nearest stored question (k=1); its position selects the answer.
        _, neighbours = faiss_index.search(np.array(embedder.encode([query])), k=1)
        best = neighbours[0][0]
        if best < len(answers):
            faq_prompt = build_prompt("faq", query, context=answers[best])
            bot_reply = generate_flan_response(faq_prompt)
        else:
            bot_reply = "Sorry, I couldn't find relevant info in our FAQ database."

    memory.save_context({"query": query}, {"output": bot_reply})
    return intent, bot_reply

# ------------------ Demo ------------------
# Interactive console loop: reads messages until the user types "exit"
# (in any letter case) and prints each reply tagged with its intent.
print("🤖 MahindraGPT (Flan-T5-XL Edition) is ready. Type 'exit' to quit.\n")

chatting = True
while chatting:
    user_input = input("👤 You: ")
    chatting = user_input.lower() != "exit"
    if chatting:
        intent, response = handle_query_with_memory(user_input)
        print(f"🤖 [{intent}] {response}\n")


In [None]:
# ------------------ Imports ------------------
import json
import numpy as np
import faiss
import spacy
from spacy.matcher import Matcher
from pathlib import Path
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from langchain.memory import ConversationBufferMemory

# ------------------ Intent Classifier ------------------
# "./mu_intent_model" is the directory the training cell saved the fine-tuned
# model and tokenizer to.
classifier = pipeline("text-classification", model="./mu_intent_model")

def classify_intent(text, threshold=0.8):
    """Return ``(intent, confidence)`` for ``text``.

    The text is lower-cased and stripped before classification. A top
    prediction whose confidence is below ``threshold`` is reported as
    ``"open_query"``; the confidence itself is always returned as is.
    """
    prediction = classifier(text.lower().strip())[0]
    confidence = prediction["score"]
    if confidence >= threshold:
        return prediction["label"], confidence
    return "open_query", confidence

# ------------------ Load FAQ Dataset ------------------
# faq_dataset.json is written by the FAQ-dataset cell above.
with open("faq_dataset.json", "r") as f:
    faq_data = json.load(f)

# Parallel lists: answers[i] is the answer to questions[i].
questions = [item["question"] for item in faq_data]
answers = [item["answer"] for item in faq_data]

# ------------------ Embed Questions in FAISS ------------------
# Only the questions are embedded; row i of the index corresponds to questions[i].
embedder = SentenceTransformer("all-MiniLM-L6-v2")
question_embeddings = embedder.encode(questions, convert_to_numpy=True)

# Flat L2 index sized to the embedding width.
dimension = question_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(question_embeddings)

# ------------------ Load FLAN-T5-XL ------------------
# NOTE: `tokenizer` and `model` reuse the names the training cell gave to the
# DistilBERT tokenizer and model; after this cell they refer to FLAN-T5.
flan_model_name = "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(flan_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(flan_model_name)

# ------------------ LangChain Memory ------------------
# Conversation turns are saved under the "query" input key by the chatbot handler.
memory = ConversationBufferMemory(memory_key="chat_history", input_key="query")

# ------------------ SpaCy + Matcher for Entity Extraction ------------------
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Single-token, case-insensitive role cues. "parent" and "guardian" are listed
# explicitly: without them a reply such as "I'm a parent" only matched the
# pronoun "i" and was therefore classified as a student.
# NOTE(review): pronouns ("my", "i", "am") are still cues on their own, so a
# sentence can match both roles — confirm how the caller should break ties.
matcher.add("PARENT", [[{"LOWER": {"IN": ["my", "son", "daughter", "child", "parent", "guardian"]}}]])
matcher.add("STUDENT", [[{"LOWER": {"IN": ["i", "am", "student", "myself"]}}]])

def extract_admission_entities(query):
    """Extract admission-related details from a user message.

    Returns a dict that may contain:
        * ``"name"``: a PERSON entity from spaCy, overridden by the word that
          follows "my name is" / "i am" when such a phrase is present;
        * ``"role"``: ``"parent"`` or ``"student"`` from the token matcher
          (the last match in the message wins);
        * ``"program"``: ``"B.Tech"`` or ``"M.Tech"``;
        * one key per other spaCy entity label (lower-cased), holding the
          first entity text seen for that label.
    """
    import re  # local import: `re` is not among this cell's imports

    doc = nlp(query)
    entities = {}

    # Extract via SpaCy NER
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entities["name"] = ent.text
        elif ent.label_.lower() not in entities:
            entities[ent.label_.lower()] = ent.text

    def _word_after(marker):
        """Return the word following the last ``marker`` (any letter case), or None."""
        pieces = re.split(re.escape(marker), query, flags=re.IGNORECASE)
        if len(pieces) < 2:
            return None
        # Drop punctuation glued to the word, e.g. "Ravi." or "Ravi,".
        return pieces[-1].strip().split(" ")[0].strip(".,!?;:")

    # Rule-based name detection. The phrase is located case-insensitively in
    # the ORIGINAL text: looking for the lower-case phrase in the original
    # failed on "My name is Ravi" and stored "My" as the name.
    name = _word_after("my name is")
    if name is None:
        name = _word_after("i am")
    # Skip empty results and articles ("I am a student" is not a name).
    if name and name.lower() not in {"a", "an", "the"}:
        entities["name"] = name

    # Role detection
    matches = matcher(doc)
    for match_id, start, end in matches:
        label = nlp.vocab.strings[match_id].lower()
        if label == "parent":
            entities["role"] = "parent"
        elif label == "student":
            entities["role"] = "student"

    # Program detection
    lowered = query.lower()
    if "b.tech" in lowered:
        entities["program"] = "B.Tech"
    elif "m.tech" in lowered:
        entities["program"] = "M.Tech"

    return entities


# ------------------ Save Student Info to JSON ------------------
student_data_file = Path("student_info.json")

def store_student_info(entities, query):
    """Append one record describing the enquirer to ``student_data_file``.

    The record holds the original ``query`` plus the ``role``, ``name`` and
    ``program`` found in ``entities`` (``"unknown"`` / ``"unspecified"`` when
    missing). The file contains a JSON list; it is created on first use and
    rewritten in full on every call.
    """
    record = {"source_query": query}
    for field, default in (("role", "unknown"), ("name", "unspecified"), ("program", "unspecified")):
        record[field] = entities.get(field, default)

    # Start from the records already on disk, if any.
    records = []
    if student_data_file.exists():
        with open(student_data_file, "r") as src:
            records = json.load(src)

    records.append(record)

    with open(student_data_file, "w") as dst:
        json.dump(records, dst, indent=4)

# ------------------ Prompt Builder ------------------
def build_prompt(intent, query, context=None):
    """Assemble the text prompt sent to the language model.

    Args:
        intent: ``"greeting"`` or ``"faq"`` select a dedicated template; any
            other value (or ``"faq"`` without ``context``) uses the bare
            history + user-turn fallback.
        query: The user's message.
        context: FAQ answer text to ground the reply; only used for ``"faq"``.

    Returns:
        The prompt string, ending in ``"Assistant:"``.
    """
    chat_history = memory.buffer or ""

    # Keep the previous turns and the new "User:" line on separate lines.
    # Without a separator the last history line and "User:" run together.
    if chat_history and not chat_history.endswith("\n"):
        history_block = chat_history + "\n"
    else:
        history_block = chat_history

    if intent == "greeting":
        return (
            "You are Mahindra University's friendly virtual assistant.\n"
            f"{history_block}"
            f"User: {query}\n"
            "Reply with a warm and respectful greeting. Offer help related to admissions, courses, or hostel facilities.\n"
            "Assistant:"
        )

    elif intent == "faq" and context:
        return (
            "You are Mahindra University's helpful assistant.\n"
            f"{history_block}"
            f"User: {query}\n"
            f"University Info: {context}\n"
            "Answer the user's question clearly, respectfully, and helpfully in your own words.\n"
            "Assistant:"
        )

    # Fallback template (unchanged): it already separates history and user turn.
    return f"{chat_history}\nUser: {query}\nAssistant:"

# ------------------ FLAN Generator ------------------
def generate_flan_response(prompt):
    """Generate one sampled reply for ``prompt`` and return it as clean text.

    The prompt is truncated to 512 tokens before generation. The decoded reply
    has surrounding whitespace and straight/curly double quotes removed.
    """
    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Sampling settings: up to 200 new tokens, no beam search.
    sampling = dict(
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_k=5,
        top_p=0.95,
        num_beams=1,
    )
    generated = model.generate(**encoded, **sampling)

    reply = tokenizer.decode(generated[0], skip_special_tokens=True)
    return reply.strip().strip('"“”').strip()

# ------------------ Admission Dialogue Flow ------------------
# Module-level conversation state, read and updated by the admission handlers:
#   dialogue_state: "start" -> "identify_user" -> "collect_info" -> "end"
#   user_role:      "student" / "parent" once the user has said which they are
dialogue_state = "start"
user_role = None

# Declarative description of the flow: state -> trigger -> reply and next state.
# NOTE(review): nothing else in this notebook reads this table;
# handle_admission_flow hard-codes its own replies, and its reply to a parent
# is worded differently from the one below. Confirm which wording is intended.
ADMISSION_FLOW = {
    "start": {
        "admission": {
            "response": "Great! Are you the student or a parent asking on behalf of someone?",
            "next_state": "identify_user"
        }
    },
    "identify_user": {
        "student": {
            "response": "Nice! Can I know your name and what program you're interested in?",
            "next_state": "collect_info"
        },
        "parent": {
            "response": "Sure! Please share the student's name and the program they are considering.",
            "next_state": "collect_info"
        }
    },
    "collect_info": {
        "info_provided": {
            "response": "Thanks! You can find admission eligibility and deadlines on the official portal. Would you like the link?",
            "next_state": "end"
        }
    }
}

def handle_admission_flow(intent, entities, query):
    """Advance the admission conversation by one turn.

    Uses and updates the module-level ``dialogue_state`` and ``user_role``.

    Args:
        intent: Intent of the current message; only ``"admission"`` can start
            (or restart) the flow.
        entities: Dict of extracted details; ``"role"`` is read while
            identifying the user, ``"name"`` / ``"program"`` while collecting
            info. Its ``"role"`` key is overwritten before the record is stored.
        query: The raw user message, stored with the collected details.

    Returns:
        The assistant's reply for this step, or ``None`` when the message does
        not move the flow forward (the caller then handles it normally).
    """
    global dialogue_state, user_role

    # Step 1: Initial Entry. A finished conversation ("end") is treated like a
    # fresh one; otherwise the flow could never be entered a second time and
    # every later admission request would fall through unanswered.
    if intent == "admission" and dialogue_state in ("start", "end"):
        dialogue_state = "identify_user"
        user_role = None  # do not carry a role over from a previous conversation
        return "Great! Are you the student or a parent asking on behalf of someone?"

    # Step 2: Identify User Role
    elif dialogue_state == "identify_user":
        role = entities.get("role")
        if role:
            user_role = role  # Store for future use

        if role == "student":
            dialogue_state = "collect_info"
            return "Nice! Can I know your name and what program you're interested in?"

        elif role == "parent":
            dialogue_state = "collect_info"
            return "Sure! Please share your child’s name and the program they wish to apply for."

    # Step 3: Collect Info
    elif dialogue_state == "collect_info":
        if "name" in entities or "program" in entities:
            dialogue_state = "end"
            # The role was captured in step 2; this message's own role cue is ignored.
            entities["role"] = user_role or "unknown"
            store_student_info(entities, query)
            return "Thanks! You can find admission eligibility and deadlines on the official portal. Would you like the link?"

    return None




# ------------------ Chatbot Handler ------------------
def handle_query_with_memory(query):
    """Answer one user message, routing it through the admission flow when needed.

    Returns ``(intent, reply)``. Order of handling:
      1. blank message -> fixed reply, intent "open_query", nothing saved;
      2. if an admission conversation is in progress, try to continue it;
      3. otherwise decide the intent (admission keywords first, then the
         classifier) and answer via the admission flow, the greeting prompt,
         the FAQ retrieval prompt, or the fixed hand-off message.
    Every reply except the blank-message one is saved to ``memory``.
    """
    global dialogue_state
    query = query.strip()
    if not query:
        return "open_query", "I'm here if you need anything related to Mahindra University."

    # ✅ Stay inside admission flow if mid-dialogue
    if dialogue_state in ["identify_user", "collect_info"]:
        lowered = query.lower()

        # 🎯 Direct role assignment if the whole reply is one of these exact phrases;
        # anything else goes through entity extraction.
        if lowered in ["student", "i am a student"]:
            entities = {"role": "student"}
        elif lowered in ["parent", "i am a parent"]:
            entities = {"role": "parent"}
        else:
            entities = extract_admission_entities(query)

        # 🚀 Force 'admission' intent in dialogue flow
        flow_response = handle_admission_flow("admission", entities, query)
        if flow_response:
            memory.save_context({"query": query}, {"output": flow_response})
            return "admission", flow_response
        # No reply from the flow: the message is handled below like any other.

    # 🔍 Admission keyword detection (bypass intent model if obvious)
    # NOTE(review): this is a substring test, so a keyword also matches inside
    # longer words, and FAQ questions containing e.g. "eligibility" are routed
    # to the admission flow instead of the FAQ branch — confirm this is intended.
    admission_keywords = ["admission", "apply", "join", "enroll", "eligibility"]
    if any(word in query.lower() for word in admission_keywords):
        intent = "admission"
    else:
        intent, _ = classify_intent(query)

    # 🧠 Extract entities (only consumed by the admission branch below)
    entities = extract_admission_entities(query)

    # 🎓 Handle Admission
    if intent == "admission":
        flow_response = handle_admission_flow(intent, entities, query)
        if flow_response:
            memory.save_context({"query": query}, {"output": flow_response})
            return intent, flow_response
        # Flow gave no reply: intent stays "admission" and the final `else`
        # below produces the hand-off message.

    # 👋 Handle Greeting
    if intent == "greeting":
        prompt = build_prompt("greeting", query)
        bot_reply = generate_flan_response(prompt)

    # 📘 Handle FAQ: the nearest stored question (k=1) selects the answer used as context
    elif intent == "faq":
        query_vec = embedder.encode([query])
        D, I = faiss_index.search(np.array(query_vec), k=1)
        idx = I[0][0]
        # With no matched answer, build_prompt falls back to its generic template.
        matched_answer = answers[idx] if idx < len(answers) else None
        prompt = build_prompt("faq", query, context=matched_answer)
        bot_reply = generate_flan_response(prompt)

    # ❓ Fallback for open queries
    else:
        bot_reply = "That sounds like an open question. Let me connect you to someone who can assist further."

    # 💾 Store memory for all outputs
    memory.save_context({"query": query}, {"output": bot_reply})
    return intent, bot_reply




# ------------------ Demo ------------------
# Interactive console loop: reads messages until the user types "exit"
# (in any letter case) and prints each reply tagged with its intent.
print("🤖 MahindraGPT (Flan-T5-XL Edition) is ready. Type 'exit' to quit.\n")

chatting = True
while chatting:
    user_input = input("👤 You: ")
    chatting = user_input.lower() != "exit"
    if chatting:
        intent, response = handle_query_with_memory(user_input)
        print(f"🤖 [{intent}] {response}\n")
