In [7]:
pip install groq

Collecting groqNote: you may need to restart the kernel to use updated packages.

  Downloading groq-0.18.0-py3-none-any.whl.metadata (14 kB)
Downloading groq-0.18.0-py3-none-any.whl (121 kB)
Installing collected packages: groq
Successfully installed groq-0.18.0


In [None]:
import os
import re
import pandas as pd
from groq import Groq

# --- Setup Groq client ---
api_key = os.getenv("GROQ_API_KEY") or "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"
client = Groq(api_key=api_key)

# --- Load CSV files ---
file_path_main = "OG_Placement.csv"
file_path_rank = "JOSAA AIR rank data.csv"

df_main = pd.read_csv(file_path_main)
df_rank = pd.read_csv(file_path_rank)

# Ensure rank columns are numeric
df_rank["Opening Rank"] = pd.to_numeric(df_rank["Opening Rank"], errors="coerce")
df_rank["Closing Rank"] = pd.to_numeric(df_rank["Closing Rank"], errors="coerce")
df_rank.dropna(subset=["Opening Rank", "Closing Rank"], inplace=True)

# Normalize column names and key string columns
df_rank.columns = df_rank.columns.str.strip()
df_main.columns = df_main.columns.str.strip()

df_rank["Seat Type"] = df_rank["Seat Type"].str.strip().str.lower()
df_rank["Gender"] = df_rank["Gender"].str.strip()
df_rank["Institute"] = df_rank["Institute"].str.strip().str.lower()
df_rank["Academic Program Name"] = df_rank["Academic Program Name"].str.strip().str.lower()
df_main["Institute"] = df_main["Institute"].str.strip().str.lower()
df_main["Academic Program Name"] = df_main["Academic Program Name"].str.strip().str.lower()

# Merge the datasets on common columns
df = pd.merge(df_rank, df_main, on=["Institute", "Academic Program Name"], how="inner")

# --- Compute Weighted Median CTC ---
for year in [2024, 2023, 2022]:
    df[f"Median CTC {year}"] = pd.to_numeric(df[f"Median CTC {year}"], errors="coerce")
df["Weighted Median CTC"] = (
    df["Median CTC 2024"] * 0.5 +
    df["Median CTC 2023"] * 0.3 +
    df["Median CTC 2022"] * 0.2
)
# Assume Weighted Median CTC is already in lakhs.

# --- Helper Functions for User Details ---
def map_gender(user_gender):
    user_gender = user_gender.lower()
    female_terms = ["female", "girl", "woman", "lady", "she", "her"]
    male_terms = ["male", "boy", "man", "he", "him"]
    if any(term in user_gender for term in female_terms):
        return "Female-only (including Supernumerary)"
    if any(term in user_gender for term in male_terms):
        return "Gender-Neutral"
    return "Gender-Neutral"

def map_category(user_category):
    # Normalize category mapping
    category_mapping = {"general": "open", "open": "open"}
    return category_mapping.get(user_category.lower(), user_category.lower())

def filter_colleges(category_rank, category, gender):
    # Map the provided category
    mapped_category = map_category(category.strip().lower())
    gender = map_gender(gender.strip())
    filtered_df = df[
        (df["Seat Type"] == mapped_category) &
        (df["Gender"] == gender) &
        (df["Opening Rank"] <= category_rank) &
        (df["Closing Rank"] >= category_rank)
    ]
    return filtered_df

# --- Branch Synonym Mapping ---
branch_synonyms = {
    "computer science": "computer science",
    "computer": "computer science",
    "cse": "computer science",
    "electronics": "electronics and communication",
    "electronics and communication": "electronics and communication",
    "ece": "electronics and communication",
    "electrical and electronics": "electrical and electronics",
    "electrical": "electrical and electronics",
    "eee": "electrical and electronics",
    "mechanical": "mechanical",
    "mech": "mechanical",
    "chemical": "chemical",
    "che": "chemical",
    "mnc": "mathematics and computing",
    "mathematics and computing": "mathematics and computing",
    "ai": "ai",  # for exact match
    "artificial intelligence": "ai",
    "ai related": "ai",  # treat all AI related as same category
    "physics": "physics and engineering physics",
    "engineering physics": "physics and engineering physics",
    # Also consider "circuital" as a preferred keyword.
    "circuital": "circuital"
}

branch_order = {
    "computer science": 0,
    "electronics and communication": 0,
    "electrical and electronics": 0,
    "mechanics": 1,
    "chemical": 2,
    "mathematics and computing": 3,
    "physics and engineering physics": 4
}

def canonical_branch(program_name):
    prog = program_name.lower()
    for key, canon in branch_synonyms.items():
        if key in prog:
            if canon == "circuital":
                for candidate in ["computer science", "electronics and communication", "electrical and electronics", "mathematics and computing"]:
                    if candidate in prog:
                        return candidate
            return canon
    return None

def get_branch_order(program_name):
    canon = canonical_branch(program_name)
    if canon and canon in branch_order:
        return branch_order[canon]
    return max(branch_order.values(), default=100) + 1

# --- Points Mappings for Branches and Colleges ---
points_branch_dict = {
    "computer science": 46,          # Computer science (4 year)
    "computer science (5 year)": 40,  # Note: You may need to detect 5-year CS separately if needed.
    "mnc": 35,
    "electronics and communication": 30,
    "electrical and electronics": 24,
    "chemical": 18,
    "mechanical": 17,
    "ai": 40,           # AI
    "ai related": 38,   # any AI related branch
    "statistics and data sciences": 32.5
}

points_college_dict = {
    "bombay": 77,
    "madras": 69,
    "kanpur": 62,
    "delhi": 58,
    "kharagpur": 53,
    "varanasi": 39,  # for BHU, we check if 'bhu' or 'varanasi' exists.
    "roorkee": 49,
    "guwahati": 47,
    "hyderabad": 43,
    "ism dhanbad": 35,
    "indore": 34,
    "gandhinagar": 32,
    "jodhpur": 30,
    "ropar": 29,
    "mandi": 28,
    "patna": 33,
    "bhubanshwar": 27,
    "tirupati": 25,
    "jammu": 23,
    "pallakad": 22,
    "bhilai": 21
}

def get_branch_points(canonical):
    # Return points if the branch exists in the mapping.
    if canonical in points_branch_dict:
        return points_branch_dict[canonical]
    return None

def get_college_points(institute):
    institute_lower = institute.lower()
    for key, val in points_college_dict.items():
        if key in institute_lower:
            return val
    return None

# --- Sorting Function ---
def sort_colleges(filtered_df, preference, special_branch=None):
    # If user preference is branch, we now compute the formula.
    if preference == "branch":
        # Only consider rows where the branch has a defined point.
        def compute_formula(row):
            canon = canonical_branch(row["Academic Program Name"])
            branch_pts = get_branch_points(canon)
            college_pts = get_college_points(row["Institute"])
            # Only compute if both points are available.
            if branch_pts is not None and college_pts is not None:
                # Calculate formula: 0.8*(point_branch * point_college/4) + 0.2*(Weighted Median CTC)
                return 0.8 * ((branch_pts * college_pts) / 4) + 0.2 * row["Weighted Median CTC"]
            else:
                return None

        # Compute the formula and filter out rows where formula could not be computed.
        filtered_df = filtered_df.copy()
        filtered_df["formula"] = filtered_df.apply(compute_formula, axis=1)
        filtered_df = filtered_df[filtered_df["formula"].notnull()]
        sorted_df = filtered_df.sort_values(by="formula", ascending=False)
        # Remove the formula column before display.
        sorted_df = sorted_df.drop(columns=["formula"])
        return sorted_df

    else:  # preference == "placement"
        # Sort in descending order of Weighted Median CTC.
        sorted_df = filtered_df.sort_values(by="Weighted Median CTC", ascending=False)
        return sorted_df

# --- Conversation History and Persistent State ---
conversation_history = []  # Stores conversation snippets
# current_state persists details (category_rank, category, gender, preference)
current_state = {"category_rank": None, "category": None, "gender": None, "preference": None}
special_branch_interest = None

print("Start chatting with Deepseek via Groq! Type 'exit' to end the conversation.\n")

while True:
    user_input = input("You: ")
    if user_input.strip().lower() == "exit":
        print("Conversation ended.")
        break

    conversation_history.append("User: " + user_input)
    
    try:
        # Build prompt for extraction.
        context = "\n".join(conversation_history[-5:])
        extraction_prompt = f"""
Below is the conversation so far:
{context}

Extract the following details from the latest user query.
If a detail is not mentioned, output "N/A" for that field.

Fields:
1. Category Rank (an integer, e.g., 6705)
2. Category (seat type, e.g., GENERAL, OBC, SC, etc.)
3. Gender (either MALE or indicate Female-only, e.g., FEMALE)
4. Preference (either "placement" or "branch")

Output in the exact format:
Category Rank: <value>
Category: <value>
Gender: <value>
Preference: <value>

User Query:
"{user_input}"
"""
        extraction_completion = client.chat.completions.create(
            model="deepseek-r1-distill-llama-70b",
            messages=[{"role": "user", "content": extraction_prompt}],
            temperature=0.6,
            max_completion_tokens=512,
            top_p=0.95,
            stream=False,
            stop=None,
        )
        gemini_output = extraction_completion.choices[0].message.content.strip()
        print("Deepseek Extraction Response:", gemini_output)
        conversation_history.append("Bot (extraction): " + gemini_output)
        
        # Extract details using regex.
        rank_match = re.search(r"Category Rank:\s*([0-9]+|N/A)", gemini_output, re.IGNORECASE)
        category_match = re.search(r"Category:\s*([\w\s-]+|N/A)", gemini_output, re.IGNORECASE)
        gender_match = re.search(r"Gender:\s*([\w\s-]+|N/A)", gemini_output, re.IGNORECASE)
        pref_match = re.search(r"Preference:\s*([\w\s-]+|N/A)", gemini_output, re.IGNORECASE)
        
        # Update state from extraction only if the field is provided (not N/A).
        if rank_match:
            rank_val = rank_match.group(1).strip()
            if rank_val.upper() != "N/A":
                current_state["category_rank"] = int(rank_val)
        if category_match:
            cat_val = category_match.group(1).strip()
            if cat_val.upper() == "N/A" or cat_val == "":
                current_state["category"] = None  # Will prompt later
            else:
                current_state["category"] = cat_val.upper()
        if gender_match:
            gen_val = gender_match.group(1).strip()
            if gen_val.upper() != "N/A":
                current_state["gender"] = gen_val.upper()
        if pref_match:
            pref_val = pref_match.group(1).strip().lower()
            if pref_val.upper() != "N/A":
                current_state["preference"] = pref_val
        
        # For any missing detail, prompt the user individually.
        if current_state["category_rank"] is None:
            inp = input("Please provide your Category Rank (e.g., 6705): ").strip()
            while not inp.isdigit():
                inp = input("Invalid input. Please enter a numeric Category Rank: ").strip()
            current_state["category_rank"] = int(inp)
        if current_state["category"] is None:
            inp = input("Please provide your Category (e.g., GENERAL, OBC, SC): ").strip()
            # Default to GENERAL if user enters nothing
            current_state["category"] = inp.upper() if inp.strip() != "" else "GENERAL"
        if current_state["gender"] is None:
            inp = input("Please provide your Gender (MALE/FEMALE): ").strip()
            current_state["gender"] = inp.upper()
        if current_state["preference"] is None:
            inp = input("Please provide your Preference (placement/branch): ").strip().lower()
            # Default to placement if nothing is provided
            current_state["preference"] = inp if inp != "" else "placement"
        
        # Check for special branch interest from current query.
        if "circuital" in user_input.lower():
            special_branch_interest = "circuital"
        else:
            for branch in branch_synonyms.keys():
                if branch in user_input.lower():
                    special_branch_interest = branch
                    break
            else:
                if special_branch_interest is None:
                    special_branch_interest = None
        
        print("Using details - Category Rank: {}, Category: {}, Gender: {}, Preference: {}"
              .format(current_state["category_rank"], current_state["category"],
                      current_state["gender"], current_state["preference"]))
        if special_branch_interest:
            print("Special branch interest detected:", special_branch_interest)
        conversation_history.append("Bot: Using state - " + str(current_state))
        
        # Filter colleges based on state.
        result_df = filter_colleges(current_state["category_rank"],
                                    current_state["category"],
                                    current_state["gender"])
        if result_df.empty:
            msg = "No colleges found for your criteria. Please check your details."
            print(msg)
            conversation_history.append("Bot: " + msg)
            continue
        
        sorted_df = sort_colleges(result_df, current_state["preference"], special_branch=special_branch_interest)
        # Display output according to preference.
        if current_state["preference"] == "placement":
            output = sorted_df[["Institute", "Academic Program Name", "Opening Rank", "Closing Rank", "Weighted Median CTC"]].to_string(index=False)
        else:
            output = sorted_df[["Institute", "Academic Program Name", "Opening Rank", "Closing Rank"]].to_string(index=False)
        print(output)
        conversation_history.append("Bot (final output): " + output)
    
    except Exception as e:
        error_msg = "Error: " + str(e)
        print(error_msg)
        conversation_history.append("Bot: " + error_msg)


Start chatting with Deepseek via Groq! Type 'exit' to end the conversation.



You:  my rank is 2200 and i belong to general category and i am male and i prefer branch over placement 


Deepseek Extraction Response: <think>
Okay, so I've got this user query to process. Let me read it again: "my rank is 2200 and i belong to general category and i am male and i prefer branch over placement." I need to extract specific details from this.

First, the Category Rank. The user mentioned "my rank is 2200." That seems straightforward. So, Category Rank should be 2200. Got it.

Next, the Category. They said "i belong to general category." So, the category is GENERAL. I should make sure it's in uppercase as per the example.

Then, the Gender. The user stated, "i am male." So, Gender is MALE. Again, uppercase.

Lastly, the Preference. They said, "i prefer branch over placement." So, the preference is branch, which translates to "branch" in the output. So, Preference is branch.

I think that's all the details. No need for N/A since all fields are covered.
</think>

Category Rank: 2200  
Category: GENERAL  
Gender: MALE  
Preference: branch
Using details - Category Rank: 2200, Cate

You:  my category rank is 600 and i belong to EWS category. i am a male. 


Deepseek Extraction Response: Category Rank: 600  
Category: EWS  
Gender: MALE  
Preference: N/A
Using details - Category Rank: 600, Category: EWS  
GENDER, Gender: MALE  
PREFERENCE, Preference: n
No colleges found for your criteria. Please check your details.


You:  my category rank is 600 and i belong to EWS category. i am a male. i prefer branch 


Deepseek Extraction Response: <think>
Alright, I need to extract specific details from the user's latest query. Let's break it down step by step.

First, the user mentions their category rank is 600. That's straightforward, so I'll note that as 600 under Category Rank.

Next, they state they belong to the EWS category. EWS stands for Economically Weaker Section, so that's the category.

The user also mentions they are male, so the gender field should be MALE.

Then, they say, "i prefer branch." From the context, it seems like they're indicating a preference related to their branch choice. The possible options for preference are "placement" or "branch," so this should be "branch."

I don't see any other details mentioned, so all other fields not provided by the user should be N/A, but in this case, all required fields are covered.
</think>

Category Rank: 600  
Category: EWS  
Gender: MALE  
Preference: branch
Using details - Category Rank: 600, Category: EWS  
GENDER, Gender: MALE  
PR