In [11]:
# Cell 1: Environment, config, and imports

# Core
import os
import gc
import json
import math
import warnings
warnings.filterwarnings("ignore")

# Data
import numpy as np
import pandas as pd

# Sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import FeatureHasher
from sklearn.base import BaseEstimator, TransformerMixin, clone

# Persistence
import joblib

# Reproducibility
RANDOM_STATE = 42

# Paths
DATA_PATH = "comedk_features_core.csv"  # ensure this file is in the working directory
ARTIFACT_DIR = "artifacts_fe"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Target assumption (change if modeling a different target)
TARGET_COL = "Cutoff_Rank"  # common use case: predict rank cutoffs
SEED = RANDOM_STATE

print("Setup complete.")


Setup complete.


In [12]:
# Cell 2: Load data and basic sanity checks

df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist()[:20], "...")  # preview first 20
print(df.head(3))
print(df.dtypes.value_counts())

# Basic strip and normalize object columns
obj_cols = df.select_dtypes(include=["object"]).columns.tolist()
for c in obj_cols:
    df[c] = df[c].astype(str).str.strip()

# Remove exact duplicate rows (if any)
dup_count = df.duplicated().sum()
if dup_count:
    df = df.drop_duplicates().reset_index(drop=True)
    print(f"Dropped duplicates: {dup_count}")

# Optional: ensure target presence
if TARGET_COL not in df.columns:
    raise ValueError(f"Configured TARGET_COL='{TARGET_COL}' not found in data columns.")


Shape: (2272, 38)
Columns: ['College_Code', 'College_Name', 'Category', 'Branch', 'Cutoff_Rank', 'Year', 'Round', 'Exam_Type', 'Rank_Category', 'City', 'Min_Round', 'Max_Round', 'Round_Count', 'Best_Rank', 'Worst_Rank', 'Avg_Rank_Combo', 'Rank_Range', 'Trend_Signal', 'CTX_Seats', 'CTX_Avg_Rank'] ...
  College_Code                                       College_Name Category  \
0         E001  Acharya Institute of Technology-Soladevanahall...       GM   
1         E001  Acharya Institute of Technology-Soladevanahall...       GM   
2         E001  Acharya Institute of Technology-Soladevanahall...      KKR   

                                          Branch  Cutoff_Rank  Year  Round  \
0                    AE-Aeronautical Engineering        30316  2024      1   
1  AI-Artificial Intelligence & Machine Learning        18743  2024      1   
2                    AE-Aeronautical Engineering        69087  2024      1   

  Exam_Type Rank_Category       City  ...     Branch_TE  City_Avg_Rank  \

In [14]:
# Cell 3: Clean → Filter (COMEDK 2024 Round-1 GM) → CS-related summary

import pandas as pd
import numpy as np

# 1) Load
df = pd.read_csv("comedk_features_core.csv")

# 2) Basic cleaning
for col in ["College_Code","College_Name","Category","Branch","Exam_Type","Rank_Category","City"]:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()

# Safe numeric coercions
num_cols = ["Cutoff_Rank","Year","Round","Best_Rank","Worst_Rank","Rank_Percentile"]
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Derive short Branch_Code and canonical CS-related flag
def get_branch_code(branch):
    if pd.isna(branch):
        return np.nan
    if isinstance(branch, (list, tuple)):
        branch = branch[0] if branch else np.nan
        if pd.isna(branch):
            return np.nan
    # partition avoids building a list; takes the part before the first hyphen
    return str(branch).partition('-')[0].strip()


df["Branch_Code"] = (
    df["Branch"]
      .astype("string")
      .str.split('-', n=1)
      .str[0]
      .str.strip()
)


cs_like_codes = {
    "CS","IS","AI","AD","CD","CI","CB","CG","CE","CN","CNW","CY","IC","CSD","COS","CST","CO"
}
df["Is_CS_Related"] = df["Branch_Code"].isin(cs_like_codes)

# 3) Filter to COMEDK 2024 Round-1 GM
mask = (
    (df.get("Year", np.nan) == 2024) &
    (df.get("Exam_Type", "") == "COMEDK") &
    (df.get("Round", np.nan) == 1) &
    (df.get("Category", "") == "GM")
)
base = df.loc[mask].copy()

# 4) Subset to CS-related
cs_df = base.loc[base["Is_CS_Related"] == True].copy()

# 5) Summary: per college-branch
agg_map = {
    "Cutoff_Rank": ["min","max","mean","count"],
}
if "Rank_Percentile" in cs_df.columns:
    agg_map["Rank_Percentile"] = ["mean"]

summary = (
    cs_df
    .groupby(["College_Name","City","Branch","Branch_Code"], dropna=False)
    .agg(agg_map)
)

# Flatten MultiIndex columns
summary.columns = ["_".join([c for c in col if c]) for col in summary.columns.values]
summary = summary.reset_index()

# Sort by best (min) Cutoff_Rank asc
summary = summary.sort_values(by="Cutoff_Rank_min", ascending=True)

# 6) State-wide top 25 CS rows
top25_state = summary.head(25)

# 7) Bengaluru-only top 25 (optional)
blr_top25 = (
    summary.loc[summary["City"].str.lower() == "bangalore"]
    .head(25)
)

# 8) Display key outputs
print("CS-related | COMEDK 2024 | Round-1 | GM | rows:", len(cs_df))
print(top25_state[["College_Name","City","Branch","Cutoff_Rank_min","Cutoff_Rank_mean","Cutoff_Rank_max"]].to_string(index=False))

print("\nBengaluru-only Top 25:")
print(blr_top25[["College_Name","City","Branch","Cutoff_Rank_min","Cutoff_Rank_mean","Cutoff_Rank_max"]].to_string(index=False))

print(df["Branch"].head(10).to_list())
print(sorted(df["Branch_Code"].dropna().unique())[:30])


CS-related | COMEDK 2024 | Round-1 | GM | rows: 472
                                                                                                                     College_Name      City                                                                                    Branch  Cutoff_Rank_min  Cutoff_Rank_mean  Cutoff_Rank_max
                                                                                R V College of Engineering-Mysore Road, Bengaluru Bangalore                                                         CS-Computer Science & Engineering              193             193.0              193
                                                                                R V College of Engineering-Mysore Road, Bengaluru Bangalore                                          CD-Computer Science & Engineering (Data Science)              274             274.0              274
                                                                                R V College of Enginee

In [17]:
# Feature engineering: leakage-safe categorical encodings + numeric preprocessing

import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Ensure transformed outputs come back as pandas DataFrames with column names
set_config(transform_output="pandas")

# ---- NORMALIZE COLUMN NAMES ----
# Strip whitespace, lowercase, replace internal spaces with underscores
df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(r"\s+", "_", regex=True)
)

# ---- CONFIG ----
# Default expected target; will validate and auto-select if necessary
target_col = "closing_rank"

# ---- ROBUST TARGET SELECTION ----
if target_col not in df.columns:
    # Prefer 'cutoff_rank' if present
    if "cutoff_rank" in df.columns:
        target_col = "cutoff_rank"
    else:
        # Find all columns mentioning 'rank'
        rank_like = df.filter(regex=r"rank", axis=1).columns.tolist()

        # Exclude likely non-target descriptors
        exclude_keywords = ("percentile", "band", "category", "range")
        rank_like = [c for c in rank_like if not any(k in c for k in exclude_keywords)]

        # Keep only numeric candidates
        rank_like_numeric = [c for c in rank_like if pd.api.types.is_numeric_dtype(df[c])]

        # Priority 1: names containing 'closing' or 'cutoff'
        preferred = [c for c in rank_like_numeric if ("closing" in c or "cutoff" in c)]

        # Choose by priority, else by most non-missing values
        if len(preferred) >= 1:
            target_col = preferred
        elif len(rank_like_numeric) >= 1:
            non_null_counts = {c: df[c].notna().sum() for c in rank_like_numeric}
            target_col = max(non_null_counts, key=non_null_counts.get)
        else:
            raise KeyError(
                f"Target column not found. Rank-like numeric candidates detected: {rank_like_numeric}. "
                "Please set target_col explicitly."
            )

print(f"Using target column: {target_col}")

# ---- COLUMN DETECTION ----
# After target resolution, compute selectors
numeric_cols = (
    df.select_dtypes(include=[np.number])
      .columns.drop([target_col], errors="ignore")
      .tolist()
)

categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Split categorical features by cardinality threshold
if len(categorical_cols) > 0:
    cardinality = df[categorical_cols].nunique()
else:
    cardinality = pd.Series(dtype=int)

high_card_threshold = 15  # tweak if needed
high_card_cols = cardinality[cardinality > high_card_threshold].index.tolist()
low_card_cols  = [c for c in categorical_cols if c not in high_card_cols]

# ---- PREPROCESSORS ----
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

low_cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# TargetEncoder with internal cross-fitting to reduce leakage
high_cat_enc = TargetEncoder(smooth="auto", cv=5, random_state=42)

# Build transformers list only with non-empty selections
transformers = []
if len(numeric_cols) > 0:
    transformers.append(("num", num_pipe, numeric_cols))
if len(low_card_cols) > 0:
    transformers.append(("low_cat", low_cat_pipe, low_card_cols))
if len(high_card_cols) > 0:
    transformers.append(("high_cat", high_cat_enc, high_card_cols))

preprocess = ColumnTransformer(
    transformers=transformers,
    remainder="drop",
    verbose_feature_names_out=False,
)

# ---- SPLIT, FIT, TRANSFORM ----
X = df.drop(columns=[target_col])  # succeeds after target detection/cleaning
y = df[target_col]

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on train (y is passed to TargetEncoder), transform both
X_train_fe = preprocess.fit_transform(X_train, y_train)
X_valid_fe = preprocess.transform(X_valid)

# Optional: persist engineered features
X_train_fe.to_csv("X_train_features.csv", index=False)
X_valid_fe.to_csv("X_valid_features.csv", index=False)

# If needed later:
# feature_names = X_train_fe.columns.tolist()


Using target column: cutoff_rank


ValueError: n_splits=5 cannot be greater than the number of members in each class.