<a href="https://colab.research.google.com/github/Source-Code777/Machine_Learning_Projects/blob/main/Counselling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **GATHERING DATA USING PANDAS WEB SCRAPER FROM WBJEE WEBSITE**

In [None]:
import pandas as pd
url_23='https://admissions.nic.in/wbjeeb/Applicant/report/orcrreport.aspx?enc=b6w3EPyuw0C4FADZ4v1XmYUz0XFq314fzLjkE3wbM2xr/DbsjpvUS9LBCKXjSeSL'
tables_23=pd.read_html(url_23)
url_24='https://admissions.nic.in/wbjeeb/Applicant/report/orcrreport.aspx?enc=Nm7QwHILXclJQSv2YVS+7l8OpFY/O746kfneOXEneV50mv1B/txHsSKB11hFlsvw'
tables_24=pd.read_html(url_24)

**CREATING DATAFRAMES YEAR-WISE**

In [None]:
df_23=tables_23[0]
df_24=tables_24[0]

In [None]:
df_24.sample(5)

In [None]:
df_23.sample(5)

**RE-NAMING THE FEATURES**

In [None]:
def preprocess_dataframe(df, year):
    """Return a cleaned copy of one year's scraped rank table.

    The column labels are stripped of surrounding whitespace, the columns
    ``Institute``, ``Program``, ``Quota`` and ``Category`` (when present) are
    renamed to ``College_Name``, ``Branch``, ``Domicile`` and ``Reservation``,
    and a ``Year`` column filled with ``year`` is added.

    The input frame is not modified, so re-running the calling cell always
    starts from the same raw table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column labels are strings.
    year : int
        Value written to every row of the new ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned labels and the ``Year`` column.
    """
    rename_map = {
        "Institute": "College_Name",
        "Program": "Branch",
        "Quota": "Domicile",
        "Category": "Reservation"
    }

    # Copy first: assigning to ``.columns`` would otherwise relabel the
    # caller's frame in place.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.strip()
    cleaned = cleaned.rename(mapper=rename_map, axis=1)
    cleaned["Year"] = year

    return cleaned

**RENAMING THE COLUMNS AND ADDING THE YEAR COLUMN TO EACH DATAFRAME**

In [None]:
df_23 = preprocess_dataframe(df_23, 2023)
df_24 = preprocess_dataframe(df_24, 2024)

**CONCATENATING THE DATA-FRAMES**

In [None]:
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; keeping the
# per-year indexes would leave duplicate row labels, which makes label-based
# selection and index alignment ambiguous.
df=pd.concat([df_23,df_24],axis=0,ignore_index=True)

In [None]:
df.sample(5)

In [None]:
df.info()

In [None]:
df=df.drop(['Sr.No'],axis=1)

In [None]:
df.shape
#8093->ROWS,10->COLUMNS

In [None]:
df.isnull().sum()
#NO NULL VALUE

In [None]:
df['Reservation'].unique()

In [None]:
df['Branch'].unique()

In [None]:
#WE NEED TO REMOVE THE KEYWORD TFW FROM THE VALUES IN BRANCH COLUMN.
# BECAUSE IT IS CREATING DUPLICATE VALUES AND REGARDLESS WE ALREADY HAVE A RESERVATION COLUMN
#ALSO WE NEED TO REPLACE UNWANTED SYMBOLS
df['Branch']=df['Branch'].str.replace("TFW","",regex=False)
df['Branch']=df['Branch'].str.replace("Tfw","",regex=False)
df['Branch']=df['Branch'].str.replace("-","",regex=False)
df['Branch']=df['Branch'].str.replace(",","",regex=False)
# Drop empty parentheses LAST: "(TFW)" and "(Tfw)" both turn into "()" only
# after the removals above. Removing "()" earlier would leave the "(Tfw)" case
# behind, and the later parenthesis extraction would then yield an empty
# branch name.
df['Branch']=df['Branch'].str.replace("()","",regex=False)

In [None]:
# Text found inside the first "( ... )" of each branch name; NaN where the
# name contains no parentheses.
inside_parens = df["Branch"].str.extract(r"\(\s*(.*?)\s*\)")[0]

# Prefer the parenthesised text, fall back to the full name, then trim
# leading/trailing spaces.
df["Branch"] = inside_parens.fillna(df["Branch"]).str.strip()

# **DATA PREPROCESSING**

In [None]:
df=df.drop(['Stream'],axis=1)
#Dropping Stream because it has only a single value

# **CLEANING THE BRANCH (PROGRAM) COLUMN**

In [None]:
# Keep only letters and whitespace. The result must be assigned back:
# str.replace returns a new Series, so without the assignment this cell
# cleaned nothing.
df["Branch"] = df["Branch"].str.replace(r"[^a-zA-Z\s]", "", regex=True)

In [None]:
df['Branch'].value_counts()

# **CREATING A MAPPING FUNCTION AND APPLYING IT ON BRANCH COLUMN**

In [None]:
import re
def Cleaning_Func(df,column,min_threshold):
  """Add a ``<column>_short`` column with coarse labels for ``df[column]``.

  Values occurring fewer than ``min_threshold`` times are mapped to
  ``"other"``. Every remaining string is matched case-insensitively against an
  ordered list of keyword rules and replaced by the label of the first rule
  that matches; strings matching no rule are kept unchanged. Non-string values
  (for example NaN) that are not rare are passed through unchanged instead of
  raising.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding ``column``. It is modified in place: the new
      ``<column>_short`` column is added to it.
  column : str
      Name of the column to map.
  min_threshold : int
      Minimum number of occurrences for a value to keep its own label.

  Returns
  -------
  tuple
      ``(df, other_classes)`` where ``df`` is the same frame object and
      ``other_classes`` lists the distinct values mapped to ``"other"`` in
      order of first appearance.
  """
  import re  # local import keeps the function usable on its own

  values = df[column]
  class_counts = values.value_counts()
  rare_classes = set(class_counts[class_counts < min_threshold].index)
  other_classes = []

  # Ordered rules: the FIRST matching pattern wins, so the order below is part
  # of the behaviour (e.g. a name containing both "electronics" and
  # "electrical" is labelled ECE).
  rules = [
      (r"\bartificial intelligence\b", "AI"),
      (r"\bmachine learning\b", "AI"),
      (r"\bcomputer science\b", "CSE"),
      (r"\biot\b|internet of things\b", "IOT"),
      (r"\bbiotech\b|biotechnology\b", "BIO-TECH"),
      (r"\belectronics\b", "ECE"),
      (r"\bcivil\b", "CIVIL"),
      (r"\bmechanical\b", "MECHANICAL"),
      (r"\bchemical\b", "CHEMICAL"),
      (r"\bproduction\b", "PRODUCTION"),
      (r"\binformation\b", "IT"),
      (r"\belectrical\b", "EE"),
  ]
  compiled_rules = [(re.compile(pattern), label) for pattern, label in rules]

  def map_value(val):
    # Rare values are grouped before any keyword matching.
    if val in rare_classes:
      other_classes.append(val)
      return "other"
    # Missing / non-text entries have nothing to match against.
    if not isinstance(val, str):
      return val
    val_lower = val.lower()  # patterns are lowercase: case-insensitive match
    for pattern, label in compiled_rules:
      if pattern.search(val_lower):
        return label
    return val

  df[column + "_short"] = values.apply(map_value)

  # De-duplicate while preserving first-seen order, so the list is reproducible.
  other_classes = list(dict.fromkeys(other_classes))

  return df, other_classes

In [None]:
df, other_list =Cleaning_Func(df, column="Branch",min_threshold=10)
print("Values mapped to 'other':", other_list)

In [None]:
df.sample(5)

In [None]:
df=df.drop(["Branch"],axis=1)

In [None]:
#LET'S DROP THE PHRASE 'Round' from column Round as it is redundant
# regex=False makes the literal replacement explicit (as in the Branch
# cleanup); strip() removes the whitespace left behind once the word is gone.
# NOTE(review): the values are still strings here although 'round' is later
# treated as a numeric feature - consider converting once the raw values are
# confirmed to be purely numeric.
df['Round']=df['Round'].str.replace("Round","",regex=False).str.strip()
#ALSO REMOVE COMMAS FROM COLLEGE NAME
df['College_Name']=df['College_Name'].str.replace(",","",regex=False)

In [None]:
df['Seat Type'].unique()

In [None]:
df.sample(5)

In [None]:
df.isna().sum()
#NO NULL VALUES

# **VISUALIZING THE DATA**

In [None]:
#Year vs Closing_Rank outliers:

import seaborn as sns
import matplotlib.pyplot as plt

sns.boxplot(x="Year", y="Closing Rank", data=df)
plt.ticklabel_format(style='plain', axis='y')
plt.show()

In [None]:
#Top 10 Colleges by Median Closing Rank
plt.figure(figsize=(10,6))
top_colleges = df.groupby("College_Name")["Closing Rank"].median().sort_values().head(10)
sns.barplot(x=top_colleges.values, y=top_colleges.index)
plt.ticklabel_format(style='plain', axis='x')
plt.title("Top 10 Colleges by Median Closing Rank")
plt.show()

In [None]:
#Opening vs Closing Rank
plt.figure(figsize=(8,6))
sns.scatterplot(x="Opening Rank", y="Closing Rank", hue="Year", data=df, alpha=0.6)
plt.ticklabel_format(style='plain', axis='both')
plt.title("Opening vs Closing Rank")
plt.show()

In [None]:
df.info()

In [None]:
plt.figure(figsize=(10,6))
branch_rank = df.groupby("Branch_short")["Closing Rank"].mean().sort_values()
sns.barplot(x=branch_rank.values, y=branch_rank.index)
plt.ticklabel_format(style='plain', axis='x')
plt.title("Average Closing Rank by Branch")
plt.show()

In [None]:
plt.figure(figsize=(10,6))
sns.countplot(x="Year", hue="Branch_short", data=df, palette="Set2")
plt.title("Branch_Demand_Year_wise")
plt.legend(bbox_to_anchor=(1,1.05), loc='upper left')
plt.show()

# **LET'S DETECT OUTLIERS**

In [None]:
import numpy as np

def detect_outliers(df, column):
    """Select the rows whose ``column`` value lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    column : str
        Name of the numeric column to test.

    Returns
    -------
    tuple
        ``(outliers, lower, upper)``: the rows of ``df`` strictly below
        ``lower`` or strictly above ``upper``, followed by the two fences
        (``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``).
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower = first_quartile - 1.5 * spread
    upper = third_quartile + 1.5 * spread

    # Strict comparisons: values sitting exactly on a fence are not outliers.
    outside_fences = values.lt(lower) | values.gt(upper)
    return df[outside_fences], lower, upper

outliers_open, low_open, up_open = detect_outliers(df, "Opening Rank")
outliers_close, low_close, up_close = detect_outliers(df, "Closing Rank")

print("Opening Rank outliers:", len(outliers_open))
print("Closing Rank outliers:", len(outliers_close))

In [None]:
df.info()

# **CLEANING THE COLUMN VALUES AND CLASSIFYING THE COLLEGES AS GOVERNMENT OR PRIVATE**

In [None]:
import pandas as pd
import re

df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace("-", "_")

df["college_name"] = (
    df["college_name"]
    .str.replace("Govt.", "Government", regex=False)
    .str.replace("Goverment", "Government", regex=False)
    .str.replace("&", "and", regex=False)
    .str.replace(r"\.\.\.$", "", regex=True)
    .str.title()
)

df["seat_type"] = df["seat_type"].str.replace(" Seats", "", regex=False)
df["domicile"] = df["domicile"].replace({"Home State": "Home", "All India": "AI"})
df["reservation"] = df["reservation"].replace({"Tuition Fee Waiver": "TFW"})

gov_keywords = ["government", "govt", "university of calcutta", "calcutta university",
                "jadavpur university", "presidency university", "makaut", "wbut",
                "kalyani university", "burdwan university", "vidyasagar university",
                "north bengal university", "west bengal state university", "aliah university"]

def classify(name):
    """Label a college "Government" or "Private" from its name.

    The name is converted to a lowercase string and searched for each entry of
    the module-level ``gov_keywords`` list; any substring hit yields
    "Government", otherwise the result is "Private".
    """
    lowered = str(name).lower()
    for keyword in gov_keywords:
        if keyword in lowered:
            return "Government"
    return "Private"

df["college_type"] = df["college_name"].apply(classify)

print("df cleaned and classified")

CLASSIFYING COLLEGES INTO DIFFICULT, MEDIUM AND EASY BASED ON CLOSING RANK

In [None]:
df.sample(5)

In [None]:
def difficulty_label(df, total_seats=52000, new_col="difficulty_level"):
    """Return a copy of ``df`` with a normalised closing rank and a difficulty label.

    Two columns are added to the copy:

    * ``closing_norm``: ``closing_rank`` divided by ``total_seats``.
    * ``new_col``: "Difficult" when ``closing_norm <= 0.2``, "Medium" when it
      is in ``(0.2, 0.6]``, "Easy" when it is above ``0.6`` and "Unknown" when
      none of these holds (e.g. a missing closing rank).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``closing_rank`` column; it is not modified.
    total_seats : int, default 52000
        Divisor used to normalise the closing rank.
    new_col : str, default "difficulty_level"
        Name of the label column to create.
    """
    labelled = df.copy()
    ratio = labelled["closing_rank"] / total_seats
    labelled["closing_norm"] = ratio

    is_difficult = ratio <= 0.2          # top 20% ranks
    is_medium = (ratio > 0.2) & (ratio <= 0.6)
    is_easy = ratio > 0.6

    labelled[new_col] = np.select(
        [is_difficult, is_medium, is_easy],
        ["Difficult", "Medium", "Easy"],
        default="Unknown",
    )
    return labelled


In [None]:
df =difficulty_label(df)

In [None]:
df.sample(5)

LET'S USE PANDAS PROFILER

In [None]:
!pip install ydata-profiling

In [None]:
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)

In [None]:
profile.to_notebook_iframe()

# **LETS DO SOME PRE-PROCESSING**

In [None]:
df.sample(5)

In [None]:
categorical_columns=["seat_type","domicile","reservation","college_type","difficulty_level"]
numerical_columns=["round", "opening_rank", "closing_rank", "year", "closing_norm"]

In [None]:
#Importing all necessary modules:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

In [None]:
X = df.drop(["college_name", "branch_short"], axis=1)
Y_college = df["college_name"]
Y_branch = df["branch_short"]

In [None]:
preprocessor_softmax = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ("num", StandardScaler(), numerical_columns)
    ]
)

In [None]:
#College_name_pipeline
college_model = Pipeline([
    # clone() gives this pipeline its own unfitted preprocessor. Pipeline.fit
    # fits the step objects it was given in place, so sharing one
    # ColumnTransformer instance with another pipeline lets whichever pipeline
    # is fitted last overwrite the encoder/scaler state this model predicts with.
    ("preprocessor", clone(preprocessor_softmax)),
    ("classifier", LogisticRegression(
        # multi_class is left at its default: the parameter is deprecated in
        # recent scikit-learn releases, and the default already fits a
        # multinomial model for multiclass targets with the lbfgs solver.
        solver="lbfgs",
        max_iter=1000,
        n_jobs=-1,
        random_state=42
    ))
])

In [None]:
#Branch_pipeline
branch_model = Pipeline([
    # clone() gives this pipeline its own unfitted preprocessor. Pipeline.fit
    # fits the step objects it was given in place, so sharing one
    # ColumnTransformer instance with another pipeline lets whichever pipeline
    # is fitted last overwrite the encoder/scaler state this model predicts with.
    ("preprocessor", clone(preprocessor_softmax)),
    ("classifier", LogisticRegression(
        # multi_class is left at its default: the parameter is deprecated in
        # recent scikit-learn releases, and the default already fits a
        # multinomial model for multiclass targets with the lbfgs solver.
        solver="lbfgs",
        max_iter=1000,
        n_jobs=-1,
        random_state=42
    ))
])

In [None]:
#Splitting data for college model
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, Y_college, test_size=0.2, stratify=Y_college, random_state=42)

In [None]:
#Splitting data for branch model
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X,Y_branch, test_size=0.2, stratify=Y_branch, random_state=42)

In [None]:
#Fitting data for both models
college_model.fit(X_train_c, y_train_c)
branch_model.fit(X_train_b, y_train_b)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_c = college_model.predict(X_test_c)
print("College Prediction Accuracy:", accuracy_score(y_test_c, y_pred_c))

In [None]:
from sklearn.metrics import classification_report

y_pred_c = college_model.predict(X_test_c)
report_c = classification_report(y_test_c, y_pred_c, zero_division=0, output_dict=True)
weighted_precision_c = report_c['weighted avg']['precision']
print(f"Weighted Precision for College Model: {weighted_precision_c:.4f}")

In [None]:
print(classification_report(y_test_c, y_pred_c, zero_division=0))

In [None]:
y_pred_b = branch_model.predict(X_test_b)
print("Branch Prediction Accuracy:", accuracy_score(y_test_b, y_pred_b))

In [None]:
print(classification_report(y_test_b, y_pred_b, zero_division=0))

In [None]:
# Only the raw inputs are typed by hand; closing_norm and difficulty_level are
# derived with difficulty_label() exactly as for the training data. Typing
# them manually allowed contradictory rows (e.g. closing_norm=1.5 for a
# closing rank of 20000, which is inconsistent with the "Medium" label).
new_data = difficulty_label(pd.DataFrame([{
    "round": 2,
    "seat_type": "WBJEE",
    "domicile": "AI",
    "reservation": "Open",
    "opening_rank": 5000,
    "closing_rank": 20000,
    "year": 2024,
    "college_type": "Private"
}]))

print("Predicted College:", college_model.predict(new_data)[0])
print("Predicted Branch:", branch_model.predict(new_data)[0])

In [None]:
param_grid = {
    "classifier__C": [0.01, 0.1, 1, 10],
    "classifier__solver": ["lbfgs", "saga"],
    "classifier__penalty": ["l2"]
}

In [None]:
grid_college = GridSearchCV(
    college_model,
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring="accuracy",
    verbose=1
)

grid_college.fit(X_train_c, y_train_c)

print("Best College Model Params:", grid_college.best_params_)
print("Best College CV Accuracy:", grid_college.best_score_)

In [None]:
grid_branch = GridSearchCV(
    branch_model,
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring="accuracy",
    verbose=1
)

grid_branch.fit(X_train_b, y_train_b)

print("Best Branch Model Params:", grid_branch.best_params_)
print("Best Branch CV Accuracy:", grid_branch.best_score_)

In [None]:
# College
best_college = grid_college.best_estimator_
y_pred_c = best_college.predict(X_test_c)
print("Test Accuracy (College):", accuracy_score(y_test_c, y_pred_c))

# Branch
best_branch = grid_branch.best_estimator_
y_pred_b = best_branch.predict(X_test_b)
print("Test Accuracy (Branch):", accuracy_score(y_test_b, y_pred_b))

In [None]:
preprocessor_random = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ("num","passthrough", numerical_columns)
    ]
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

branch_rf = Pipeline([
    # clone() gives this pipeline its own unfitted preprocessor; a
    # ColumnTransformer instance shared between pipelines is re-fitted in place
    # by each Pipeline.fit, so the one-hot categories learned for one model
    # could be replaced by those of the other.
    ("preprocessor", clone(preprocessor_random)),
    ("classifier", RandomForestClassifier(
        n_estimators=300, max_depth=20, random_state=42, n_jobs=-1
    ))
])

branch_rf.fit(X_train_b, y_train_b)
print("RF Branch Accuracy:", branch_rf.score(X_test_b, y_test_b))


In [None]:
college_rf = Pipeline([
    # clone() gives this pipeline its own unfitted preprocessor; a
    # ColumnTransformer instance shared between pipelines is re-fitted in place
    # by each Pipeline.fit, so the one-hot categories learned for one model
    # could be replaced by those of the other.
    ("preprocessor", clone(preprocessor_random)),
    ("classifier", RandomForestClassifier(
        n_estimators=300, max_depth=20, random_state=42, n_jobs=-1
    ))
])
college_rf.fit(X_train_c, y_train_c)
print("RF College Accuracy:", college_rf.score(X_test_c, y_test_c))

In [None]:
from sklearn.metrics import top_k_accuracy_score

y_proba = college_rf.predict_proba(X_test_c)
print("Top-3 College Accuracy:", top_k_accuracy_score(y_test_c, y_proba, k=3, labels=college_rf.named_steps['classifier'].classes_))

In [None]:
from sklearn.metrics import top_k_accuracy_score

# Example for College
y_proba_c = college_model.predict_proba(X_test_c)
print("College Top-5 Accuracy:", top_k_accuracy_score(y_test_c, y_proba_c, k=5, labels=college_model.named_steps['classifier'].classes_))

# Example for Branch
y_proba_b = branch_model.predict_proba(X_test_b)
print("Branch Top-5 Accuracy:", top_k_accuracy_score(y_test_b, y_proba_b, k=5, labels=branch_model.named_steps['classifier'].classes_))

WELL, BOTH OF THE MODELS PERFORM POORLY. LET'S TRY A DIFFERENT APPROACH AND TRAIN THEM DIFFERENTLY.