In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd
import pickle 

In [7]:
# Load the raw table from the v1 checkpoint. The context manager closes the
# file handle as soon as unpickling finishes instead of leaking it.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open("data_v1.pkl", "rb") as f:
    final_df = pickle.load(f)

In [8]:
# Preview the loaded frame; the notebook renders a truncated head/tail view.
final_df

Unnamed: 0,Institute,Academic Program Name,Quota,Seat Type,Gender,Opening Rank,Closing Rank,year,round
0,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Neutral,3533.0,5947.0,2016,1
1,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OBC-NCL,Neutral,1829.0,2213.0,2016,1
2,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,SC,Neutral,663.0,1023.0,2016,1
3,Indian Institute of Technology Bhubaneswar,"Civil Engineering (4 Years, Bachelor of Techno...",AI,ST,Neutral,331.0,357.0,2016,1
4,Indian Institute of Technology Bhubaneswar,Civil Engineering and M. Tech. in Structural E...,AI,OPEN,Neutral,5408.0,6561.0,2016,1
...,...,...,...,...,...,...,...,...,...
443331,CU Jharkhand,Integrated B. Tech.- M. Tech in Metallurgical ...,AI,OPEN,Neutral,91148,111567,2024,6
443332,CU Jharkhand,Integrated B. Tech.- M. Tech in Metallurgical ...,AI,EWS,Neutral,18667,18876,2024,6
443333,CU Jharkhand,Integrated B. Tech.- M. Tech in Metallurgical ...,AI,OBC-NCL,Neutral,37570,39002,2024,6
443334,CU Jharkhand,Integrated B. Tech.- M. Tech in Metallurgical ...,AI,SC,Neutral,16473,17172,2024,6


In [9]:
# Keep only the rows whose "Institute" value is present.
final_df = final_df.loc[final_df["Institute"].notna()]

In [11]:
# Fill missing "Gender" entries with the "Neutral" category.
# `assign` returns a new frame rather than writing a column into the frame
# produced by the earlier `dropna`, which can raise SettingWithCopyWarning,
# and it keeps this cell safe to re-run.
final_df = final_df.assign(Gender=final_df["Gender"].fillna("Neutral"))

In [12]:
# Count the rows in each "Gender" category.
final_df["Gender"].value_counts()

Gender
Neutral    297288
Female     146000
Name: count, dtype: int64

In [13]:
def clean_rank(value):
    """Convert a raw rank cell to an integer, or NaN when it cannot be parsed.

    Accepted inputs:
      * numbers and numeric strings such as ``5947.0`` or ``"3533.0"``
        (truncated to ``int``);
      * strings made of digits followed by one trailing marker character,
        such as ``"123P"`` (the marker is dropped).

    Anything else -- ``None``, NaN, infinities, empty or non-numeric text --
    yields ``np.nan`` instead of raising.

    Parameters
    ----------
    value : object
        A single cell value from a rank column.

    Returns
    -------
    int or float
        The parsed rank as ``int``, or ``np.nan`` when parsing fails.
    """
    try:
        return int(float(value))
    except (TypeError, OverflowError):
        # TypeError: None or another non-numeric object; OverflowError: +/-inf.
        return np.nan
    except ValueError:
        # A NaN float also lands here (int(nan) raises ValueError); it has no
        # text to salvage, so it must not reach the slicing below.
        if not isinstance(value, (str, bytes)):
            return np.nan
        # Drop surrounding whitespace, then the single trailing marker.
        stem = value.strip()[:-1]
        return int(stem) if stem.isdigit() else np.nan

In [14]:
# Normalise every opening-rank cell through `clean_rank`, element by element.
final_df['Opening Rank'] = final_df['Opening Rank'].map(clean_rank)

In [15]:
# Normalise every closing-rank cell through `clean_rank`, element by element.
final_df['Closing Rank'] = final_df['Closing Rank'].map(clean_rank)

In [16]:
# Checkpoint the cleaned frame so later cells can reload it from disk.
with open("data_v2.pkl", "wb") as checkpoint_file:
    pickle.dump(final_df, checkpoint_file)

In [19]:
# Reload the cleaned frame from the v2 checkpoint. The context manager closes
# the file handle once unpickling finishes instead of leaking it.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open("data_v2.pkl", "rb") as f:
    final_df = pickle.load(f)

In [20]:
# Column roles consumed by the preprocessing step.
numeric_features = ['Opening Rank']
categorical_features = ['Gender', 'Seat Type']

# Model inputs (numeric column first, then the categorical ones) and the two
# prediction targets, both selected from the full frame.
X = final_df[numeric_features + categorical_features]
y = final_df[['Institute', 'round']]


In [32]:
# Replace institute names with integer class ids. The fitted encoder is kept
# in `le_institute`, so ids can be mapped back with `inverse_transform`.
# NOTE: the column is overwritten, so re-running this cell refits the encoder
# on the already-encoded ids.
le_institute = LabelEncoder()
le_institute.fit(final_df["Institute"])
final_df["Institute"] = le_institute.transform(final_df["Institute"])

In [28]:
# Inspect the encoded institute ids. The original cell displayed a variable `x`
# that no cell in this notebook defines, so it raised NameError on a fresh
# kernel. NOTE(review): `x` was presumably the encoder output -- confirm.
final_df["Institute"].to_numpy()

array([51, 51, 51, ...,  5,  5,  5])

In [33]:
# Feature preprocessing: standardise the numeric column(s) and one-hot encode
# the categorical ones.
# `handle_unknown="ignore"` encodes a category that was absent during `fit` as
# an all-zero row instead of raising at transform time. This matters because
# the data is split by year, so later data can contain categories the training
# years never had.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ])

In [34]:
# End-to-end estimator: preprocessing followed by one random forest per target
# column (MultiOutputClassifier fits a separate classifier for each output).
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "classifier",
            MultiOutputClassifier(
                estimator=RandomForestClassifier(random_state=42, n_estimators=100)
            ),
        ),
    ]
)

In [35]:
# Time-based split: the latest year present is the test set, all earlier years
# are used for training.
train_data = final_df.loc[final_df["year"].lt(final_df["year"].max())]
test_data = final_df.loc[final_df["year"].eq(final_df["year"].max())]

In [36]:
# Feature matrices: opening rank plus the two categorical descriptors.
X_train = train_data.loc[:, ['Opening Rank', 'Gender', 'Seat Type']]
X_test = test_data.loc[:, ['Opening Rank', 'Gender', 'Seat Type']]

# Targets: encoded institute id and counselling round.
y_train = train_data.loc[:, ['Institute', 'round']]
y_test = test_data.loc[:, ['Institute', 'round']]

In [37]:
# Preview the held-out rows (those from the latest year in the data).
test_data

Unnamed: 0,Institute,Academic Program Name,Quota,Seat Type,Gender,Opening Rank,Closing Rank,year,round
375162,51,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Neutral,9106,13018,2024,1
375163,51,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OPEN,Female,18286,20788,2024,1
375164,51,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Neutral,1755,1975,2024,1
375165,51,"Civil Engineering (4 Years, Bachelor of Techno...",AI,EWS,Female,3122,3308,2024,1
375166,51,"Civil Engineering (4 Years, Bachelor of Techno...",AI,OBC-NCL,Neutral,3573,4796,2024,1
...,...,...,...,...,...,...,...,...,...
443331,5,Integrated B. Tech.- M. Tech in Metallurgical ...,AI,OPEN,Neutral,91148,111567,2024,6
443332,5,Integrated B. Tech.- M. Tech in Metallurgical ...,AI,EWS,Neutral,18667,18876,2024,6
443333,5,Integrated B. Tech.- M. Tech in Metallurgical ...,AI,OBC-NCL,Neutral,37570,39002,2024,6
443334,5,Integrated B. Tech.- M. Tech in Metallurgical ...,AI,SC,Neutral,16473,17172,2024,6


In [None]:
import gc

import joblib

# Release the in-memory pipeline before loading the persisted one. `pop` with a
# default keeps the cell re-runnable: a bare `del model` raises NameError when
# `model` is already gone (for example after a previous load attempt failed).
globals().pop("model", None)
gc.collect()

# Load the persisted model from disk.
# NOTE: joblib.load unpickles its input and can execute arbitrary code; only
# load files that come from a trusted source.
model = joblib.load('jee_model.joblib')
