In [13]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [14]:
# Load the JEE rank table from CSV and take a first look at it.
data = pd.read_csv(filepath_or_buffer='../data/JEE_Rank_2016_2024.csv')

# Prints the column schema (names, non-null counts, dtypes) to stdout.
data.info()
# NOTE(review): this summary is computed but never rendered, because a cell
# only displays the value of its last expression.
data.describe()
# Last expression of the cell: the first five rows are rendered.
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20833 entries, 0 to 20832
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Institute              20833 non-null  object
 1   Quota                  20833 non-null  object
 2   Gender                 20833 non-null  object
 3   Year                   20833 non-null  int64 
 4   Academic_Program_Name  20833 non-null  object
 5   Closing_Rank           20833 non-null  object
 6   Opening_Rank           20833 non-null  object
 7   Seat_Type              20833 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.3+ MB


Unnamed: 0,Institute,Quota,Gender,Year,Academic_Program_Name,Closing_Rank,Opening_Rank,Seat_Type
0,Indian Institute of Technology Bhubaneswar,AI,Gender-Neutral,2016,"Mechanical Engineering (4 Years, Bachelor of T...",651,583,ST
1,Indian Institute of Technology Bombay,AI,Gender-Neutral,2016,"Civil Engineering (4 Years, Bachelor of Techno...",21,21,OBC-NCL (PwD)
2,Indian Institute of Technology Bombay,AI,Gender-Neutral,2016,"Computer Science and Engineering (4 Years, Bac...",60,1,OPEN
3,Indian Institute of Technology Bombay,AI,Gender-Neutral,2016,Energy Engineering with M.Tech. in Energy Syst...,848,338,SC
4,Indian Institute of Technology Bombay,AI,Gender-Neutral,2016,Metallurgical Engineering and Materials Scienc...,2971,1657,OPEN


In [15]:
# Integer-encode the program names: every distinct name is assigned the
# position at which it first appears in the column.
unique_Academic_Program_Name = data['Academic_Program_Name'].unique()
Academic_Program_Name_map = dict(
    zip(unique_Academic_Program_Name, range(len(unique_Academic_Program_Name)))
)
# NOTE(review): a bare expression that is not the last line of a cell is
# evaluated but not displayed.
Academic_Program_Name_map
# Replace the text labels by their integer codes.
data['Academic_Program_Name'] = data['Academic_Program_Name'].map(Academic_Program_Name_map)

In [16]:
# Integer-encode the institute names in order of first appearance.
unique_Institute = data['Institute'].unique()
Institute_map = {
    institute_name: institute_code
    for institute_code, institute_name in enumerate(unique_Institute)
}
# Swap each institute name for its code.
data['Institute'] = data['Institute'].map(Institute_map)
# Show the name -> code lookup as the cell output.
Institute_map

{'Indian Institute of Technology Bhubaneswar': 0,
 'Indian Institute of Technology Bombay': 1,
 'Indian Institute of Technology Mandi': 2,
 'Indian Institute of Technology Delhi': 3,
 'Indian Institute of Technology Kharagpur': 4,
 'Indian Institute of Technology Indore': 5,
 'Indian Institute of Technology Hyderabad': 6,
 'Indian Institute of Technology Jodhpur': 7,
 'Indian Institute of Technology Kanpur': 8,
 'Indian Institute of Technology Madras': 9,
 'Indian Institute of Technology Gandhinagar': 10,
 'Indian Institute of Technology Patna': 11,
 'Indian Institute of Technology Roorkee': 12,
 'Indian School of Mines Dhanbad': 13,
 'Indian Institute of Technology Ropar': 14,
 'Indian Institute of Technology (BHU) Varanasi': 15,
 'Indian Institute of Technology Guwahati': 16,
 'Indian Institute of Technology Bhilai': 17,
 'Indian Institute of Technology Goa': 18,
 'Indian Institute of Technology Palakkad': 19,
 'Indian Institute of Technology Jammu': 20,
 'Indian Institute of Technol

In [17]:
# Split the combined 'Seat_Type' label into a base category and a PwD flag.
seat_type_labels = data['Seat_Type']

# Capture the text that precedes a trailing "(...)" suffix; labels without
# such a suffix do not match the pattern and fall back to the original label.
seat_without_suffix = seat_type_labels.str.extract(r'^(.*?)\s*\(.*?\)$')[0]
data['Seat'] = seat_without_suffix.fillna(seat_type_labels)

# 1 when the label contains the literal "(PwD)", otherwise 0.
has_pwd_marker = seat_type_labels.str.contains(r'\(PwD\)')
data['Disability'] = has_pwd_marker.astype(int)

# The combined label is fully represented by the two derived columns.
data = data.drop(columns=['Seat_Type'])

In [18]:
# Integer-encode the base seat categories in order of first appearance.
unique_Seat = data['Seat'].unique()
Seat_map = dict(zip(unique_Seat, range(len(unique_Seat))))
# Swap each category label for its code.
data['Seat'] = data['Seat'].map(Seat_map)
# Show the label -> code lookup as the cell output.
Seat_map

{'ST': 0, 'OBC-NCL': 1, 'OPEN': 2, 'SC': 3, 'EWS': 4}

In [19]:
# Integer-encode the gender pool labels in order of first appearance.
unique_Gender = data['Gender'].unique()
Gender_map = {
    gender_label: gender_code
    for gender_code, gender_label in enumerate(unique_Gender)
}
# Swap each label for its code.
data['Gender'] = data['Gender'].map(Gender_map)
# Show the label -> code lookup as the cell output.
Gender_map

{'Gender-Neutral': 0, 'Female-only (including Supernumerary)': 1}

In [20]:
# Build a label -> code lookup for 'Quota', then remove the column.
# NOTE(review): Quota_map is neither applied to the frame nor saved in the
# cells visible here -- confirm whether it is still needed.
unique_Quota = data['Quota'].unique()
Quota_map = dict(zip(unique_Quota, range(len(unique_Quota))))
data = data.drop(columns='Quota')

In [21]:
# Persist the label -> integer-code lookups as JSON so that the encoding can
# be reproduced or inverted outside this notebook.
map_files = {
    '../data/maps/Institute_map.json': Institute_map,
    '../data/maps/Gender_map.json': Gender_map,
    '../data/maps/Academic_Program_Name.json': Academic_Program_Name_map,
    '../data/maps/Seat_map.json': Seat_map,
}

# open(..., 'w') does not create missing parent directories; make sure the
# target folder exists so the cell also works on a fresh checkout.
os.makedirs('../data/maps', exist_ok=True)

for map_path, mapping in map_files.items():
    with open(map_path, 'w') as f:
        json.dump(mapping, f)

In [22]:
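# Optional debugging aid: uncomment the next line to continue with a
# reproducible 100-row random sample instead of the full table.
# data = data.sample(n=100, random_state=1)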

In [23]:
# Both rank columns were read as object dtype; convert them to integers.
# NOTE(review): values that cannot be parsed as numbers are coerced to NaN and
# then replaced by 0, which downstream code cannot tell apart from a genuine
# rank value -- confirm that this is intended.
for rank_column in ('Closing_Rank', 'Opening_Rank'):
    parsed_ranks = pd.to_numeric(data[rank_column], errors='coerce')
    data[rank_column] = parsed_ranks.fillna(0).astype(int)

In [24]:
# Render the encoded frame as the cell output (pandas abbreviates the middle
# rows of a long frame when displaying it).
data

Unnamed: 0,Institute,Gender,Year,Academic_Program_Name,Closing_Rank,Opening_Rank,Seat,Disability
0,0,0,2016,0,651,583,0,0
1,1,0,2016,1,21,21,1,1
2,1,0,2016,2,60,1,2,0
3,1,0,2016,3,848,338,3,0
4,1,0,2016,4,2971,1657,2,0
...,...,...,...,...,...,...,...,...
20828,22,1,2024,0,9014,7628,1,0
20829,22,0,2024,0,2629,1327,3,0
20830,22,1,2024,0,4303,4302,3,0
20831,22,0,2024,0,1213,664,0,0


In [25]:
# Write the encoded table to disk; the row index is left out of the file.
data.to_csv(path_or_buf='../data/numeric_data.csv', index=False)
