In [1]:
import pandas as pd
import joblib
import pickle

In [None]:
# Loading Data

df_2024 = pd.read_csv("/Users/apple/nirf_workspace/NIRF-AI-ASSISTANT/nirf-tlr-assistant/data/Engineering/2024_ss_data.csv")
df_2025 = pd.read_csv("/Users/apple/nirf_workspace/NIRF-AI-ASSISTANT/nirf-tlr-assistant/data/Engineering/2025_ss_data.csv")
print("2024 Data Shape:", df_2024.shape)
print("2025 Data Shape:", df_2025.shape)

2024 Data Shape: (100, 9)
2025 Data Shape: (100, 9)


In [3]:
# Check if columns are the same
df_2024.columns == df_2025.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True])

In [4]:
# Concatenate DataFrames 
training_df = pd.concat([df_2024, df_2025], ignore_index=True)
print(training_df.shape)
training_df.head()

(200, 9)


Unnamed: 0,Rank,University_Name,State,Year,SS_Score,Predicted_SS_Score,NT_Total,NE_Total,NP_Total
0,1,Indian Institute of Technology Madras,Tamil Nadu,2024,18.5,,7029,7817,2574
1,2,Indian Institute of Technology Delhi,Delhi,2024,18.5,,8684,8296,3807
2,3,Indian Institute of Technology Bombay,Maharashtra,2024,18.5,,6866,8531,3987
3,4,Indian Institute of Technology Kanpur,Uttar Pradesh,2024,18.33,,6825,6123,2057
4,5,Indian Institute of Technology Kharagpu,West Bengal,2024,19.76,,12130,11051,3676


In [5]:
# Check for missing values
training_df.isnull().sum()

Rank                    0
University_Name         0
State                   0
Year                    0
SS_Score                0
Predicted_SS_Score    200
NT_Total                0
NE_Total                0
NP_Total                0
dtype: int64

In [6]:
# group sizes
training_df.groupby("Year").size()

Year
2024    100
2025    100
dtype: int64

In [7]:
# feature columns and target column
feature_columns = ['NT_Total', 'NE_Total', 'NP_Total', 'Year']
target_column = 'SS_Score'

In [8]:
# Preparing Data for Modeling

X = training_df[feature_columns]
y = training_df[target_column]

In [9]:
# Model Training with GroupKFold and Cross-Validation 

from sklearn.model_selection import GroupKFold, GridSearchCV, cross_validate, cross_val_score
groups = training_df['Year']
cv = GroupKFold(n_splits=2)

In [None]:
# Gradient Boosting Regressor Pipeline and Hyperparameter Tuning

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

gbr_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("gbr", GradientBoostingRegressor(random_state=42))
    ])

gbr_param_grid = {
        "gbr__n_estimators": [200, 400, 600],
        "gbr__learning_rate": [0.03, 0.05, 0.1],
        "gbr__max_depth": [2, 3],
        "gbr__min_samples_leaf": [1, 2, 4],
        "gbr__loss": ["absolute_error", "huber"]
    }
gbr_grid = GridSearchCV(
        estimator=gbr_pipeline,
        param_grid=gbr_param_grid,
        cv=cv,
        scoring="neg_mean_absolute_error",
        n_jobs=-1
    )
gbr_grid.fit(X, y, groups=groups)

best_gbr = gbr_grid.best_estimator_

cv_results = cross_validate(
    best_gbr,
    X, y, cv=cv, scoring={
        "mae": "neg_mean_absolute_error",
        "r2": "r2"}, groups=groups
    )
print("SS Best Parameters:", gbr_grid.best_params_)
print(f"MAE: {-cv_results['test_mae'].mean():.3f} "
      f"(± {cv_results['test_mae'].std():.3f})")
print(f"R²: {cv_results['test_r2'].mean():.3f} "
      f"(± {cv_results['test_r2'].std():.3f})")

SS Best Parameters: {'gbr__learning_rate': 0.1, 'gbr__loss': 'huber', 'gbr__max_depth': 2, 'gbr__min_samples_leaf': 2, 'gbr__n_estimators': 200}
MAE: 0.781 (± 0.027)
R²: 0.893 (± 0.015)


In [11]:
# Save the Model
model_path = "model/ss_score_prediction_model.pkl"
joblib.dump(best_gbr, model_path)

['model/ss_score_prediction_model.pkl']

In [12]:
# Prediction file and output file paths

# Need 2023 data for prediction

prediction_file_path = # need to fill
output_file_path = /Users/apple/nirf_workspace/NIRF-AI-ASSISTANT/nirf-tlr-assistant/data/Engineering/output_file.csv


SyntaxError: invalid syntax (3237566038.py, line 5)

In [None]:
# features for prediction
prediction_feature_columns = ['NT_Total', 'NE_Total', 'NP_Total', 'Year']
# Load the trained model
model = joblib.load(model_path)

In [None]:
# Load prediction data

prediction_df = pd.read_csv(prediction_file_path)

NameError: name 'prediction_file_path' is not defined

In [None]:
# Predicting SS Scores for New Data

X_pred = prediction_df[prediction_feature_columns]
prediction_df['Predicted_SS_Score'] = model.predict(X_pred)

NameError: name 'prediction_df' is not defined

In [None]:
# Save the predictions to CSV
prediction_df.to_csv(output_file_path, index=False)

prediction_df.head()

NameError: name 'prediction_df' is not defined