In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, brier_score_loss, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the dataset
file_path = 'rank_uni.csv' 
data = pd.read_csv(file_path)

# Data Preparation

median_score = data['score'].median()
data['score_binary'] = (data['score'] >= median_score).astype(int)
data['broad_impact'] = data['broad_impact'].fillna(data['broad_impact'].median())

In [2]:
# Define predictors and target
predictors = ['quality_of_education', 'alumni_employment', 'quality_of_faculty',
              'publications', 'influence', 'citations', 'broad_impact', 'patents']
target = 'score_binary'

X = data[predictors]
y = data[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)


In [3]:
# Define Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "L1 LASSO": LogisticRegression(penalty='l1', solver='liblinear', random_state=42),
    "L2 Ridge": LogisticRegression(penalty='l2', solver='lbfgs', random_state=42),
    "Elastic Net": LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=42),
    "Support Vector Machine": Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', SVC(probability=True, kernel='rbf', random_state=42))
    ]),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

In [7]:
# Evaluate Models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None
    brier = brier_score_loss(y_test, y_pred_proba) if y_pred_proba is not None else None
    report = classification_report(y_test, y_pred, output_dict=True)
    return {
        "AUC": auc,
        "Brier Score": brier,
        "Precision": report['1']['precision'],
        "Recall": report['1']['recall'],
        "F1-Score": report['1']['f1-score'],
        "Accuracy": report['accuracy']
    }


In [9]:
# Evaluate all models
results = []
for name, model in models.items():
    try:
        results.append({"Model": name, **evaluate_model(model, X_train, X_test, y_train, y_test)})
    except Exception as e:
        results.append({"Model": name, "Error": str(e)})

# Display Results
results_df = pd.DataFrame(results)
print(results_df)

                    Model       AUC  Brier Score  Precision    Recall  \
0     Logistic Regression  0.926724     0.116435   0.776699  0.919540   
1                L1 LASSO  0.929075     0.115537   0.784314  0.919540   
2                L2 Ridge  0.926724     0.116435   0.776699  0.919540   
3             Elastic Net  0.815047     0.176412   0.734940  0.701149   
4  Support Vector Machine  0.953501     0.096084   0.792079  0.919540   
5           Random Forest  0.981518     0.059050   0.927711  0.885057   

   F1-Score  Accuracy  
0  0.842105  0.828571  
1  0.846561  0.834286  
2  0.842105  0.828571  
3  0.717647  0.725714  
4  0.851064  0.840000  
5  0.905882  0.908571  


In [11]:
pip install --upgrade lazypredict scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp39-cp39-win_amd64.whl.metadata (13 kB)
Using cached scikit_learn-1.5.2-cp39-cp39-win_amd64.whl (11.0 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
Successfully installed scikit-learn-1.5.2
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.


In [12]:
!pip install scikit-learn==1.0.2

Collecting scikit-learn==1.0.2
  Using cached scikit_learn-1.0.2-cp39-cp39-win_amd64.whl.metadata (10 kB)
Using cached scikit_learn-1.0.2-cp39-cp39-win_amd64.whl (7.2 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.5.2
    Uninstalling scikit-learn-1.5.2:
      Successfully uninstalled scikit-learn-1.5.2
Successfully installed scikit-learn-1.0.2


In [13]:
pip uninstall -y lazypredict scikit-learn


Found existing installation: lazypredict 0.2.13
Uninstalling lazypredict-0.2.13:
  Successfully uninstalled lazypredict-0.2.13
Found existing installation: scikit-learn 1.0.2
Uninstalling scikit-learn-1.0.2:
  Successfully uninstalled scikit-learn-1.0.2
Note: you may need to restart the kernel to use updated packages.


In [14]:
!pip install scikit-learn==1.0.2
!pip install lazypredict


Collecting scikit-learn==1.0.2
  Using cached scikit_learn-1.0.2-cp39-cp39-win_amd64.whl.metadata (10 kB)
Using cached scikit_learn-1.0.2-cp39-cp39-win_amd64.whl (7.2 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.0.2
Collecting lazypredict
  Using cached lazypredict-0.2.13-py2.py3-none-any.whl.metadata (12 kB)
Using cached lazypredict-0.2.13-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.13


In [18]:
from lazypredict.Supervised import LazyClassifier

# Lazy Predict setup
lazy_clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit LazyClassifier on the dataset
models, predictions = lazy_clf.fit(X_train, X_test, y_train, y_test)

# Display the results
print("All Model Results from Lazy Predict:")
print(models)

# Filter models with accuracy > 90%
high_accuracy_models = models[models['Accuracy'] > 0.90]
if not high_accuracy_models.empty:
    print("\nModels with Accuracy > 90%:")
    print(high_accuracy_models)
else:
    print("\nNo additional models with Accuracy > 90% were found.")



TypeError: __init__() got an unexpected keyword argument 'sparse_output'

In [22]:
# Importing required libraries for additional classifiers
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# new models to evaluate
additional_models = {
    "Bagging Classifier": BaggingClassifier(random_state=42),
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
    "LGBM Classifier": LGBMClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "XGB Classifier": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}

In [24]:
# Evaluate the additional models
additional_results = []
for name, model in additional_models.items():
    try:
        additional_results.append({"Model": name, **evaluate_model(model, X_train, X_test, y_train, y_test)})
    except Exception as e:
        additional_results.append({"Model": name, "Error": str(e)})

# Compile results into a DataFrame
additional_results_df = pd.DataFrame(additional_results)

# Display the results locally
print(additional_results_df)

[LightGBM] [Info] Number of positive: 263, number of negative: 262
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 934
[LightGBM] [Info] Number of data points in the train set: 525, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500952 -> initscore=0.003810
[LightGBM] [Info] Start training from score 0.003810
                      Model  AUC  Brier Score  Precision  Recall  F1-Score  \
0        Bagging Classifier 0.99         0.05       0.97    0.89      0.93   
1    Extra Trees Classifier 0.98         0.06       0.95    0.89      0.92   
2             Random Forest 0.98         0.06       0.93    0.89      0.91   
3       AdaBoost Classifier 0.96         0.19       0.94    0.87      0.90   
4           LGBM Classifier 0.99         0.06       0.95    0.