In [11]:
# train_employability_model.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import joblib
import warnings
warnings.filterwarnings('ignore')


# Load and preprocess the data
print("📊 Loading dataset...")
data = pd.read_excel("Student_Employability_dataset_2025.xlsx")
data.drop(['Name_of_Student'], axis=1, errors='ignore', inplace=True)

# Encode target label
label_encoder = LabelEncoder()
data['CLASS'] = label_encoder.fit_transform(data['CLASS'])

# Features and target
feature_columns = [col for col in data.columns if col != 'CLASS']
X = data[feature_columns]
y = data['CLASS']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Apply SMOTE
print("⚖️ Applying SMOTE...")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Define pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(probability=True, class_weight='balanced', random_state=42))
])

# Grid search
param_grid = [
    {'svm__kernel': ['linear'], 'svm__C': [0.1, 1, 10]},
    {'svm__kernel': ['rbf'], 'svm__C': [0.1, 1, 10], 'svm__gamma': [0.01, 0.1, 1]},
    {'svm__kernel': ['poly'], 'svm__C': [0.1, 1], 'svm__gamma': [0.01], 'svm__degree': [2, 3]}
]

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_smote, y_train_smote)

# Evaluation
print("\n🎯 Best Params:", grid_search.best_params_)
y_pred = grid_search.predict(X_test)
print("\n📊 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save model and metadata
joblib.dump({
    'model': grid_search.best_estimator_,
    'label_encoder': label_encoder,
    'feature_names': feature_columns
}, "employability_predictor.pkl")

print("✅ Model saved as employability_predictor.pkl")


📊 Loading dataset...
⚖️ Applying SMOTE...
Fitting 5 folds for each of 16 candidates, totalling 80 fits

🎯 Best Params: {'svm__C': 10, 'svm__gamma': 0.1, 'svm__kernel': 'rbf'}

📊 Accuracy: 0.9173184357541899

📄 Classification Report:
                 precision    recall  f1-score   support

    Employable       0.92      0.93      0.93       519
LessEmployable       0.91      0.89      0.90       376

      accuracy                           0.92       895
     macro avg       0.92      0.91      0.91       895
  weighted avg       0.92      0.92      0.92       895

📊 Confusion Matrix:
 [[485  34]
 [ 40 336]]
✅ Model saved as employability_predictor.pkl


In [12]:
print("🔍 Class distribution in full dataset:\n", y.value_counts())
print("🔍 Class distribution in test set:\n", y_test.value_counts())


🔍 Class distribution in full dataset:
 CLASS
0    1729
1    1253
Name: count, dtype: int64
🔍 Class distribution in test set:
 CLASS
0    519
1    376
Name: count, dtype: int64


In [None]:
# train_employability_model.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load and preprocess the data
print("📊 Loading dataset...")
data = pd.read_excel("Student_Employability_dataset_2025.xlsx")
data.drop(['Name_of_Student'], axis=1, errors='ignore', inplace=True)

# Encode target label
label_encoder = LabelEncoder()
data['CLASS'] = label_encoder.fit_transform(data['CLASS'])

# Features and target
feature_columns = [col for col in data.columns if col != 'CLASS']
X = data[feature_columns]
y = data['CLASS']

# Remove low-variance features
selector = VarianceThreshold(threshold=0.01)
X = selector.fit_transform(X)
selected_features = [feature_columns[i] for i in range(len(feature_columns)) if selector.variances_[i] > 0.01]

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Apply SMOTETomek (oversampling + cleaning)
print("⚖️ Applying SMOTETomek...")
resampler = SMOTETomek(random_state=42)
X_train_res, y_train_res = resampler.fit_resample(X_train, y_train)

# Define pipeline with Random Forest
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(class_weight='balanced', random_state=42))
])

# Grid search parameters
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1
)

# Train model
grid_search.fit(X_train_res, y_train_res)

# Evaluate model
print("\n🎯 Best Parameters:", grid_search.best_params_)

y_pred = grid_search.predict(X_test)
print("\n📊 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("🔍 Predicted class distribution:\n", pd.Series(y_pred).value_counts())

# Save model and metadata
joblib.dump({
    'model': grid_search.best_estimator_,
    'label_encoder': label_encoder,
    'feature_names': selected_features
}, "employability_predictor.pkl")

print("✅ Model saved as employability_predictor.pkl")


📊 Loading dataset...
⚖️ Applying SMOTETomek...
Fitting 5 folds for each of 12 candidates, totalling 60 fits

🎯 Best Parameters: {'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 200}

📊 Accuracy: 0.8960893854748604

📄 Classification Report:
                 precision    recall  f1-score   support

    Employable       0.93      0.89      0.91       519
LessEmployable       0.85      0.91      0.88       376

      accuracy                           0.90       895
     macro avg       0.89      0.90      0.89       895
  weighted avg       0.90      0.90      0.90       895

📊 Confusion Matrix:
 [[460  59]
 [ 34 342]]
🔍 Predicted class distribution:
 0    494
1    401
Name: count, dtype: int64
✅ Model saved as employability_predictor.pkl


In [16]:
import joblib
import pandas as pd
import numpy as np

# Define the perfect student data
perfect_student_data = {
    'GENDER': 1,
    'GENERAL_APPEARANCE': 1,
    'GENERAL_POINT_AVERAGE': 1.0,
    'MANNER_OF_SPEAKING': 1,
    'PHYSICAL_CONDITION': 1,
    'MENTAL_ALERTNESS': 1,
    'SELF-CONFIDENCE': 1,
    'ABILITY_TO_PRESENT_IDEAS': 1,
    'COMMUNICATION_SKILLS': 1,
    'STUDENT_PERFORMANCE_RATING': 1,
    'NO_SKILLS': 0,
    'Year_of_Graduate': 2025
}

# Load model and metadata
model_data = joblib.load("employability_predictor.pkl")
model = model_data['model']
label_encoder = model_data['label_encoder']
feature_names = model_data['feature_names']

# Create DataFrame and match feature order
input_df = pd.DataFrame([perfect_student_data])[feature_names]

# Predict
predicted_class = model.predict(input_df)[0]
predicted_proba = model.predict_proba(input_df)[0]

# Decode label
class_name = label_encoder.inverse_transform([predicted_class])[0]

# Output
print(f"\n🎓 The student is predicted to be: **{class_name}**")
print(f"\n📈 Probability of being Employable: {predicted_proba[1]*100:.2f}%")
print(f"📉 Probability of being Less Employable: {predicted_proba[0]*100:.2f}%")



🎓 The student is predicted to be: **LessEmployable**

📈 Probability of being Employable: 78.00%
📉 Probability of being Less Employable: 22.00%


In [18]:
len(data.columns)

13