In [8]:
# Student Placement Prediction Model Training
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pickle

# Report the versions of the core libraries used by this run
print(f"numpy: {np.__version__}")
print(f"pandas: {pd.__version__}")
print(f"sklearn: {sklearn.__version__}")

# Read the placement dataset from the Excel workbook
data = pd.read_excel('student_placement.xlsx')
print(f"Data shape: {data.shape}")

# Preview the leading records
print("\nFirst 5 rows:", data.head(), sep="\n")

# Count null entries per column
print("\nMissing values:", data.isna().sum(), sep="\n")

# Descriptive statistics produced by DataFrame.describe()
print("\nDataset info:", data.describe(), sep="\n")

# How many rows fall into each value of the 'Placed' target
print("\nPlacement distribution:", data['Placed'].value_counts(), sep="\n")

# Separate the 'Placed' label from the remaining predictor columns
y = data['Placed']
X = data.drop(columns='Placed')

print("\nFeatures:", list(X.columns))

# Hold out 20% of the rows for testing; stratify on the label and fix the seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Candidate hyperparameter values explored by the grid search
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base Random Forest with a fixed seed
rf = RandomForestClassifier(random_state=42)

# Search every grid combination, scored by accuracy with cv=5, using all cores
print("\nStarting hyperparameter tuning...")
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the estimator that the search selected
best_rf = grid_search.best_estimator_

print("\nBest parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Score the tuned model on the held-out test split
y_pred = best_rf.predict(X_test)

print("\nTest Set Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Pair each predictor with its importance score, largest first
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': best_rf.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("\nFeature Importance:", feature_importance, sep="\n")

# Persist the tuned model to disk
with open('model.pkl', 'wb') as file:
    pickle.dump(best_rf, file)

print("\nModel saved as 'model.pkl'")

# Round-trip check: reload the file that was just written.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you trust.
with open('model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Smoke-test the reloaded model on one hand-written student record.
# The model was fitted on a DataFrame, so it is queried with a DataFrame too:
# a bare nested list makes scikit-learn warn that X lacks valid feature names
# and silently depends on the values being typed in training-column order.
sample_features = pd.DataFrame([{
    'IQ': 120,
    'CGPA': 7.5,
    '10th_Marks': 85,
    '12th_Marks': 80,
    'Communication_Skills': 6.5,
}])
# Arrange the columns in the order the model recorded at fit time
sample_features = sample_features[list(loaded_model.feature_names_in_)]
sample_prediction = loaded_model.predict(sample_features)
print(f"\nSample prediction for [IQ=120, CGPA=7.5, 10th=85, 12th=80, Comm=6.5]: {sample_prediction[0]}")

print("\n✅ Model training and saving completed successfully!")

numpy: 2.3.2
pandas: 2.3.1
sklearn: 1.7.1
Data shape: (102, 6)

First 5 rows:
    IQ  CGPA  10th_Marks  12th_Marks  Communication_Skills  Placed
0  114  3.14          54          97                  2.62       0
1  117  6.09          89          75                  4.56       1
2  134  9.86          73          80                  6.83       1
3  137  5.52         100          63                  6.96       1
4  137  6.37          82          58                  2.84       1

Missing values:
IQ                      0
CGPA                    0
10th_Marks              0
12th_Marks              0
Communication_Skills    0
Placed                  0
dtype: int64

Dataset info:
               IQ        CGPA  10th_Marks  12th_Marks  Communication_Skills  \
count  102.000000  102.000000  102.000000  102.000000            102.000000   
mean   124.656863    6.148529   79.637255   79.460784              5.211863   
std     11.115501    1.968004   12.769696    7.903717              1.617722   
min

