In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
import warnings

# Ignore warnings for cleaner output
warnings.filterwarnings('ignore')

# ==========================================
# 1. DATASET GENERATION
# ==========================================
def generate_dataset(filename='placement_data.csv', num_samples=1000):
    """
    Generates a synthetic dataset for student placement prediction.
    """
    np.random.seed(42)

    departments = ['CSE', 'IT', 'ECE', 'MECH', 'CIVIL']
    skill_stacks = ['Web Development', 'Data Science', 'Cloud', 'Core']

    data = {
        'student_id': range(1, num_samples + 1),
        'department': np.random.choice(departments, num_samples),
        'cgpa': np.random.uniform(5.0, 10.0, num_samples),
        'active_backlogs': np.random.randint(0, 5, num_samples),
        'skill_stack': np.random.choice(skill_stacks, num_samples),
        'internships': np.random.randint(0, 4, num_samples),
        'projects': np.random.randint(0, 10, num_samples),
        'aptitude_score': np.random.randint(30, 100, num_samples),
        'communication_score': np.random.randint(1, 11, num_samples),
    }

    df = pd.DataFrame(data)

    # Introduce some missing values to test imputation
    df.loc[df.sample(frac=0.05).index, 'cgpa'] = np.nan
    df.loc[df.sample(frac=0.05).index, 'communication_score'] = np.nan

    # Create a logic for 'placed' target to make the dataset learnable
    # (High CGPA + Good Aptitude + No Backlogs = High Chance)
    score = (
        (df['cgpa'] * 10) +
        (df['aptitude_score'] * 0.5) +
        (df['communication_score'] * 5) -
        (df['active_backlogs'] * 20)
    )

    # Threshold for placement, adding some randomness
    df['placed'] = (score + np.random.normal(0, 10, num_samples) > 110).astype(int)

    df.to_csv(filename, index=False)
    print(f"âœ… Dataset generated and saved as '{filename}'")
    return df

# ==========================================
# 2. PREPROCESSING & FEATURE ENGINEERING
# ==========================================
def preprocess_data(df):
    # Drop student_id as it is not a feature
    X = df.drop(['student_id', 'placed'], axis=1)
    y = df['placed']

    # Identify categorical and numerical columns
    categorical_cols = ['department', 'skill_stack']
    numerical_cols = ['cgpa', 'active_backlogs', 'internships', 'projects', 'aptitude_score', 'communication_score']

    # --- Handling Missing Values ---
    # Impute Numerical with Mean
    num_imputer = SimpleImputer(strategy='mean')
    X[numerical_cols] = num_imputer.fit_transform(X[numerical_cols])

    # Impute Categorical with Mode
    cat_imputer = SimpleImputer(strategy='most_frequent')
    X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

    # --- Encoding ---
    # Use One-Hot Encoding for categorical features
    # sparse_output=False returns a numpy array which is easier to concatenate later
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    X_encoded = encoder.fit_transform(X[categorical_cols])

    # Get feature names for encoded columns
    encoded_feature_names = encoder.get_feature_names_out(categorical_cols)

    # --- Scaling ---
    # Apply StandardScaler to numerical features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X[numerical_cols])

    # Combine processed numerical and categorical features
    X_processed = np.hstack((X_scaled, X_encoded))

    # Create a list of all column names for reference (optional)
    all_feature_names = list(numerical_cols) + list(encoded_feature_names)

    return X_processed, y, encoder, scaler, all_feature_names

# ==========================================
# 3. MODEL TRAINING & 4. EVALUATION
# ==========================================
def train_and_evaluate(X_train, X_test, y_train, y_test):
    models = {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
    }

    results = []
    best_model = None
    best_score = 0

    print("\nðŸš€ Training Models...")
    print("-" * 60)

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        results.append({
            "Model": name,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1
        })

        # Select best model based on F1-Score (balances precision and recall)
        if f1 > best_score:
            best_score = f1
            best_model = model

        print(f"{name}: âœ… Trained | F1: {f1:.4f}")

    # Create Comparison Report
    results_df = pd.DataFrame(results).sort_values(by="F1-Score", ascending=False)
    print("\nðŸ“Š Model Comparison Report:")
    print(results_df.to_string(index=False))

    return best_model

# ==========================================
# 5. SERIALIZATION
# ==========================================
def save_artifacts(model, scaler, encoder):
    joblib.dump(model, 'placement_model.pkl')
    joblib.dump(scaler, 'scaler.pkl')
    joblib.dump(encoder, 'encoder.pkl')
    print("\nðŸ’¾ Artifacts saved successfully (placement_model.pkl, scaler.pkl, encoder.pkl)")

# ==========================================
# 6. PREDICTION FUNCTION
# ==========================================
def predict_placement(student_data):
    """
    Predicts placement status for a new student.
    student_data: Dictionary or List of Dictionaries
    """
    try:
        # Load saved artifacts
        model = joblib.load('placement_model.pkl')
        scaler = joblib.load('scaler.pkl')
        encoder = joblib.load('encoder.pkl')

        # Convert input to DataFrame
        input_df = pd.DataFrame(student_data)

        # Reconstruct column lists (must match training)
        numerical_cols = ['cgpa', 'active_backlogs', 'internships', 'projects', 'aptitude_score', 'communication_score']
        categorical_cols = ['department', 'skill_stack']

        # 1. Handle Missing Values (using SimpleImputer logic manually or pre-fit imputers)
        # For simplicity here, we fill with 0 if missing, or we could load saved imputers.
        # Assuming saved imputers for robust production code:
        # Here we assume input is complete for brevity, but fill NaN just in case.
        input_df[numerical_cols] = input_df[numerical_cols].fillna(0)
        input_df[categorical_cols] = input_df[categorical_cols].fillna('Unknown')

        # 2. Scale Numerical
        input_scaled = scaler.transform(input_df[numerical_cols])

        # 3. Encode Categorical
        input_encoded = encoder.transform(input_df[categorical_cols])

        # 4. Combine
        input_processed = np.hstack((input_scaled, input_encoded))

        # 5. Predict
        prediction_class = model.predict(input_processed)
        prediction_prob = model.predict_proba(input_processed)[:, 1] # Probability of Class 1

        results = []
        for i in range(len(student_data)):
            results.append({
                "Input": student_data[i],
                "Prediction": "Placed" if prediction_class[i] == 1 else "Not Placed",
                "Probability": f"{prediction_prob[i] * 100:.2f}%"
            })

        return results

    except Exception as e:
        return f"Error during prediction: {str(e)}"

# ==========================================
# MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    # Step 1: Generate Data
    df = generate_dataset()

    # Step 2: Preprocess
    X, y, encoder, scaler, feature_names = preprocess_data(df)

    # Split Data (80% Train, 20% Test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 3 & 4: Train and Evaluate
    best_model = train_and_evaluate(X_train, X_test, y_train, y_test)

    # Step 5: Save Model
    save_artifacts(best_model, scaler, encoder)

    # Step 6: Test Prediction Function with a new student
    print("\nðŸ”® Testing Prediction Function...")

    new_student = [{
        'department': 'CSE',
        'cgpa': 8.5,
        'active_backlogs': 0,
        'skill_stack': 'Web Development', # Added closing quote
        'internships': 2,
        'projects': 5,
        'aptitude_score': 90,
        'communication_score': 9
    },{
        'department': 'MECH',
        'cgpa': 6.2,
        'active_backlogs': 1,
        'skill_stack': 'Core',
        'internships': 0,
        'projects': 2,
        'aptitude_score': 65,
        'communication_score': 6
    }]

    predictions = predict_placement(new_student)
    print(predictions)

âœ… Dataset generated and saved as 'placement_data.csv'

ðŸš€ Training Models...
------------------------------------------------------------
Logistic Regression: âœ… Trained | F1: 0.8527
Decision Tree: âœ… Trained | F1: 0.7761
Random Forest: âœ… Trained | F1: 0.8618

ðŸ“Š Model Comparison Report:
              Model  Accuracy  Precision   Recall  F1-Score
      Random Forest     0.915   0.868852 0.854839  0.861789
Logistic Regression     0.905   0.820896 0.887097  0.852713
      Decision Tree     0.850   0.722222 0.838710  0.776119

ðŸ’¾ Artifacts saved successfully (placement_model.pkl, scaler.pkl, encoder.pkl)

ðŸ”® Testing Prediction Function...
[{'Input': {'department': 'CSE', 'cgpa': 8.5, 'active_backlogs': 0, 'skill_stack': 'Web Development', 'internships': 2, 'projects': 5, 'aptitude_score': 90, 'communication_score': 9}, 'Prediction': 'Placed', 'Probability': '88.00%'}, {'Input': {'department': 'MECH', 'cgpa': 6.2, 'active_backlogs': 1, 'skill_stack': 'Core', 'internships': 0,