In [5]:
# Decision Tree Implementation - From Training to Production

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib  # For saving/loading models
import numpy as np

# ========== STEP 1: TRAINING PHASE ==========
print("=== TRAINING PHASE ===")

# Toy training set for student performance prediction.
# Each key is a column; position i across the lists describes student i.
data = dict(
    study_hours=[2, 8, 5, 1, 9, 3, 7, 4, 6, 10],
    attendance=[60, 95, 80, 40, 98, 65, 90, 75, 85, 100],
    previous_grade=[70, 85, 78, 45, 92, 68, 88, 72, 82, 95],
    result=['Fail', 'Pass', 'Pass', 'Fail', 'Pass', 'Fail', 'Pass', 'Pass', 'Pass', 'Pass'],
)

df = pd.DataFrame(data)

# Heading, the full table, then one blank separator line.
print("Training Data:", df, "", sep="\n")

=== TRAINING PHASE ===
Training Data:
   study_hours  attendance  previous_grade result
0            2          60              70   Fail
1            8          95              85   Pass
2            5          80              78   Pass
3            1          40              45   Fail
4            9          98              92   Pass
5            3          65              68   Fail
6            7          90              88   Pass
7            4          75              72   Pass
8            6          85              82   Pass
9           10         100              95   Pass



In [7]:
# Separate the predictor columns (X) from the label column (y)
feature_columns = ['study_hours', 'attendance', 'previous_grade']
X, y = df[feature_columns], df['result']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Build a shallow decision tree (depth capped at 3) and fit it on the training rows
dt_model = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Score the held-out rows; the trailing newline in the message gives the blank separator line
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}\n")

Model Accuracy: 1.00



In [None]:
# ========== STEP 2: SAVE THE MODEL ==========
print("=== SAVING MODEL FOR PRODUCTION ===")

# Persist the fitted tree to disk so a separate process can reuse it
joblib.dump(dt_model, 'student_performance_model.pkl')
print("✅ Model saved as 'student_performance_model.pkl'", end="\n\n")

# ========== STEP 3: PRODUCTION PHASE - NEW DATA ARRIVES ==========
print("=== PRODUCTION PHASE - PREDICTING NEW DATA ===")

# Read the model back from disk (a production service would do this once at startup).
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_model = joblib.load('student_performance_model.pkl')
print("✅ Model loaded successfully", end="\n\n")

# ========== SCENARIO 1: Single New Student ==========
print("--- Scenario 1: Single New Student ---")

# New student data arrives
new_student = {
    'study_hours': 6,
    'attendance': 88,
    'previous_grade': 79
}

# Convert to a one-row DataFrame with the same column names used for training
new_student_df = pd.DataFrame([new_student])
print("New Student Data:")
print(new_student_df)

# Make prediction
prediction = loaded_model.predict(new_student_df)
prediction_proba = loaded_model.predict_proba(new_student_df)

# predict_proba columns are ordered like loaded_model.classes_, so look each
# probability up by its label instead of assuming column 0 is 'Fail' and
# column 1 is 'Pass'. A label the model never saw reports probability 0.
single_proba_by_label = {
    str(label): proba
    for label, proba in zip(loaded_model.classes_, prediction_proba[0])
}
fail_probability = single_proba_by_label.get('Fail', 0.0)
pass_probability = single_proba_by_label.get('Pass', 0.0)

print(f"Prediction: {prediction[0]}")
print(f"Probability - Fail: {fail_probability:.3f}, Pass: {pass_probability:.3f}")
print()

# ========== SCENARIO 2: Batch of New Students ==========
print("--- Scenario 2: Batch Processing ---")

# Multiple new students arrive (one list entry per student in each column)
new_batch = {
    'study_hours': [3, 9, 5, 7],
    'attendance': [65, 96, 82, 89],
    'previous_grade': [55, 91, 76, 84]
}

new_batch_df = pd.DataFrame(new_batch)
print("New Batch Data:")
print(new_batch_df)

# Batch predictions: one label and one probability row per student
batch_predictions = loaded_model.predict(new_batch_df)
batch_probabilities = loaded_model.predict_proba(new_batch_df)

# Locate the 'Pass' column through classes_ rather than assuming it is column 1;
# None means the model was fitted without any 'Pass' examples.
batch_pass_column = {
    str(label): column for column, label in enumerate(loaded_model.classes_)
}.get('Pass')

print("Batch Predictions:")
for student_number, (pred, prob) in enumerate(zip(batch_predictions, batch_probabilities), start=1):
    batch_pass_probability = prob[batch_pass_column] if batch_pass_column is not None else 0.0
    print(f"Student {student_number}: {pred} (Pass probability: {batch_pass_probability:.3f})")
print()

# ========== SCENARIO 3: REAL-TIME API FUNCTION ==========
print("--- Scenario 3: Real-time API Function ---")

def predict_student_performance(study_hours, attendance, previous_grade, model=None):
    """
    Predict the outcome for one student; intended to be called by an API endpoint.

    Parameters
    ----------
    study_hours, attendance, previous_grade :
        Feature values for a single student, passed to the model under the
        column names 'study_hours', 'attendance' and 'previous_grade'.
    model : optional
        An already-loaded classifier exposing ``predict``, ``predict_proba``
        and ``classes_``. When omitted, the model is read from
        'student_performance_model.pkl' on every call; a long-running service
        should load it once at startup and pass it in.

    Returns
    -------
    dict
        'prediction'       - predicted label as a plain Python value
        'pass_probability' - probability of the 'Pass' class rounded to 3
                             decimals (0.0 if the model has no 'Pass' class)
        'confidence'       - 'High' if the top class probability is > 0.8,
                             'Medium' if > 0.6, otherwise 'Low'
    """
    if model is None:
        # NOTE: joblib files are pickle-based; only load files from a trusted source.
        model = joblib.load('student_performance_model.pkl')

    # Single-row frame with the column names the model was trained on
    input_data = pd.DataFrame({
        'study_hours': [study_hours],
        'attendance': [attendance],
        'previous_grade': [previous_grade]
    })

    prediction = model.predict(input_data)[0]
    probability = model.predict_proba(input_data)[0]

    # predict_proba columns follow model.classes_; look 'Pass' up by label
    # instead of assuming it sits in column 1.
    proba_by_label = {
        str(label): float(proba) for label, proba in zip(model.classes_, probability)
    }
    top_probability = float(max(probability))

    if top_probability > 0.8:
        confidence = 'High'
    elif top_probability > 0.6:
        confidence = 'Medium'
    else:
        confidence = 'Low'

    return {
        # Unwrap NumPy scalars so the payload holds plain Python values
        'prediction': prediction.item() if hasattr(prediction, 'item') else prediction,
        'pass_probability': round(proba_by_label.get('Pass', 0.0), 3),
        'confidence': confidence
    }

# Try the API function on one strong and one weak student profile
student_a_features = dict(study_hours=8, attendance=92, previous_grade=85)
student_b_features = dict(study_hours=2, attendance=45, previous_grade=50)

result1 = predict_student_performance(**student_a_features)
result2 = predict_student_performance(**student_b_features)

# Heading, one line per student, then a blank separator line
print(
    "API Function Results:",
    f"Student A: {result1}",
    f"Student B: {result2}",
    "",
    sep="\n",
)

# ========== SCENARIO 4: HANDLING NEW DATA WITH VALIDATION ==========
print("--- Scenario 4: Production with Data Validation ---")

def robust_prediction(input_data, model=None):
    """
    Validate a batch of student rows and predict their outcomes.

    Every failure is reported as a dict rather than raised, so the function
    never propagates an exception to the caller.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain the columns 'study_hours', 'attendance' and
        'previous_grade'. Extra columns and a different column order are
        tolerated; only the required columns are passed to the model.
    model : optional
        An already-loaded classifier exposing ``predict``, ``predict_proba``
        and ``classes_``. When omitted, the model is read from
        'student_performance_model.pkl' on every call.

    Returns
    -------
    dict
        On success: {"success": True, "results": [...]} with one entry per
        input row holding 'student_id' (1-based row position), 'prediction',
        'pass_probability' and 'fail_probability' (rounded to 3 decimals).
        On failure: {"error": "<message>"}.
    """
    required_columns = ['study_hours', 'attendance', 'previous_grade']

    try:
        if model is None:
            # NOTE: joblib files are pickle-based; only load files from a trusted source.
            model = joblib.load('student_performance_model.pkl')

        # Validate input data
        if not all(col in input_data.columns for col in required_columns):
            return {"error": f"Missing required columns: {required_columns}"}

        # Basic range validation
        if (input_data['study_hours'] < 0).any() or (input_data['study_hours'] > 12).any():
            return {"error": "Study hours must be between 0-12"}

        if (input_data['attendance'] < 0).any() or (input_data['attendance'] > 100).any():
            return {"error": "Attendance must be between 0-100"}

        # Pass the model exactly the required columns, in this fixed order.
        # Handing over the raw frame would make prediction fail for inputs
        # that passed validation but carry extra or reordered columns.
        features = input_data[required_columns]

        predictions = model.predict(features)
        probabilities = model.predict_proba(features)

        # predict_proba columns follow model.classes_; resolve them by label
        # rather than assuming column 0 is 'Fail' and column 1 is 'Pass'.
        column_of = {str(label): column for column, label in enumerate(model.classes_)}

        def _probability(row, label):
            # A class the model never saw during fitting has probability 0
            column = column_of.get(label)
            return round(float(row[column]), 3) if column is not None else 0.0

        results = [
            {
                'student_id': position,
                # Unwrap NumPy scalars so the payload holds plain Python values
                'prediction': pred.item() if hasattr(pred, 'item') else pred,
                'pass_probability': _probability(prob, 'Pass'),
                'fail_probability': _probability(prob, 'Fail')
            }
            for position, (pred, prob) in enumerate(zip(predictions, probabilities), start=1)
        ]

        return {"success": True, "results": results}

    except FileNotFoundError:
        return {"error": "Model file not found"}
    except Exception as e:
        # Deliberate catch-all: callers receive an error payload, never a raised exception
        return {"error": f"Prediction failed: {str(e)}"}

# Exercise the validated prediction path with two well-formed rows
test_data = pd.DataFrame(
    [(7, 85, 80), (3, 60, 65)],
    columns=['study_hours', 'attendance', 'previous_grade'],
)

robust_result = robust_prediction(test_data)
print("Robust Prediction Result:", robust_result, sep="\n")

# Closing recap of the workflow, emitted one line per entry
summary_lines = (
    "\n=== IMPLEMENTATION SUMMARY ===",
    "1. Train model on historical data",
    "2. Save model using joblib.dump()",
    "3. Load model in production using joblib.load()",
    "4. Create prediction functions for different scenarios:",
    "   - Single predictions",
    "   - Batch processing",
    "   - API endpoints",
    "   - Error handling & validation",
    "5. Model makes predictions on new data with same features",
)
print("\n".join(summary_lines))

=== TRAINING PHASE ===
Training Data:
   study_hours  attendance  previous_grade result
0            2          60              70   Fail
1            8          95              85   Pass
2            5          80              78   Pass
3            1          40              45   Fail
4            9          98              92   Pass
5            3          65              68   Fail
6            7          90              88   Pass
7            4          75              72   Pass
8            6          85              82   Pass
9           10         100              95   Pass

Model Accuracy: 1.00

=== SAVING MODEL FOR PRODUCTION ===
✅ Model saved as 'student_performance_model.pkl'

=== PRODUCTION PHASE - PREDICTING NEW DATA ===
✅ Model loaded successfully

--- Scenario 1: Single New Student ---
New Student Data:
   study_hours  attendance  previous_grade
0            6          88              79
Prediction: Pass
Probability - Fail: 0.000, Pass: 1.000

--- Scenario 2: Batch Proce