In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Load the dataset, falling back to synthetic data when the file is absent.
# In a real-world scenario, you would use your actual dataset.
try:
    df = pd.read_csv(
        '2012-2013-data-with-predictions-4-final.csv',
        encoding='latin1',
        on_bad_lines='skip',
    )
except FileNotFoundError:
    # Only a missing file triggers the fallback. Any other failure (parse
    # error, permission problem, ...) propagates, so real data is never
    # silently replaced by synthetic data.
    print("Dataset not found. Creating synthetic data for demonstration...")
    np.random.seed(42)  # fixed seed keeps the synthetic data reproducible
    n_samples = 10000

    # Generate synthetic features
    skill_names = np.random.randint(0, 20, n_samples)  # 20 different skills
    attempt_counts = np.random.randint(1, 10, n_samples)  # 1..9 attempts
    ms_first_response = np.random.randint(1000, 30000, n_samples)  # time in milliseconds
    tutor_modes = np.random.randint(0, 3, n_samples)  # 3 different tutor modes

    # Probability of a correct answer rises with:
    # - more attempts (practice effect): +0.05 per attempt
    # - faster first response: bonus between ~0.06 (30 s) and ~0.33 (1 s)
    # - tutor mode 1: +0.1
    # The response-time bonus must stay well below 1; a larger numerator
    # pushes every sample past the upper clip bound, which makes the label
    # independent of all features.
    p_correct = (
        0.3
        + 0.05 * attempt_counts
        + 2000 / (ms_first_response + 5000)
        + 0.1 * (tutor_modes == 1)
    )
    p_correct = np.clip(p_correct, 0.1, 0.9)  # Keep probabilities reasonable
    correct = np.random.binomial(1, p_correct)

    # Create user_ids
    user_ids = np.random.randint(1000, 5000, n_samples)

    # Create DataFrame
    df = pd.DataFrame({
        'user_id': user_ids,
        'skill_name': [f"Skill_{i}" for i in skill_names],
        'correct': correct,
        'attempt_count': attempt_counts,
        'ms_first_response': ms_first_response,
        'tutor_mode': [f"Mode_{i}" for i in tutor_modes],
    })

print("Initial Data Sample:")
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in — confirm before running as a plain script.
display(df.head())

# Basic data preprocessing
print("\nData Overview:")
print(f"Shape: {df.shape}")
print("\nMissing values:")
print(df.isnull().sum())

# Drop rows with missing values. The explicit copy makes df_cleaned an
# independent frame, so the column assignments below write to it directly
# instead of to a possible view of df (chained assignment).
df_cleaned = df.dropna().copy()
print(f"\nShape after dropping missing values: {df_cleaned.shape}")

# Encode categorical variables as integer category codes
df_cleaned['skill_name_code'] = df_cleaned['skill_name'].astype('category').cat.codes
df_cleaned['tutor_mode_code'] = df_cleaned['tutor_mode'].astype('category').cat.codes

# Feature engineering
print("\nFeature Engineering:")
# Converting response time to seconds for better interpretability
df_cleaned['response_time_sec'] = df_cleaned['ms_first_response'] / 1000
# Bucket the response time (in seconds) into four speed categories with
# edges at 5, 15 and 30 seconds.
df_cleaned['response_speed'] = pd.cut(
    df_cleaned['response_time_sec'],
    bins=[0, 5, 15, 30, float('inf')],
    labels=['very_fast', 'fast', 'medium', 'slow']
)

# Data visualization: one figure with two side-by-side panels
fig, (ax_attempts, ax_time) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: mean of 'correct' for each attempt count
attempt_success = df_cleaned.groupby('attempt_count')['correct'].mean()
attempt_success.plot(kind='bar', color='skyblue', ax=ax_attempts)
ax_attempts.set_title('Success Rate by Attempt Count')
ax_attempts.set_xlabel('Number of Attempts')
ax_attempts.set_ylabel('Success Rate')

# Right panel: response-time density, one curve per value of 'correct',
# each normalised on its own and drawn on a log-scaled y axis
sns.histplot(
    data=df_cleaned,
    x='response_time_sec',
    hue='correct',
    bins=30,
    kde=True,
    element="step",
    common_norm=False,
    stat="density",
    log_scale=(False, True),
    ax=ax_time,
)
ax_time.set_title('Response Time Distribution by Correctness')
ax_time.set_xlabel('Response Time (seconds)')
ax_time.set_ylabel('Density')
ax_time.set_xlim(0, 60)  # Limit to 60 seconds for better visualization

fig.tight_layout()
plt.show()

# Feature selection for ML: four numeric/encoded predictors, binary target
print("\nPreparing features for machine learning...")
features = ['skill_name_code', 'attempt_count', 'ms_first_response', 'tutor_mode_code']

X, y = df_cleaned[features], df_cleaned['correct']

# Hold out 20% of the rows for testing, keeping the class balance of y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing and model chained into a single estimator
scaler_step = ('scaler', StandardScaler())
classifier_step = ('classifier', RandomForestClassifier(random_state=42))
pipeline = Pipeline([scaler_step, classifier_step])

# Hyperparameter grid; keys use the '<step>__<param>' pipeline convention
param_grid = dict(
    classifier__n_estimators=[50, 100],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
)

# 5-fold cross-validated grid search scored by F1, using all CPU cores
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
print("\nPerforming grid search for hyperparameter tuning...")
grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax_cm)
ax_cm.set_title("Confusion Matrix")
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("Actual")
plt.show()

# ROC curve built from the predicted probability of the second class column
y_prob = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1)
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
ax_roc.legend(loc="lower right")
plt.show()

# Feature importance: bar chart of the final pipeline step's importances,
# sorted from most to least important
if hasattr(best_model[-1], 'feature_importances_'):
    importances = best_model[-1].feature_importances_
    indices = np.argsort(importances)[::-1]
    bar_positions = range(X.shape[1])
    bar_labels = [X.columns[i] for i in indices]

    fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
    ax_imp.set_title("Feature Importances")
    ax_imp.bar(bar_positions, importances[indices], align="center")
    ax_imp.set_xticks(bar_positions)
    ax_imp.set_xticklabels(bar_labels, rotation=45)
    fig_imp.tight_layout()
    plt.show()

# Create a function for predictions
def predict_performance(model, skill_name_code, attempts, ms_time, tutor_mode_code):
    """
    Predict whether a single response will be correct.

    The four inputs are packed into a one-row DataFrame and passed to the
    model's ``predict`` and ``predict_proba`` methods.

    Parameters:
    -----------
    model: fitted estimator exposing ``predict`` and ``predict_proba``
    skill_name_code: int, encoded skill name
    attempts: int, number of attempts
    ms_time: int, response time in milliseconds
    tutor_mode_code: int, encoded tutor mode

    Returns:
    --------
    prediction: str, "Correct" when the predicted label equals 1,
        otherwise "Incorrect"
    probability: float, second column of ``predict_proba`` for the row
    """
    column_names = ('skill_name_code', 'attempt_count', 'ms_first_response', 'tutor_mode_code')
    column_values = (skill_name_code, attempts, ms_time, tutor_mode_code)
    single_row = pd.DataFrame([dict(zip(column_names, column_values))])

    predicted_label = model.predict(single_row)[0]
    class_probabilities = model.predict_proba(single_row)[0]

    if predicted_label == 1:
        outcome = "Correct"
    else:
        outcome = "Incorrect"
    return outcome, class_probabilities[1]

# Example prediction for one hand-picked input (response_time is in ms)
skill_code, attempts, response_time, tutor_mode = 10, 3, 5000, 1

result, prob = predict_performance(
    best_model,
    skill_name_code=skill_code,
    attempts=attempts,
    ms_time=response_time,
    tutor_mode_code=tutor_mode,
)
print(f"\nPrediction for test input (skill={skill_code}, attempts={attempts}, response_time={response_time}ms, tutor_mode={tutor_mode}):")
print(f"Prediction: {result} (Probability of being correct: {prob:.2f})")

# Create a simple interactive prediction function
def interactive_predictions(model, df, attempt_values=(1, 3, 5),
                            time_values=(2000, 10000, 20000),
                            n_skills=3, n_modes=2):
    """
    Print a table of predictions for a small grid of feature combinations.

    The grid is the Cartesian product of the first ``n_skills`` sorted skill
    codes found in ``df``, ``attempt_values``, ``time_values`` and the first
    ``n_modes`` sorted tutor-mode codes found in ``df``. Each combination is
    scored with ``predict_performance``. When ``df`` has no rows (or a count
    is 0) only the table header is printed.

    Parameters:
    -----------
    model: fitted estimator passed through to ``predict_performance``
    df: DataFrame with 'skill_name_code' and 'tutor_mode_code' columns
    attempt_values: iterable of int, attempt counts to evaluate
    time_values: iterable of int, response times in milliseconds to evaluate
    n_skills: int, how many of the sorted skill codes to include
    n_modes: int, how many of the sorted tutor-mode codes to include

    Returns:
    --------
    None; the table is written to standard output.
    """
    from itertools import product

    # Only the distinct codes are needed from the data; the rest of the grid
    # comes from the arguments.
    skills = sorted(df['skill_name_code'].unique())[:n_skills]
    modes = sorted(df['tutor_mode_code'].unique())[:n_modes]

    print("\nPredictions for various parameter combinations:")
    print("-----------------------------------------------")
    print(f"{'Skill':<6} {'Attempts':<10} {'Response Time':<15} {'Tutor Mode':<12} {'Prediction':<12} {'Probability':<12}")
    print("-" * 70)

    # product() varies the right-most argument fastest, i.e. the same order
    # as nesting the loops skill -> attempts -> time -> mode.
    for skill, attempts, ms_time, mode in product(skills, attempt_values, time_values, modes):
        result, prob = predict_performance(model, skill, attempts, ms_time, mode)
        print(f"{skill:<6} {attempts:<10} {ms_time:<15} {mode:<12} {result:<12} {prob:.2f}")

# Print the prediction grid for the tuned model
interactive_predictions(best_model, df_cleaned)

# Persist the fitted pipeline for future use
import joblib
joblib.dump(best_model, 'student_performance_model.pkl')
print("\nModel saved as 'student_performance_model.pkl'")

# Closing recap of what the script did
print("\nSummary:")
summary_lines = (
    "1. Created a robust student performance prediction model",
    "2. Performed feature engineering and visualization",
    "3. Used hyperparameter tuning to optimize the model",
    "4. Evaluated model performance with classification metrics and ROC curve",
    "5. Built a prediction function for real-time assessment",
    "6. Saved the model for future use",
)
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Load the dataset, falling back to synthetic data when the file is absent.
# In a real-world scenario, you would use your actual dataset.
try:
    df = pd.read_csv(
        '2012-2013-data-with-predictions-4-final.csv',
        encoding='latin1',
        on_bad_lines='skip',
    )
except FileNotFoundError:
    # Only a missing file triggers the fallback. Any other failure (parse
    # error, permission problem, ...) propagates, so real data is never
    # silently replaced by synthetic data.
    print("Dataset not found. Creating synthetic data for demonstration...")
    np.random.seed(42)  # fixed seed keeps the synthetic data reproducible
    n_samples = 10000

    # Generate synthetic features
    skill_names = np.random.randint(0, 20, n_samples)  # 20 different skills
    attempt_counts = np.random.randint(1, 10, n_samples)  # 1..9 attempts
    ms_first_response = np.random.randint(1000, 30000, n_samples)  # time in milliseconds
    tutor_modes = np.random.randint(0, 3, n_samples)  # 3 different tutor modes

    # Probability of a correct answer rises with:
    # - more attempts (practice effect): +0.05 per attempt
    # - faster first response: bonus between ~0.06 (30 s) and ~0.33 (1 s)
    # - tutor mode 1: +0.1
    # The response-time bonus must stay well below 1; a larger numerator
    # pushes every sample past the upper clip bound, which makes the label
    # independent of all features.
    p_correct = (
        0.3
        + 0.05 * attempt_counts
        + 2000 / (ms_first_response + 5000)
        + 0.1 * (tutor_modes == 1)
    )
    p_correct = np.clip(p_correct, 0.1, 0.9)  # Keep probabilities reasonable
    correct = np.random.binomial(1, p_correct)

    # Create user_ids
    user_ids = np.random.randint(1000, 5000, n_samples)

    # Create DataFrame
    df = pd.DataFrame({
        'user_id': user_ids,
        'skill_name': [f"Skill_{i}" for i in skill_names],
        'correct': correct,
        'attempt_count': attempt_counts,
        'ms_first_response': ms_first_response,
        'tutor_mode': [f"Mode_{i}" for i in tutor_modes],
    })

print("Initial Data Sample:")
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in — confirm before running as a plain script.
display(df.head())

# Basic data preprocessing
print("\nData Overview:")
print(f"Shape: {df.shape}")
print("\nMissing values:")
print(df.isnull().sum())

# Drop rows with missing values. The explicit copy makes df_cleaned an
# independent frame, so the column assignments below write to it directly
# instead of to a possible view of df (chained assignment).
df_cleaned = df.dropna().copy()
print(f"\nShape after dropping missing values: {df_cleaned.shape}")

# Encode categorical variables as integer category codes
df_cleaned['skill_name_code'] = df_cleaned['skill_name'].astype('category').cat.codes
df_cleaned['tutor_mode_code'] = df_cleaned['tutor_mode'].astype('category').cat.codes

# Feature engineering
print("\nFeature Engineering:")
# Converting response time to seconds for better interpretability
df_cleaned['response_time_sec'] = df_cleaned['ms_first_response'] / 1000
# Bucket the response time (in seconds) into four speed categories with
# edges at 5, 15 and 30 seconds.
df_cleaned['response_speed'] = pd.cut(
    df_cleaned['response_time_sec'],
    bins=[0, 5, 15, 30, float('inf')],
    labels=['very_fast', 'fast', 'medium', 'slow']
)

# Data visualization: one figure with two side-by-side panels
fig, (ax_attempts, ax_time) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: mean of 'correct' for each attempt count
attempt_success = df_cleaned.groupby('attempt_count')['correct'].mean()
attempt_success.plot(kind='bar', color='skyblue', ax=ax_attempts)
ax_attempts.set_title('Success Rate by Attempt Count')
ax_attempts.set_xlabel('Number of Attempts')
ax_attempts.set_ylabel('Success Rate')

# Right panel: response-time density, one curve per value of 'correct',
# each normalised on its own and drawn on a log-scaled y axis
sns.histplot(
    data=df_cleaned,
    x='response_time_sec',
    hue='correct',
    bins=30,
    kde=True,
    element="step",
    common_norm=False,
    stat="density",
    log_scale=(False, True),
    ax=ax_time,
)
ax_time.set_title('Response Time Distribution by Correctness')
ax_time.set_xlabel('Response Time (seconds)')
ax_time.set_ylabel('Density')
ax_time.set_xlim(0, 60)  # Limit to 60 seconds for better visualization

fig.tight_layout()
plt.show()

# Feature selection for ML: four numeric/encoded predictors, binary target
print("\nPreparing features for machine learning...")
features = ['skill_name_code', 'attempt_count', 'ms_first_response', 'tutor_mode_code']

X, y = df_cleaned[features], df_cleaned['correct']

# Hold out 20% of the rows for testing, keeping the class balance of y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing and model chained into a single estimator
scaler_step = ('scaler', StandardScaler())
classifier_step = ('classifier', RandomForestClassifier(random_state=42))
pipeline = Pipeline([scaler_step, classifier_step])

# Hyperparameter grid; keys use the '<step>__<param>' pipeline convention
param_grid = dict(
    classifier__n_estimators=[50, 100],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
)

# 5-fold cross-validated grid search scored by F1, using all CPU cores
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
print("\nPerforming grid search for hyperparameter tuning...")
grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax_cm)
ax_cm.set_title("Confusion Matrix")
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("Actual")
plt.show()

# ROC curve built from the predicted probability of the second class column
y_prob = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1)
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
ax_roc.legend(loc="lower right")
plt.show()

# Feature importance: bar chart of the final pipeline step's importances,
# sorted from most to least important
if hasattr(best_model[-1], 'feature_importances_'):
    importances = best_model[-1].feature_importances_
    indices = np.argsort(importances)[::-1]
    bar_positions = range(X.shape[1])
    bar_labels = [X.columns[i] for i in indices]

    fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
    ax_imp.set_title("Feature Importances")
    ax_imp.bar(bar_positions, importances[indices], align="center")
    ax_imp.set_xticks(bar_positions)
    ax_imp.set_xticklabels(bar_labels, rotation=45)
    fig_imp.tight_layout()
    plt.show()

# Create a function for predictions
def predict_performance(model, skill_name_code, attempts, ms_time, tutor_mode_code):
    """
    Predict whether a single response will be correct.

    The four inputs are packed into a one-row DataFrame and passed to the
    model's ``predict`` and ``predict_proba`` methods.

    Parameters:
    -----------
    model: fitted estimator exposing ``predict`` and ``predict_proba``
    skill_name_code: int, encoded skill name
    attempts: int, number of attempts
    ms_time: int, response time in milliseconds
    tutor_mode_code: int, encoded tutor mode

    Returns:
    --------
    prediction: str, "Correct" when the predicted label equals 1,
        otherwise "Incorrect"
    probability: float, second column of ``predict_proba`` for the row
    """
    column_names = ('skill_name_code', 'attempt_count', 'ms_first_response', 'tutor_mode_code')
    column_values = (skill_name_code, attempts, ms_time, tutor_mode_code)
    single_row = pd.DataFrame([dict(zip(column_names, column_values))])

    predicted_label = model.predict(single_row)[0]
    class_probabilities = model.predict_proba(single_row)[0]

    if predicted_label == 1:
        outcome = "Correct"
    else:
        outcome = "Incorrect"
    return outcome, class_probabilities[1]

# Example prediction for one hand-picked input (response_time is in ms)
skill_code, attempts, response_time, tutor_mode = 10, 3, 5000, 1

result, prob = predict_performance(
    best_model,
    skill_name_code=skill_code,
    attempts=attempts,
    ms_time=response_time,
    tutor_mode_code=tutor_mode,
)
print(f"\nPrediction for test input (skill={skill_code}, attempts={attempts}, response_time={response_time}ms, tutor_mode={tutor_mode}):")
print(f"Prediction: {result} (Probability of being correct: {prob:.2f})")

# Create a simple interactive prediction function
def interactive_predictions(model, df, attempt_values=(1, 3, 5),
                            time_values=(2000, 10000, 20000),
                            n_skills=3, n_modes=2):
    """
    Print a table of predictions for a small grid of feature combinations.

    The grid is the Cartesian product of the first ``n_skills`` sorted skill
    codes found in ``df``, ``attempt_values``, ``time_values`` and the first
    ``n_modes`` sorted tutor-mode codes found in ``df``. Each combination is
    scored with ``predict_performance``. When ``df`` has no rows (or a count
    is 0) only the table header is printed.

    Parameters:
    -----------
    model: fitted estimator passed through to ``predict_performance``
    df: DataFrame with 'skill_name_code' and 'tutor_mode_code' columns
    attempt_values: iterable of int, attempt counts to evaluate
    time_values: iterable of int, response times in milliseconds to evaluate
    n_skills: int, how many of the sorted skill codes to include
    n_modes: int, how many of the sorted tutor-mode codes to include

    Returns:
    --------
    None; the table is written to standard output.
    """
    from itertools import product

    # Only the distinct codes are needed from the data; the rest of the grid
    # comes from the arguments.
    skills = sorted(df['skill_name_code'].unique())[:n_skills]
    modes = sorted(df['tutor_mode_code'].unique())[:n_modes]

    print("\nPredictions for various parameter combinations:")
    print("-----------------------------------------------")
    print(f"{'Skill':<6} {'Attempts':<10} {'Response Time':<15} {'Tutor Mode':<12} {'Prediction':<12} {'Probability':<12}")
    print("-" * 70)

    # product() varies the right-most argument fastest, i.e. the same order
    # as nesting the loops skill -> attempts -> time -> mode.
    for skill, attempts, ms_time, mode in product(skills, attempt_values, time_values, modes):
        result, prob = predict_performance(model, skill, attempts, ms_time, mode)
        print(f"{skill:<6} {attempts:<10} {ms_time:<15} {mode:<12} {result:<12} {prob:.2f}")

# Print the prediction grid for the tuned model
interactive_predictions(best_model, df_cleaned)

# Persist the fitted pipeline for future use
import joblib
joblib.dump(best_model, 'student_performance_model.pkl')
print("\nModel saved as 'student_performance_model.pkl'")

# Closing recap of what the script did
print("\nSummary:")
summary_lines = (
    "1. Created a robust student performance prediction model",
    "2. Performed feature engineering and visualization",
    "3. Used hyperparameter tuning to optimize the model",
    "4. Evaluated model performance with classification metrics and ROC curve",
    "5. Built a prediction function for real-time assessment",
    "6. Saved the model for future use",
)
for summary_line in summary_lines:
    print(summary_line)