In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pickle

# Train a gradient-boosting regressor that predicts the long-question
# proportion from the remaining dataset columns, evaluate it on a held-out
# split, export the test-set predictions to CSV and pickle the fitted model.

# --- Configuration: everything a reader might want to tune -------------------
DATA_PATH = '../data/ques_distb_data.csv'
TARGET_COLUMN = 'Long_Question_Proportion'
PREDICTIONS_PATH = 'predicted_long_question_proportion.csv'
MODEL_PATH = 'question_distribution_model.pkl'
TEST_SIZE = 0.2
SEED = 42  # shared by the split and the model so reruns are reproducible

# --- Load data ---------------------------------------------------------------
df = pd.read_csv(DATA_PATH)

# Every column except the target is used as a feature.
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN]

# --- Train / test split ------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# --- Fit the model -----------------------------------------------------------
model = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=SEED
)
model.fit(X_train, y_train)

# --- Predict and post-process ------------------------------------------------
# The regressor is unbounded, so it can overshoot on either side. The original
# cell only removed negative values; a proportion is also capped at 1, so clamp
# both ends before reporting.
# NOTE(review): assumes the target is expressed on a 0-1 scale (not 0-100) —
# confirm against the dataset.
y_pred = np.clip(model.predict(X_test), 0.0, 1.0)

# Round the predictions to 2 decimal places
y_pred = np.round(y_pred, 2)

# --- Evaluate ----------------------------------------------------------------
# Metrics are computed on the post-processed (clamped + rounded) predictions,
# i.e. on exactly the values that are written to the CSV below.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")

# --- Export test-set predictions ---------------------------------------------
# Test features side by side with the actual and predicted target.
# `.values` strips the index of y_test so the column is assigned positionally.
output_df = X_test.copy()
output_df['Actual Long Question Proportion'] = y_test.values
output_df['Predicted Long Question Proportion'] = y_pred

output_df.to_csv(PREDICTIONS_PATH, index=False)

print(f"Predictions saved to '{PREDICTIONS_PATH}'")

# --- Persist the fitted model ------------------------------------------------
# NOTE: only the raw estimator is pickled. Its predictions are NOT clamped or
# rounded, so any consumer of this file must re-apply the post-processing above.
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(model, f)
# NOTE(review): the message says "Assignment prediction" but the artifact saved
# here is the question-distribution model — confirm the intended wording.
print("Assignment prediction model saved.")



Mean Squared Error: 0.0032
R-squared: 0.9306
Predictions saved to 'predicted_long_question_proportion.csv'
Assignment prediction model saved.
