In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Train a RandomForestRegressor on the workload dataset, report hold-out
# error metrics, predict for one example row, and save the test-set
# predictions to CSV.

# Feature and target column names are defined once so that the training frame
# and any later prediction input use the same names in the same order.
FEATURE_COLUMNS = [
    'assignment_difficulty',
    'active_assignments_count',
    'question_type_distribution',
    'workload',
]
TARGET_COLUMN = 'historical_avg_completion_time'

# Load the dataset with the workload column
df = pd.read_csv('../data/dataset_with_workload.csv')  # Replace with your CSV file path

# Features now include the workload column
X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]  # Target variable

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForest model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate error metrics
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# R² is not defined for fewer than two samples: r2_score would emit an
# UndefinedMetricWarning and return NaN. With a very small dataset the 20%
# hold-out can contain a single row, so guard the call and say why the
# score is missing instead of printing a bare "nan".
if len(y_test) < 2:
    r2 = float('nan')
    print(
        f'R² Score: undefined (test set has {len(y_test)} sample(s); '
        'at least 2 are required - use a larger dataset or test_size)'
    )
else:
    r2 = r2_score(y_test, y_pred)
    print(f'R² Score: {r2}')

# Example of predicting the number of days with new data.
# The model was fitted on a DataFrame, so the input is given as a DataFrame
# with the same column names; a bare list would trigger scikit-learn's
# "X does not have valid feature names" warning and silently depend on the
# values being listed in the right order.
new_data = pd.DataFrame([[4, 2, 0.52, 3.0]], columns=FEATURE_COLUMNS)
predicted_days = model.predict(new_data)
print(f'Predicted days to complete assignment: {predicted_days[0]}')

# Save the test set's actual and predicted values to a CSV file
output_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
output_df.to_csv('assignment_predictions.csv', index=False)
print("Predictions saved to 'assignment_predictions.csv'")


Mean Absolute Error: 1.6989000000000036
R² Score: nan
Predicted days to complete assignment: 4.5505999999999975
Predictions saved to 'assignment_predictions.csv'


