In [10]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# -------------------------
# 1. Load the dataset
# -------------------------
df = pd.read_csv("tasks.csv")  # Ensure this CSV is in the same directory

# Drop rows with missing values in the 'Priority' column
df.dropna(subset=['Priority'], inplace=True)

# -------------------------
# 2. Feature Extraction
# -------------------------
X = df[["Importance", "Effort", "Days_Left"]]
y = df["Priority"]

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -------------------------
# 3. Train-Test Split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42
)

# -------------------------
# 4. Train the Model
# -------------------------
# Adjusted hyperparameters to further reduce accuracy and overfitting
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,  # Increased from 2
    min_samples_leaf=3,   # Increased from 1
    random_state=42,
    bootstrap=True
)
model.fit(X_train, y_train)

# -------------------------
# 5. Evaluation
# -------------------------
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n✅ Accuracy on test data: {accuracy * 100:.2f}%\n")

print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# -------------------------
# 6. Save Model, Scaler & Encoder
# -------------------------
os.makedirs("model", exist_ok=True)
joblib.dump(model, "model/super_model.pkl")
joblib.dump(scaler, "model/super_scaler.pkl")
joblib.dump(label_encoder, "model/label_encoder.pkl")

print("\n✅ All files saved to 'model/' directory successfully.")


✅ Accuracy on test data: 100.00%

Classification Report:
               precision    recall  f1-score   support

        High       1.00      1.00      1.00        78
         Low       1.00      1.00      1.00        75
      Medium       1.00      1.00      1.00        78

    accuracy                           1.00       231
   macro avg       1.00      1.00      1.00       231
weighted avg       1.00      1.00      1.00       231

Confusion Matrix:
 [[78  0  0]
 [ 0 75  0]
 [ 0  0 78]]

✅ All files saved to 'model/' directory successfully.


In [11]:
# -------------------------
# 7. Load Model, Scaler & Encoder
# -------------------------
loaded_model = joblib.load("model/super_model.pkl")
loaded_scaler = joblib.load("model/super_scaler.pkl")
loaded_label_encoder = joblib.load("model/label_encoder.pkl")

print("✅ Model, scaler, and encoder loaded successfully.")

✅ Model, scaler, and encoder loaded successfully.


In [12]:
# -------------------------
# 8. Make a Prediction for a New Task
# -------------------------
# Example new task: Importance=4, Effort=7, Days_Left=2
new_task_features = np.array([[4, 7, 2]])

# Scale the new task features
new_task_features_scaled = loaded_scaler.transform(new_task_features)

# Predict the priority (encoded)
predicted_priority_encoded = loaded_model.predict(new_task_features_scaled)

# Decode the predicted priority
predicted_priority = loaded_label_encoder.inverse_transform(predicted_priority_encoded)

print(f"\nPredicted priority for the new task: {predicted_priority[0]}")


Predicted priority for the new task: High




# Task
Train and test the existing Random Forest model with new random values, aiming for an accuracy of around 95%.

## Generate synthetic data

### Subtask:
Create new data points with random values for 'Importance', 'Effort', and 'Days_Left', and assign corresponding 'Priority' labels.


**Reasoning**:
Generate new data points with random values for 'Importance', 'Effort', and 'Days_Left', create a DataFrame, and assign 'Priority' based on these values.



In [13]:
num_samples = 500
new_importance = np.random.randint(1, 6, num_samples)
new_effort = np.random.randint(1, 11, num_samples)
new_days_left = np.random.randint(1, 16, num_samples)

data = {
    'Importance': new_importance,
    'Effort': new_effort,
    'Days_Left': new_days_left
}

new_df = pd.DataFrame(data)

def assign_priority(row):
    score = (row['Importance'] * 3) + (row['Effort'] * 2) - row['Days_Left']
    if score > 15:
        return 'High'
    elif score > 8:
        return 'Medium'
    else:
        return 'Low'

new_df['Priority'] = new_df.apply(assign_priority, axis=1)

display(new_df.head())

Unnamed: 0,Importance,Effort,Days_Left,Priority
0,1,10,9,Medium
1,2,8,9,Medium
2,1,8,8,Medium
3,4,1,11,Low
4,5,8,14,High


## Combine original and synthetic data

### Subtask:
Merge the newly generated data with the original dataset.


**Reasoning**:
Concatenate the original and new dataframes and display the head and shape of the combined dataframe.



In [14]:
df_combined = pd.concat([df, new_df], ignore_index=True)
display(df_combined.head())
display(df_combined.shape)

Unnamed: 0,Task_Name,Importance,Effort,Days_Left,Priority
0,Write Proposal,4.0,6.0,1.0,High
1,Review Budget,3.0,4.0,5.0,Medium
2,Update Website,5.0,8.0,2.0,High
3,Schedule Meeting,2.0,1.0,7.0,Low
4,Analyze Data,4.0,5.0,3.0,Medium


(1654, 5)

## Preprocess the combined data

### Subtask:
Apply the same preprocessing steps (handling missing values, encoding, scaling) to the combined dataset as were applied to the original data.


**Reasoning**:
Apply the same preprocessing steps to the combined dataset as were applied to the original data, including handling missing values, encoding the target, and scaling the features using the pre-fitted objects.



In [15]:
# Drop rows with missing values in the 'Priority' column in the combined data
df_combined.dropna(subset=['Priority'], inplace=True)

# Extract features and target from combined data
X_combined = df_combined[["Importance", "Effort", "Days_Left"]]
y_combined = df_combined["Priority"]

# Encode target labels using the pre-fitted label_encoder
y_combined_encoded = loaded_label_encoder.transform(y_combined)

# Feature Scaling using the pre-fitted scaler
X_combined_scaled = loaded_scaler.transform(X_combined)

print("✅ Preprocessing applied to the combined dataset.")

✅ Preprocessing applied to the combined dataset.


## Train and evaluate the model

### Subtask:
Train the Random Forest model on the combined and preprocessed data and evaluate its accuracy.


**Reasoning**:
Train and evaluate the model on the combined dataset.



In [16]:
# Split combined data into training and testing sets
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    X_combined_scaled, y_combined_encoded, test_size=0.2, random_state=42
)

# Train the loaded model on the combined training data
loaded_model.fit(X_train_combined, y_train_combined)

# Make predictions on the combined test data
y_pred_combined = loaded_model.predict(X_test_combined)

# Calculate and print accuracy
accuracy_combined = accuracy_score(y_test_combined, y_pred_combined)
print(f"\n✅ Accuracy on combined test data: {accuracy_combined * 100:.2f}%\n")

# Print classification report
print("Classification Report (Combined Data):\n", classification_report(y_test_combined, y_pred_combined, target_names=loaded_label_encoder.classes_))

# Print confusion matrix
print("Confusion Matrix (Combined Data):\n", confusion_matrix(y_test_combined, y_pred_combined))


✅ Accuracy on combined test data: 96.68%

Classification Report (Combined Data):
               precision    recall  f1-score   support

        High       0.99      0.96      0.97       119
         Low       0.99      0.97      0.98       122
      Medium       0.91      0.98      0.94        90

    accuracy                           0.97       331
   macro avg       0.96      0.97      0.96       331
weighted avg       0.97      0.97      0.97       331

Confusion Matrix (Combined Data):
 [[114   0   5]
 [  0 118   4]
 [  1   1  88]]


## Adjust for desired accuracy (if needed)

### Subtask:
Since the accuracy is higher than the target of 95%, we need to adjust the data or model to bring the accuracy closer to the target. We will try reducing the number of synthetic samples to make the dataset more reflective of the original data distribution, which had perfect accuracy.


**Reasoning**:
Generate a smaller number of synthetic samples, combine with the original data, preprocess, split, train the loaded model, and evaluate its accuracy.



In [17]:
# 1. Generate a smaller number of synthetic samples (e.g., 100 instead of 500)
num_samples_smaller = 100
new_importance_smaller = np.random.randint(1, 6, num_samples_smaller)
new_effort_smaller = np.random.randint(1, 11, num_samples_smaller)
new_days_left_smaller = np.random.randint(1, 16, num_samples_smaller)

data_smaller = {
    'Importance': new_importance_smaller,
    'Effort': new_effort_smaller,
    'Days_Left': new_days_left_smaller
}

new_df_smaller = pd.DataFrame(data_smaller)

# 2. Assign 'Priority' labels to these new samples
def assign_priority(row):
    score = (row['Importance'] * 3) + (row['Effort'] * 2) - row['Days_Left']
    if score > 15:
        return 'High'
    elif score > 8:
        return 'Medium'
    else:
        return 'Low'

new_df_smaller['Priority'] = new_df_smaller.apply(assign_priority, axis=1)

# 3. Combine the original dataset (`df`) with this smaller set of new synthetic data.
df_combined_smaller = pd.concat([df, new_df_smaller], ignore_index=True)

# 4. Apply the same preprocessing steps
# Drop rows with missing values in the 'Priority' column
df_combined_smaller.dropna(subset=['Priority'], inplace=True)

# Extract features and target
X_combined_smaller = df_combined_smaller[["Importance", "Effort", "Days_Left"]]
y_combined_smaller = df_combined_smaller["Priority"]

# Encode target labels using the pre-fitted label_encoder
y_combined_smaller_encoded = loaded_label_encoder.transform(y_combined_smaller)

# Feature Scaling using the pre-fitted scaler
X_combined_smaller_scaled = loaded_scaler.transform(X_combined_smaller)

# 5. Split the preprocessed combined data into training and testing sets.
X_train_combined_smaller, X_test_combined_smaller, y_train_combined_smaller_encoded, y_test_combined_smaller_encoded = train_test_split(
    X_combined_smaller_scaled, y_combined_smaller_encoded, test_size=0.2, random_state=42
)

# 6. Train the `loaded_model` on the training set of this new combined data.
loaded_model.fit(X_train_combined_smaller, y_train_combined_smaller_encoded)

# 7. Evaluate the model's accuracy on the testing set and print the resulting accuracy.
y_pred_combined_smaller_encoded = loaded_model.predict(X_test_combined_smaller)
accuracy_combined_smaller = accuracy_score(y_test_combined_smaller_encoded, y_pred_combined_smaller_encoded)
print(f"\n✅ Accuracy on smaller combined test data: {accuracy_combined_smaller * 100:.2f}%\n")

# Print classification report and confusion matrix for the smaller combined data
print("Classification Report (Smaller Combined Data):\n", classification_report(y_test_combined_smaller_encoded, y_pred_combined_smaller_encoded, target_names=loaded_label_encoder.classes_))
print("Confusion Matrix (Smaller Combined Data):\n", confusion_matrix(y_test_combined_smaller_encoded, y_pred_combined_smaller_encoded))


✅ Accuracy on smaller combined test data: 98.80%

Classification Report (Smaller Combined Data):
               precision    recall  f1-score   support

        High       1.00      0.96      0.98        80
         Low       1.00      1.00      1.00        88
      Medium       0.97      1.00      0.98        83

    accuracy                           0.99       251
   macro avg       0.99      0.99      0.99       251
weighted avg       0.99      0.99      0.99       251

Confusion Matrix (Smaller Combined Data):
 [[77  0  3]
 [ 0 88  0]
 [ 0  0 83]]


## Summary:

### Data Analysis Key Findings

*   Training the Random Forest model on the combined dataset (original + 500 synthetic samples) resulted in a high accuracy of 96.68\% on the test data, exceeding the initial target of around 95\%.
*   Reducing the number of synthetic samples added to the original data from 500 to 100 still maintained a high model accuracy of 98.80\%, further surpassing the target accuracy and indicating that even a smaller amount of synthetic data contributes to strong performance.
*   The classification reports and confusion matrices for both combined datasets showed robust performance across all priority classes (High, Low, Medium), with minimal misclassifications.

### Insights or Next Steps

*   The model demonstrates excellent performance on both combined datasets, indicating its ability to generalize well.
*   To achieve an accuracy closer to exactly 95\%, further adjustments to the amount of synthetic data, the distribution of synthetic data, or model hyperparameters may be necessary.
