Libraries

In [59]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split


Loading the Dataset

In [60]:
# Read the raw dataset from data.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="data.csv")

Pre-processing

In [61]:
# Drop 'project_id'; it's just an identifier.
# errors="ignore" keeps this cell idempotent: because `df` is reassigned here,
# re-running the cell would otherwise raise a KeyError once the column is gone.
df = df.drop(columns=["project_id"], errors="ignore")

In [62]:
# Pull out the target column first, then keep every remaining column as a feature
y = df["success_probability"]
X = df.drop(labels=["success_probability"], axis="columns")

In [63]:
# One-hot encode the categorical columns of X.
# drop_first=False keeps an indicator column for every category level.
X_encoded = pd.get_dummies(data=X, drop_first=False)

In [64]:
# Report how the encoding changed the feature matrix: shapes before/after,
# followed by the full list of encoded column names.
for label, value in (
    ("Original shape:", X.shape),
    ("Encoded shape:", X_encoded.shape),
    ("Encoded feature columns:", X_encoded.columns.tolist()),
):
    print(label, value)

Original shape: (500, 18)
Encoded shape: (500, 45)
Encoded feature columns: ['avg_dev_experience', 'pm_experience', 'legacy_system_involved', 'budget_estimation', 'project_complexity_High', 'project_complexity_Low', 'project_complexity_Medium', 'scope_clarity_Clear', 'scope_clarity_Medium', 'scope_clarity_Vague', 'urgency_level_High', 'urgency_level_Low', 'urgency_level_Medium', 'org_structure_type_Functional', 'org_structure_type_Matrix', 'org_structure_type_Projectized', 'client_priority_Cost', 'client_priority_Quality', 'client_priority_Time', 'team_sdlc_knowledge_High', 'team_sdlc_knowledge_Low', 'team_sdlc_knowledge_Medium', 'user_involvement_High', 'user_involvement_Low', 'user_involvement_Medium', 'tool_familiarity_High', 'tool_familiarity_Low', 'tool_familiarity_Medium', 'tech_stack_familiarity_High', 'tech_stack_familiarity_Low', 'tech_stack_familiarity_Medium', 'testing_strategy_Automated', 'testing_strategy_Manual', 'testing_strategy_Mixed', 'on_schedule_NO', 'on_schedule_YE

Train/Test Split + Model Training

In [65]:
# 1. Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [66]:
# 2. Tune a random forest with a randomized hyperparameter search

# Candidate values the search samples from for each hyperparameter
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Base estimator; the seed fixes the forest's own randomness
rf = RandomForestRegressor(random_state=42)

# Randomized search: 25 sampled combinations, each scored with 5-fold CV
# on negative mean squared error.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=25,  # Number of combinations to try
    cv=5,
    verbose=2,
    n_jobs=-1,  # run the fits in parallel on all available cores
    scoring='neg_mean_squared_error',
    # Seed the candidate sampling as well; without it a different set of 25
    # combinations is drawn on every run and the selected model is not
    # reproducible, even though the forest and the split are seeded.
    random_state=42,
)

# Fit the search on the training split only
random_search.fit(X_train, y_train)

# Best model (refit on the whole training split, as `refit` defaults to True)
best_rf = random_search.best_estimator_

print("✅ Best Parameters:")
print(random_search.best_params_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
✅ Best Parameters:
{'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}


In [67]:
# 3. Predict on the held-out test set using the refit best estimator
#    (this is the estimator that `random_search.predict` delegates to)
y_pred = random_search.best_estimator_.predict(X_test)

In [68]:
# 4. Score the test-set predictions: squared error, absolute error and R²
#    (an R² below 0 means the model does worse than always predicting the mean)
mse, mae, r2 = (
    mean_squared_error(y_true=y_test, y_pred=y_pred),
    mean_absolute_error(y_true=y_test, y_pred=y_pred),
    r2_score(y_true=y_test, y_pred=y_pred),
)

In [69]:
# Print the evaluation report, one metric per line, rounded to four decimals
print(
    "📊 Model Evaluation:",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

📊 Model Evaluation:
Mean Absolute Error (MAE): 0.3022
Mean Squared Error (MSE): 0.1104
R² Score: -0.0427


Save the Tuned Model

In [71]:
# Save the trained (tuned) model.
# joblib is imported in the notebook's import cell with the other libraries.
# Only the estimator is persisted: inputs at inference time must be encoded
# into the same columns as X_encoded before calling predict.
joblib.dump(best_rf, "model.pkl")

print("✅ Model saved as 'model.pkl'")

✅ Model saved as 'model.pkl'
