# Explore here

# In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os

# Shared settings: one seed and one hold-out fraction reused by every step
# below so that repeated runs produce the same split and the same models.
RANDOM_STATE = 42
TEST_SIZE = 0.2

print("Starting Random Forest Diabetes Prediction Project...\n")

# --- Step 1: Loading the dataset ---
print("--- Step 1: Loading the dataset ---")
# The path is resolved against the current working directory. If the CSV lives
# in a 'data/processed/' folder, use 'data/processed/diabetes_processed.csv'.
file_path = 'diabetes_processed.csv'

# Keep the try body to the single call that can raise FileNotFoundError.
try:
    df_processed = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please ensure it's in the correct directory (e.g., in 'data/processed/').")
    print("Exiting. Please place your 'diabetes_processed.csv' file correctly.")
    # `exit()` is a convenience that the `site` module adds for interactive
    # sessions; it is not guaranteed to exist and, called without an argument,
    # ends the process with status 0. Raise SystemExit with a non-zero status
    # so the failure is visible to whatever launched the script.
    raise SystemExit(1)

# 'Outcome' is the prediction target; every other column is used as a feature.
X = df_processed.drop('Outcome', axis=1)
y = df_processed['Outcome']

# Hold out TEST_SIZE of the rows for evaluation. Stratifying on y keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)
print("Dataset loaded and split successfully.")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# Show a sample of the training features and the class balance as a sanity check.
print("\nX_train head:")
print(X_train.head())
print("\ny_train value counts:")
print(y_train.value_counts(normalize=True))

# --- Step 2: Build a random forest ---
print("\n--- Step 2: Building and Optimizing a Random Forest ---")

# --- 2a. Train a baseline Random Forest model ---
print("\n2a. Training a Baseline Random Forest Model:")
# Library-default hyperparameters, seeded for repeatability; n_jobs=-1 lets
# the forest use every available CPU core.
rf_baseline = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)
rf_baseline.fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred_baseline = rf_baseline.predict(X_test)
accuracy_baseline = accuracy_score(y_test, y_pred_baseline)
print(f"Baseline Random Forest Accuracy: {accuracy_baseline:.4f}")

# Print each report under its own heading, in the same order as before.
for heading, metric in (
    ("\nBaseline Classification Report:", classification_report),
    ("\nBaseline Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred_baseline))

# --- 2b. Experiment with hyperparameters and analyze impact ---
print("\n2b. Experimenting with Hyperparameters:")

# Sweep the number of trees while leaving every other setting at its default.
# NOTE(review): accuracy here is measured on the test split, so treat the
# curve as exploratory rather than as a basis for final model selection.
n_estimators_values = [50, 100, 200, 300, 400, 500]
accuracy_n_estimators = []

print("\nTesting different n_estimators values:")
for n_trees in n_estimators_values:
    print(f"  Training with n_estimators={n_trees}...")
    forest = RandomForestClassifier(
        n_estimators=n_trees,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, forest.predict(X_test))
    accuracy_n_estimators.append(test_accuracy)
    print(f"    Accuracy: {test_accuracy:.4f}")

# Draw accuracy against forest size, write the figure to disk, then display it.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_estimators_values, accuracy_n_estimators, marker='o')
ax.set_title('Random Forest Accuracy vs. Number of Estimators')
ax.set_xlabel('Number of Estimators (n_estimators)')
ax.set_ylabel('Accuracy')
ax.grid(True)
ax.set_xticks(n_estimators_values)
fig.savefig('rf_accuracy_n_estimators.png')
plt.show()
plt.close(fig)  # release the figure's memory once it has been shown

# Sweep the maximum tree depth with the forest size held at 200 trees so that
# depth is the only thing varying. None leaves the depth unlimited.
max_depth_values = [None, 5, 10, 15, 20, 25]
accuracy_max_depth = []

print("\nTesting different max_depth values:")
for depth_limit in max_depth_values:
    print(f"  Training with max_depth={depth_limit}...")
    forest = RandomForestClassifier(
        n_estimators=200,
        max_depth=depth_limit,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, forest.predict(X_test))
    accuracy_max_depth.append(test_accuracy)
    print(f"    Accuracy: {test_accuracy:.4f}")

# None cannot be placed on a numeric axis, so plot against list positions and
# label each tick with the depth it stands for (str(None) yields 'None').
tick_positions = list(range(len(max_depth_values)))
tick_labels = [str(depth) for depth in max_depth_values]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(tick_positions, accuracy_max_depth, marker='o')
ax.set_title('Random Forest Accuracy vs. Max Depth')
ax.set_xlabel('Max Depth (max_depth)')
ax.set_ylabel('Accuracy')
ax.grid(True)
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels)
fig.savefig('rf_accuracy_max_depth.png')
plt.show()
plt.close(fig)  # release the figure's memory once it has been shown

# --- 2c. More Systematic Hyperparameter Tuning with GridSearchCV ---
print("\n2c. More Systematic Hyperparameter Tuning with GridSearchCV:")
# Candidate values for the exhaustive search: 3 * 4 * 3 * 3 = 108 parameter
# combinations, each evaluated with 5-fold cross-validation.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Seeded estimator that the search clones for every candidate.
base_forest = RandomForestClassifier(random_state=RANDOM_STATE)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    scoring='accuracy',  # candidates are ranked by cross-validated accuracy
    cv=5,
    n_jobs=-1,           # run the fits in parallel on all cores
    verbose=2,           # print progress while fitting
)
grid_search.fit(X_train, y_train)

print(f"\nBest parameters found by GridSearchCV: {grid_search.best_params_}")
print(f"Best cross-validation accuracy on training set (GridSearchCV): {grid_search.best_score_:.4f}")

# The winning estimator, as exposed by the search object.
rf_best_model = grid_search.best_estimator_

# Final check on the held-out test split, which the grid search never saw.
y_pred_best = rf_best_model.predict(X_test)
final_accuracy_best_grid = accuracy_score(y_test, y_pred_best)
print(f"\nFinal Random Forest Accuracy on Test Set (with best GridSearchCV params): {final_accuracy_best_grid:.4f}")

for heading, metric in (
    ("\nFinal Classification Report (Best Model):", classification_report),
    ("\nFinal Confusion Matrix (Best Model):", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred_best))

# --- Step 3: Save the model ---
print("\n--- Step 3: Saving the Model ---")
# Make sure the output folder exists; an already-existing folder is not an error.
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

model_filename = os.path.join(models_dir, 'random_forest_diabetes_model.pkl')

# Saving is best-effort: any failure is reported on stdout and the script
# still runs through to its closing message.
try:
    with open(model_filename, 'wb') as model_file:
        pickle.dump(rf_best_model, model_file)
    print(f"Model successfully saved to: {model_filename}")
except Exception as save_error:
    print(f"Error saving model: {save_error}")

print("\nRandom Forest Diabetes Prediction Project Completed.")

# Recorded output of the last run of this cell:
# ModuleNotFoundError: No module named 'pandas'