In [None]:

def load_breast_cancer_data(file_path):
    """Load the breast cancer Wisconsin dataset from a CSV file.

    Args:
        file_path (str): Path of the CSV file to read.

    Returns:
        pd.DataFrame or None: The parsed table, or ``None`` when the file
        does not exist or could not be read. In both failure cases a
        message is printed instead of raising.
    """
    # Imported locally so the function works even when the surrounding
    # cell/module has not imported these names itself.
    import os

    import pandas as pd

    if not os.path.exists(file_path):
        print(f"Error: Data file not found at {file_path}")
        return None
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        # Deliberate best-effort boundary: any read/parse failure is
        # reported and signalled to the caller as None.
        print(f"Error loading data from {file_path}: {e}")
        return None

# Demo entry point: load the dataset from a fixed location and preview it.
if __name__ == '__main__':
    # Example usage with a hard-coded absolute Windows path; this only
    # works on a machine where exactly this file exists.
    file_path_example = r"C:\Users\SABRINA PEREZ\anaconda3\Porgramacion-2\data\breast-cancer-wisconsin.data.csv"
    df = load_breast_cancer_data(file_path_example)
    # The loader returns None on failure (and prints its own error message),
    # so the preview is shown only when loading succeeded.
    if df is not None:
        print("Data loaded successfully (from example with specific path).")
        print(df.head())

In [None]:
# preprocessing.py
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocess_module(df):
    """
    Print an exploratory summary of ``df`` and return standardized features.

    The exploration step prints ``info()``, ``describe()``, the value counts
    of every column and the per-column null counts. The preprocessing step
    drops the ``id`` and ``Unnamed: 32`` columns when they are present,
    label-encodes ``diagnosis`` when that column exists, and standardizes
    all remaining columns with ``StandardScaler``.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        tuple: ``(X_scaled, y)``. ``X_scaled`` is the scaler output for the
               feature columns; ``y`` is the encoded ``diagnosis`` column,
               or ``None`` when the input has no ``diagnosis`` column.
    """
    # ---- Exploration: purely informational output ----
    print("--- Data Exploration ---")
    print("Data Info:")
    df.info()
    print("\nData Describe:")
    print(df.describe())
    print("\nData Value Counts for Each Column:")
    for column_name in df.columns:
        print(f"\nColumn: {column_name}")
        print(df[column_name].value_counts())
    print("\nNull Value Counts:")
    null_counts = df.isnull().sum()
    print(null_counts)
    print("\n--- End of Data Exploration ---")

    # ---- Preprocessing ----
    print("\n--- Data Preprocessing ---")
    # drop() returns a new frame, so the caller's DataFrame is not modified.
    cleaned = df.drop(columns=["id", "Unnamed: 32"], errors='ignore')

    if 'diagnosis' not in cleaned.columns:
        print("Warning: 'diagnosis' column not found for label encoding.")
        features, y = cleaned, None
    else:
        print("Encoding 'diagnosis' column.")
        encoder = LabelEncoder()
        cleaned["diagnosis"] = encoder.fit_transform(cleaned["diagnosis"])
        features = cleaned.drop(columns=["diagnosis"])
        y = cleaned["diagnosis"]

    print("Normalizing features.")
    X_scaled = StandardScaler().fit_transform(features)
    print("--- End of Data Preprocessing ---")

    return X_scaled, y

# Demo entry point: load the CSV, run preprocess_module and preview the result.
if __name__ == '__main__':
    # NOTE(review): the loader is imported from a module named
    # `module_load_data` — confirm this matches the actual file name.
    from module_load_data import load_breast_cancer_data
    import os

    # The CSV is looked up in the same directory as this file (no 'data'
    # sub-folder is joined into the path).
    # NOTE(review): this relies on `__file__` being defined — confirm for the
    # environment this is run in (e.g. a notebook cell).
    file_path_example = os.path.join(os.path.dirname(__file__), 'breast-cancer-wisconsin.data.csv')

    df = load_breast_cancer_data(file_path_example)
    if df is not None:
        X_scaled, y = preprocess_module(df)
        print("\nProcessed Data (first 5 rows of scaled features):")
        print(X_scaled[:5])
        # y is None when the data had no 'diagnosis' column.
        if y is not None:
            print("\nTarget variable (first 5 values):")
            print(y[:5])
        else:
            print("\nNo target variable found after preprocessing.")
    else:
        print(f"Could not load data for preprocessing example from: {file_path_example}")

In [None]:
# model_training.py
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from datetime import datetime

def train_model_module(X, y, model_name="random_forest", test_size=0.2, random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3, n_neighbors=5, solver='lbfgs', max_iter=100, kernel='rbf', C=1.0, cv_folds=5):
    """
    Trains a specified machine learning model with cross-validation and returns the trained model and test sets.

    The data is split once into a stratified train/test partition. The
    selected model is fitted on the training split, and stratified k-fold
    cross-validation accuracy is then reported on that same training split;
    the test split is only returned, never used here.

    Args:
        X (pd.DataFrame or np.ndarray): Features.
        y (pd.Series or np.ndarray): Target variable. Must not be None.
        model_name (str): The name of the model to train ('random_forest', 'gradient_boosting', 'logistic_regression', 'knn', 'decision_tree', 'svm').
        test_size (float): Proportion of the data to use for the test set.
        random_state (int): Seed for random number generation.
        n_estimators (int): Number of trees in the Random Forest or Gradient Boosting.
        learning_rate (float): Learning rate for Gradient Boosting.
        max_depth (int): Maximum depth of the Decision Tree or Gradient Boosting.
        n_neighbors (int): Number of neighbors for KNN.
        solver (str): Solver to use for Logistic Regression.
        max_iter (int): Maximum number of iterations for Logistic Regression or SVM
            (the same value, default 100, is passed to both).
        kernel (str): Kernel type for SVM.
        C (float): Regularization parameter for Logistic Regression and SVM.
        cv_folds (int): Number of cross-validation folds.

    Returns:
        tuple: (trained model, X_test, y_test)

    Raises:
        ValueError: If ``y`` is None or ``model_name`` is not one of the
            supported names. Both checks run before any data is split.
    """
    print(f"--- Model Training: {model_name} ---")

    # A missing target cannot be stratified on or fitted against; fail early
    # with a clear message instead of an unrelated error further down.
    if y is None:
        raise ValueError("Target variable y is required for training, but got None.")

    # Lazy builders: each returns (unfitted estimator, log message). Only the
    # selected one is ever called.
    builders = {
        "random_forest": lambda: (
            RandomForestClassifier(n_estimators=n_estimators, random_state=random_state),
            f"Training RandomForest with {n_estimators} estimators.",
        ),
        "gradient_boosting": lambda: (
            GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=random_state),
            f"Training GradientBoosting with {n_estimators} estimators, learning rate={learning_rate}, max depth={max_depth}.",
        ),
        "logistic_regression": lambda: (
            LogisticRegression(solver=solver, max_iter=max_iter, random_state=random_state, C=C),
            f"Training Logistic Regression with solver='{solver}', max iterations={max_iter}, C={C}.",
        ),
        "knn": lambda: (
            KNeighborsClassifier(n_neighbors=n_neighbors),
            f"Training KNN with {n_neighbors} neighbors.",
        ),
        "decision_tree": lambda: (
            DecisionTreeClassifier(max_depth=max_depth, random_state=random_state),
            f"Training Decision Tree with max depth={max_depth}.",
        ),
        "svm": lambda: (
            # probability=True so the fitted SVC exposes predict_proba (ROC curve).
            SVC(kernel=kernel, C=C, max_iter=max_iter, random_state=random_state, probability=True),
            f"Training SVM with kernel='{kernel}', C={C}, max iterations={max_iter}.",
        ),
    }

    # Validate the model name before doing any work on the data.
    build = builders.get(model_name) if isinstance(model_name, str) else None
    if build is None:
        raise ValueError(f"Unsupported model name: {model_name}")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)

    model, training_message = build()
    print(training_message)

    model.fit(X_train, y_train)
    print("Model trained.")

    print(f"Performing {cv_folds}-fold stratified cross-validation.")
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    print(f"Cross-validation accuracy scores: {cv_scores}")
    print(f"Mean cross-validation accuracy: {cv_scores.mean():.2f}")
    print(f"--- End of Model Training: {model_name} ---")

    return model, X_test, y_test

# Script entry point for the training cell.
# NOTE(review): only imports are present here; no example run follows them in
# this cell — confirm whether the demo code was lost or intentionally removed.
if __name__ == '__main__':
    from module_load_data import load_breast_cancer_data
    from module_preprocessing import preprocess_module
    import os

 

In [None]:
# evaluation.py
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model_module(model, X_test, y_test, model_name="trained_model"):
    """
    Report test-set metrics for a fitted classifier and save its plots.

    Prints precision, recall, F1-score and the confusion matrix, then saves
    the confusion-matrix heatmap as ``confusion_matrix_<model_name>.png``.
    When the model exposes ``predict_proba``, the ROC curve is additionally
    saved as ``roc_curve_<model_name>.png`` and its AUC is printed;
    otherwise a notice is printed and the ROC step is skipped.

    Args:
        model: Trained machine learning model with a ``predict`` method
            (``predict_proba`` is optional and only used for the ROC curve).
        X_test (pd.DataFrame or np.ndarray): Test features.
        y_test (pd.Series or np.ndarray): True labels for the test set.
        model_name (str): Name used in log lines, plot titles and file names.
    """
    print(f"--- Model Evaluation: {model_name} ---")
    predicted = model.predict(X_test)

    precision_value = precision_score(y_test, predicted)
    recall_value = recall_score(y_test, predicted)
    f1_value = f1_score(y_test, predicted)
    matrix = confusion_matrix(y_test, predicted)

    print(f"Precision: {precision_value:.2f}")
    print(f"Recall: {recall_value:.2f}")
    print(f"F1-Score: {f1_value:.2f}")
    print("\nConfusion Matrix:")
    print(matrix)

    # Confusion-matrix heatmap; tick labels are fixed to the two class names.
    class_names = ['Benign', 'Malignant']
    matrix_file = f"confusion_matrix_{model_name}.png"
    plt.figure(figsize=(6, 5))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.savefig(matrix_file)
    plt.close()
    print(f"\nConfusion matrix saved as {matrix_file}")

    # The ROC curve needs class probabilities, which not every model provides.
    if not hasattr(model, "predict_proba"):
        print(f"\n{model_name} does not have predict_proba method, ROC curve cannot be calculated.")
    else:
        # Column 1 holds the probability of the second class.
        positive_scores = model.predict_proba(X_test)[:, 1]
        false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
        roc_auc = auc(false_pos_rate, true_pos_rate)

        roc_file = f"roc_curve_{model_name}.png"
        plt.figure(figsize=(8, 6))
        plt.plot(false_pos_rate, true_pos_rate, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
        # Diagonal reference line.
        plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
        plt.xlabel('False Positive Rate (FPR)')
        plt.ylabel('True Positive Rate (TPR)')
        plt.title(f'ROC Curve - {model_name}')
        plt.legend(loc='lower right')
        plt.grid()
        plt.savefig(roc_file)
        plt.close()
        print(f"ROC curve saved as {roc_file}")
        print(f"ROC AUC: {roc_auc:.2f}")

    print(f"--- End of Model Evaluation: {model_name} ---")

# Script entry point for the evaluation cell.
# NOTE(review): only imports are present here; no example run follows them in
# this cell — confirm whether the demo code was lost or intentionally removed.
if __name__ == '__main__':
    from module_load_data import load_breast_cancer_data
    from module_preprocessing import preprocess_module
    import os
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    