In [3]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('../data/data.csv')

# Data cleaning: drop the 'Unnamed: 32' column when the CSV contains it.
if 'Unnamed: 32' in df.columns:
    df = df.drop(columns='Unnamed: 32')
    print("Columna 'Unnamed: 32' eliminada.")

# Find numeric columns whose variance is exactly zero and drop them.
# (A NaN variance compares unequal to 0, so such columns are not selected.)
varianzas = df.select_dtypes(include=[np.number]).var()
cols_var_cero = [col for col, var in varianzas.items() if var == 0]
if not cols_var_cero:
    print("No se encontraron columnas con varianza cero.")
else:
    df = df.drop(columns=cols_var_cero)
    print(f"Columnas eliminadas por varianza cero: {cols_var_cero}")

# Map the target variable to numbers ('B' -> 0, 'M' -> 1).
diagnosis_codes = df['diagnosis'].map({'B': 0, 'M': 1})

# Series.map leaves NaN for every label that is not a key of the dict.
# Fail here with an explicit message instead of letting NaN targets reach
# model fitting, where the resulting error is much harder to trace back.
unmapped = diagnosis_codes.isna()
if unmapped.any():
    raise ValueError(
        "Valores inesperados en 'diagnosis': "
        f"{df.loc[unmapped, 'diagnosis'].unique().tolist()}"
    )
df['diagnosis'] = diagnosis_codes

# Separate features and target ('id' is dropped so it is not used as a feature).
X = df.drop(columns=['id', 'diagnosis'])
y = df['diagnosis']

# Split into train and test sets: 20% held out, fixed seed for reproducibility.
# NOTE(review): the split is not stratified; consider stratify=y so both sets
# keep the same class proportions (this would change the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the pipeline: standardize the features, then fit a random forest.
# The step names ('scaler', 'clf') are the prefixes used in the grid below.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(random_state=42)),
    ]
)

# Hyperparameter grid, addressed as <step name>__<parameter name>.
# 2 * 3 * 2 * 2 = 24 candidate combinations.
param_grid = dict(
    clf__n_estimators=[50, 100],
    clf__max_depth=[None, 10, 20],
    clf__min_samples_split=[2, 5],
    clf__min_samples_leaf=[1, 2],
)

# Exhaustive search with 5-fold cross-validation, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Results: best hyperparameter combination found by the grid search.
print(f"Mejores hiperparámetros: {grid_search.best_params_}")

# Evaluate on the held-out test set.
y_pred = grid_search.predict(X_test)
print("Reporte de clasificación:\n", classification_report(y_test, y_pred))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred))

# Save the model (the best pipeline found by the search).
model_path = '../model/breast_cancer_model.pkl'
# joblib.dump does not create missing directories; make sure the target
# folder exists so the search result is not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(grid_search.best_estimator_, model_path)
print(f"Modelo guardado exitosamente en '{model_path}'")


Columna 'Unnamed: 32' eliminada.
No se encontraron columnas con varianza cero.
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Mejores hiperparámetros: {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 5, 'clf__n_estimators': 100}
Reporte de clasificación:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Matriz de confusión:
 [[70  1]
 [ 3 40]]
Modelo guardado exitosamente en '../model/breast_cancer_model.pkl'


In [4]:
# Show the feature column names of the training set, in order.
print(list(X_train.columns))


['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']
