In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

In [2]:
df = pd.read_csv('novo_data_cancer.csv', sep = ';', encoding='utf-8')

In [3]:
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
X = df.drop('diagnosis', axis=1) #previsores 
y = df['diagnosis'].map({'B': 0, 'M': 1})

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 

In [6]:
X_train.shape

(398, 30)

In [7]:
X_test.shape

(171, 30)

In [8]:
y_train.shape

(398,)

In [9]:
y_test.shape

(171,)

In [10]:
#Criei uma pipeline com o XGBC 
pipeline = Pipeline([
    ('model', XGBClassifier(
        max_depth=2,          # ↓
        learning_rate=0.05,
        n_estimators=200,     # ↓
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=3,
        random_state=0,
        eval_metric='logloss'
    ))
])


In [21]:
#pipeline treinada
pipeline.fit(X_train, y_train)

In [12]:
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

print("Acurácia:", accuracy_score(y_test, y_pred))
print("\nMatriz de Confusão:")
print(confusion_matrix(y_test, y_pred))
print("\nRelatório:")
print(classification_report(y_test, y_pred))
print("\nROC AUC:", roc_auc_score(y_test, y_prob))

Acurácia: 0.9590643274853801

Matriz de Confusão:
[[105   3]
 [  4  59]]

Relatório:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       108
           1       0.95      0.94      0.94        63

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171


ROC AUC: 0.9963256907701352


In [14]:
# Previsões no treino
y_pred_train = pipeline.predict(X_train)
y_prob_train = pipeline.predict_proba(X_train)[:, 1]

print("Acurácia (treino):", accuracy_score(y_train, y_pred_train))
print("\nMatriz de Confusão (treino):")
print(confusion_matrix(y_train, y_pred_train))
print("\nRelatório (treino):")
print(classification_report(y_train, y_pred_train))
print("\nROC AUC (treino):", roc_auc_score(y_train, y_prob_train))

Acurácia (treino): 0.9899497487437185

Matriz de Confusão (treino):
[[249   0]
 [  4 145]]

Relatório (treino):
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       249
           1       1.00      0.97      0.99       149

    accuracy                           0.99       398
   macro avg       0.99      0.99      0.99       398
weighted avg       0.99      0.99      0.99       398


ROC AUC (treino): 0.9994339775208215


In [22]:
# Validação cruzada com 5 folds
scores = cross_val_score(
    pipeline,
    X,
    y,
    cv=5,
    scoring='accuracy'
)

print("ROC AUC médio:", scores.mean())
print("Acurácia em cada fold:", scores)
print("Acurácia média:", scores.mean())
print("Desvio padrão:", scores.std())

ROC AUC médio: 0.9683900015525537
Acurácia em cada fold: [0.94736842 0.96491228 0.98245614 0.96491228 0.98230088]
Acurácia média: 0.9683900015525537
Desvio padrão: 0.013095532590146609
