### Voronyi-Stepan-Camp-2025
### Breast Cancer Dataset

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset that ships with scikit-learn and pull out
# the feature matrix, target vector, and the human-readable names.
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
labels = cancer.target_names
features = cancer.feature_names
print('labels:', labels)
print('features:', features)

# Hold out a test partition; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


labels: ['malignant' 'benign']
features: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


### Decision Tree

In [5]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# Fit a single decision tree that splits on entropy; seeded for repeatability.
clf = DecisionTreeClassifier(random_state=42, criterion='entropy')
clf.fit(X_train, y_train)

# Report accuracy on the training and held-out partitions.
print(f"train accuracy = {clf.score(X_train, y_train):.3%}")
print(f"test accuracy = {clf.score(X_test, y_test):.3%}")

# Export the fitted tree as DOT text (out_file=None returns a string),
# labelling nodes with the dataset's feature and class names.
dot_data = export_graphviz(
    clf,
    out_file=None,
    class_names=labels,
    feature_names=features,
    rounded=True,
    filled=True,
    special_characters=True,
)

# Render the DOT source and open it in the system viewer; cleanup=True removes
# the intermediate source file after rendering. The call's return value is the
# cell output.
graph = graphviz.Source(dot_data)
graph.view(cleanup=True)


train accuracy = 100.000%
test accuracy = 94.406%


'Source.gv.pdf'

### Random Forest

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
def create_rf_pipeline(random_state=42):
    """Build an unfitted single-step pipeline wrapping a random forest.

    The forest is seeded so that re-running the notebook produces the same
    model and the same accuracy figures, consistent with the other seeded
    estimators in this notebook.

    Parameters
    ----------
    random_state : int or None, default=42
        Seed forwarded to ``RandomForestClassifier``. Pass ``None`` to get
        the previous unseeded (non-reproducible) behaviour.

    Returns
    -------
    Pipeline
        Pipeline whose only step, named ``'rf_classifier'``, is the forest.
    """
    forest = RandomForestClassifier(random_state=random_state)
    return Pipeline([('rf_classifier', forest)])
# Build the random-forest pipeline, fit it on the training partition,
# then report accuracy on both partitions.
rf_pipeline = create_rf_pipeline()
rf_pipeline.fit(X_train, y_train)
print(f"train accuracy= {rf_pipeline.score(X_train, y_train):.3%}")
print(f"test accuracy= {rf_pipeline.score(X_test, y_test):.3%}")


train accuracy= 100.000%
test accuracy= 97.902%


### Gradient Boosting Decision Trees (GBDT)

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
def create_gbdt_pipeline():
    """Return an unfitted pipeline containing one gradient-boosting step.

    The classifier is configured with 150 estimators, a learning rate of
    0.1, trees of maximum depth 3, and a fixed seed of 42. The step is
    registered under the name ``'gbdt'``.
    """
    steps = [
        (
            'gbdt',
            GradientBoostingClassifier(
                n_estimators=150,
                learning_rate=0.1,
                max_depth=3,
                random_state=42,
            ),
        ),
    ]
    return Pipeline(steps)
# Build the gradient-boosting pipeline, fit it on the training partition,
# then report accuracy on both partitions.
gbdt_pipeline = create_gbdt_pipeline()
gbdt_pipeline.fit(X_train, y_train)
print(f"Train accuracy= {gbdt_pipeline.score(X_train, y_train):.3%}")
print(f"Test accuracy= {gbdt_pipeline.score(X_test, y_test):.3%}")


Train accuracy= 100.000%
Test accuracy= 95.804%


### XGBoost

In [12]:
from xgboost import XGBClassifier
def build_and_train_xgb(X_train, y_train):
    """Create an XGBoost classifier and fit it on the supplied training data.

    Parameters
    ----------
    X_train
        Training feature matrix passed straight to ``fit``.
    y_train
        Training targets passed straight to ``fit``.

    Returns
    -------
    XGBClassifier
        The classifier after ``fit`` has been called, configured with the
        ``'logloss'`` evaluation metric and a fixed seed of 1.
    """
    xgb_params = {'eval_metric': 'logloss', 'random_state': 1}
    booster = XGBClassifier(**xgb_params)
    booster.fit(X_train, y_train)
    return booster
# Train the XGBoost model and report accuracy on both partitions.
# Note: this rebinds `clf`, the same name the Decision Tree cell assigns.
clf = build_and_train_xgb(X_train, y_train)
print(f"Train accuracy= {clf.score(X_train, y_train):.3%}")
print(f"Test accuracy= {clf.score(X_test, y_test):.3%}")



Train accuracy= 100.000%
Test accuracy= 98.601%
