In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Breast Cancer Dataset

Based on Kaggle competition https://www.kaggle.com/competitions/breast-cancer-detection/overview

In [None]:
# Read the training CSV (path is relative to the notebook) into a DataFrame.
X_raw = pd.read_csv('../data/train.csv')

We've loaded the **train** dataset. Let's take a look at our data and see which columns are unnecessary.

In [4]:
# Count how many rows carry each diagnosis label to check the class balance.
X_raw['diagnosis'].value_counts()

diagnosis
B    242
M    156
Name: count, dtype: int64

The classes are mildly imbalanced (242 benign vs. 156 malignant, roughly 61% / 39%), but AdaBoost's design allows it to perform well despite this challenge.

In [5]:
# Summary statistics per column; the `count` row shows which columns actually hold values.
X_raw.describe()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,...,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,0.0
mean,29837340.0,14.183236,19.274171,92.345377,660.460553,0.09654,0.104395,0.090043,0.049586,0.181707,...,25.607663,107.907688,891.912563,0.132461,0.254167,0.276683,0.116187,0.291139,0.083802,
std,127847600.0,3.548961,4.119494,24.438949,351.641745,0.014136,0.051625,0.080718,0.039031,0.027905,...,5.947946,33.937742,577.586383,0.02198,0.148297,0.209022,0.064881,0.060838,0.016672,
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.1167,...,12.02,50.41,185.2,0.08125,0.03432,0.0,0.0,0.1565,0.05504,
25%,866684.0,11.7175,16.345,75.4675,421.275,0.086755,0.066712,0.03009,0.02071,0.161925,...,21.3925,84.43,516.425,0.117275,0.147325,0.117125,0.0656,0.252625,0.07231,
50%,905236.0,13.445,18.895,86.965,556.95,0.095825,0.09474,0.064905,0.03395,0.18005,...,25.3,99.165,700.65,0.13135,0.217,0.2389,0.10535,0.2828,0.07996,
75%,8810979.0,16.115,21.695,106.525,802.025,0.106075,0.1305,0.13205,0.074112,0.19655,...,29.425,127.1,1143.5,0.1459,0.342675,0.3864,0.166075,0.3206,0.092075,
max,911320500.0,27.42,33.81,186.9,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,49.54,251.2,4254.0,0.2006,0.9327,1.252,0.291,0.5774,0.1486,


All numerical columns except _**Unnamed: 32**_ have some values, so they can be used. _**Unnamed: 32**_ can be dropped because it contains only **NaN** values and is not even described in the dataset instructions. We also drop _**id**_, since it only identifies the record and should not be used as a feature.

In [6]:
# Remove the columns that are not used as features: 'Unnamed: 32' has no values
# at all (its count is 0) and 'id' is the record identifier.
# errors='ignore' keeps this cell idempotent: X_raw is overwritten here, so a
# second run would otherwise raise KeyError because the columns are already gone.
X_raw = X_raw.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

In [7]:
# Show the dtype of every remaining column.
X_raw.dtypes

radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst      

In [8]:
# List the distinct labels that occur in the diagnosis column.
X_raw['diagnosis'].unique()

array(['M', 'B'], dtype=object)

Now let's deal with the **diagnosis** column; it contains two unique values (**M** = malignant, **B** = benign). Since these labels have a natural order (*benign* is less dangerous than *malignant*), we can use `OrdinalEncoder`.

In [9]:
from sklearn.preprocessing import OrdinalEncoder

# Spell out the category order instead of relying on the encoder's default
# sorting of the labels, so the mapping B -> 0.0, M -> 1.0 is fixed by the code
# and an unexpected label raises an error instead of silently shifting the codes.
enc = OrdinalEncoder(categories=[['B', 'M']])
X_raw['diagnosis_e'] = enc.fit_transform(X_raw[['diagnosis']])

# Show the original labels next to their numeric codes.
X_raw[['diagnosis', 'diagnosis_e']].head()

Unnamed: 0,diagnosis,diagnosis_e
0,M,1.0
1,B,0.0
2,M,1.0
3,B,0.0
4,M,1.0


Now **B** -> `0` and **M** -> `1`

In [10]:
# Pairwise correlations between all float64/int64 columns, rounded to two
# decimals so the numbers printed inside the heatmap cells stay readable.
corr = (
    X_raw
    .select_dtypes(include=['float64', 'int64'])
    .corr()
    .round(2)
)
px.imshow(corr, text_auto=True).show()


In [11]:
# Target: the original string labels. Features: everything except the label
# column and its numeric encoding.
y_train = X_raw['diagnosis']
X_train = X_raw.drop(columns=['diagnosis', 'diagnosis_e'])

 Let's try **Random Forest** and **AdaBoost**.

In [12]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest, scored with 4-fold cross-validation.
clf = RandomForestClassifier(random_state=42, max_depth=2)
cv_results = cross_validate(estimator=clf, X=X_train, y=y_train, cv=4)
cv_results


{'fit_time': array([0.47401047, 0.48507547, 0.49155712, 0.49288893]),
 'score_time': array([0.02799296, 0.02995515, 0.03516912, 0.03998566]),
 'test_score': array([0.94      , 0.89      , 0.93939394, 0.96969697])}

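**GridSearchCV** helps us find the best parameters for our model among the candidates we list. However, because the winner is chosen on the same cross-validation folds that score it, the reported best score can be optimistic, i.e. the search itself can cause *overfitting*.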

Since the **AdaBoost** classifier gives us better results, we'll use it as our main model.

In [13]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate settings explored by the grid search. `estimator__max_depth`
# addresses the depth of the decision tree wrapped by AdaBoost; the tree's
# min_samples_split is not searched and stays at its default.
param_grid = {
    'estimator__max_depth': [None, 2, 3],
    'learning_rate': [0.1, 0.05],
    'n_estimators': [30, 50, 70],
}

# Hand-configured baseline: depth-2 trees boosted by AdaBoost.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
ada_clf = AdaBoostClassifier(
    estimator=tree_clf,
    n_estimators=50,
    learning_rate=0.5,
    random_state=42,
)

# Exhaustive search over param_grid, scored by accuracy with 4-fold CV.
grid_search = GridSearchCV(ada_clf, param_grid, cv=4, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Cross-validate the hand-configured ada_clf itself (not
# grid_search.best_estimator_) so both can be compared on the same 4 folds.
cv_results = cross_validate(ada_clf, X_train, y_train, cv=4)
cv_results

Best Parameters: {'estimator__max_depth': 2, 'learning_rate': 0.1, 'n_estimators': 70}
Best Score: 0.9548232323232324


{'fit_time': array([0.6495235 , 0.60178471, 0.55623507, 0.62149525]),
 'score_time': array([0.03965902, 0.03642607, 0.04096198, 0.03979897]),
 'test_score': array([0.97      , 0.92      , 0.95959596, 0.96969697])}

Now let's load the **test** dataset and make predictions on it for the submission.

In [None]:
# Keep the full test frame (its 'id' column is needed for the submission) and
# derive the feature matrix by removing the same two columns that were removed
# from the training data.
X_test_full = pd.read_csv('../data/test.csv')
X_test = X_test_full.drop(columns=['Unnamed: 32', 'id'])

Since the GridSearch result was slightly overfitted, we tuned the learning rate and the number of estimators ourselves, achieving an accuracy of about _**0.97**_, while the GridSearch best estimator gives _**0.953**_.

In [None]:
from pathlib import Path

# Fit the hand-configured AdaBoost model on the whole training set and predict
# a label for every row of the test set.
ada_clf.fit(X_train, y_train)
pred = ada_clf.predict(X_test)

# Pair each prediction with the id of the test row it belongs to.
submission = pd.DataFrame({
    'id': X_test_full['id'],
    'diagnosis': pred
})

# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing; otherwise a fresh checkout fails here.
submission_path = Path('../submissions/ada_submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)