In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.preprocessing import (
    StandardScaler,
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder
)
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

In [2]:
dataset_path = './Content/breast_cancer.csv'
df = pd.read_csv(
    dataset_path,
    names=[
        'age',
        'meonpause',
        'tumor_size',
        'inv_nodes',
        'node_caps',
        'deg_malig',
        'breast',
        'breast_quad',
        'irradiat',
        'label'
        ]
)
df

Unnamed: 0,age,meonpause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat,label
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'
...,...,...,...,...,...,...,...,...,...,...
281,'50-59','ge40','30-34','6-8','yes','2','left','left_low','no','no-recurrence-events'
282,'50-59','premeno','25-29','3-5','yes','2','left','left_low','yes','no-recurrence-events'
283,'30-39','premeno','30-34','6-8','yes','2','right','right_up','no','no-recurrence-events'
284,'50-59','premeno','15-19','0-2','no','2','right','left_low','no','no-recurrence-events'


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   meonpause    286 non-null    object
 2   tumor_size   286 non-null    object
 3   inv_nodes    286 non-null    object
 4   node_caps    278 non-null    object
 5   deg_malig    286 non-null    object
 6   breast       286 non-null    object
 7   breast_quad  285 non-null    object
 8   irradiat     286 non-null    object
 9   label        286 non-null    object
dtypes: object(10)
memory usage: 22.5+ KB


In [4]:
df.describe()

Unnamed: 0,age,meonpause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat,label
count,286,286,286,286,278,286,286,285,286,286
unique,6,3,11,7,2,3,2,5,2,2
top,'50-59','premeno','30-34','0-2','no','2','left','left_low','no','no-recurrence-events'
freq,96,150,60,213,222,130,152,110,218,201


In [6]:
df.head(10)

Unnamed: 0,age,meonpause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat,label
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'
5,'50-59','premeno','25-29','3-5','no','2','right','left_up','yes','no-recurrence-events'
6,'50-59','ge40','40-44','0-2','no','3','left','left_up','no','no-recurrence-events'
7,'40-49','premeno','10-14','0-2','no','2','left','left_up','no','no-recurrence-events'
8,'40-49','premeno','0-4','0-2','no','2','right','right_low','no','no-recurrence-events'
9,'40-49','ge40','40-44','15-17','yes','2','right','left_up','yes','no-recurrence-events'


In [7]:
df['node_caps'] = df['node_caps'].fillna(
    df['node_caps'].mode()[0]
)

df['breast_quad'] = df['breast_quad'].fillna(
    df['breast_quad'].mode()[0]
)

In [8]:
df.describe()

Unnamed: 0,age,meonpause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat,label
count,286,286,286,286,286,286,286,286,286,286
unique,6,3,11,7,2,3,2,5,2,2
top,'50-59','premeno','30-34','0-2','no','2','left','left_low','no','no-recurrence-events'
freq,96,150,60,213,230,130,152,111,218,201


In [19]:
for col_name in df.columns:
    n_uniques = df[col_name].nunique()
    unique_values = df[col_name].unique()
    print(f'Unique values in {col_name}: {n_uniques} -> {unique_values}')

Unique values in age: 6 -> ["'40-49'" "'50-59'" "'60-69'" "'30-39'" "'70-79'" "'20-29'"]
Unique values in meonpause: 3 -> ["'premeno'" "'ge40'" "'lt40'"]
Unique values in tumor_size: 11 -> ["'15-19'" "'35-39'" "'30-34'" "'25-29'" "'40-44'" "'10-14'" "'0-4'"
 "'20-24'" "'45-49'" "'50-54'" "'5-9'"]
Unique values in inv_nodes: 7 -> ["'0-2'" "'3-5'" "'15-17'" "'6-8'" "'9-11'" "'24-26'" "'12-14'"]
Unique values in node_caps: 2 -> ["'yes'" "'no'"]
Unique values in deg_malig: 3 -> ["'3'" "'1'" "'2'"]
Unique values in breast: 2 -> ["'right'" "'left'"]
Unique values in breast_quad: 5 -> ["'left_up'" "'central'" "'left_low'" "'right_up'" "'right_low'"]
Unique values in irradiat: 2 -> ["'no'" "'yes'"]
Unique values in label: 2 -> ["'recurrence-events'" "'no-recurrence-events'"]


In [21]:
non_rank_features = [
    'meonpause',
    'node_caps',
    'breast',
    'breast_quad',
    'irradiat'
]

rank_features = [
    'age',
    'tumor_size',
    'inv_nodes',
    'deg_malig'
]

y = df['label']
X = df.drop('label', axis=1)

In [22]:
X

Unnamed: 0,age,meonpause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no'
...,...,...,...,...,...,...,...,...,...
281,'50-59','ge40','30-34','6-8','yes','2','left','left_low','no'
282,'50-59','premeno','25-29','3-5','yes','2','left','left_low','yes'
283,'30-39','premeno','30-34','6-8','yes','2','right','right_up','no'
284,'50-59','premeno','15-19','0-2','no','2','right','left_low','no'


In [23]:
y

0         'recurrence-events'
1      'no-recurrence-events'
2         'recurrence-events'
3      'no-recurrence-events'
4         'recurrence-events'
                ...          
281    'no-recurrence-events'
282    'no-recurrence-events'
283    'no-recurrence-events'
284    'no-recurrence-events'
285    'no-recurrence-events'
Name: label, Length: 286, dtype: object

In [24]:
transformer = ColumnTransformer(
    transformers=[
        (
            'OneHot',
            OneHotEncoder(drop='first'),
            non_rank_features
        ),
        (
            'Ordinal',
            OrdinalEncoder(),
            rank_features
        )
    ],
    remainder='passthrough'
)
X_transformed = transformer.fit_transform(X)

In [25]:
X_transformed

array([[0., 1., 1., ..., 2., 0., 2.],
       [0., 0., 0., ..., 2., 0., 0.],
       [0., 0., 0., ..., 6., 0., 1.],
       ...,
       [0., 1., 1., ..., 5., 5., 1.],
       [0., 1., 0., ..., 2., 0., 1.],
       [0., 0., 0., ..., 7., 0., 2.]], shape=(286, 13))

In [27]:
onehot_features = transformer.named_transformers_['OneHot'].get_feature_names_out(non_rank_features)
all_features = onehot_features.tolist() + rank_features

X_encoded = pd.DataFrame(
    X_transformed,
    columns=all_features
)

In [28]:
X_encoded

Unnamed: 0,meonpause_'lt40',meonpause_'premeno',node_caps_'yes',breast_'right',breast_quad_'left_low',breast_quad_'left_up',breast_quad_'right_low',breast_quad_'right_up',irradiat_'yes',age,tumor_size,inv_nodes,deg_malig
0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,6.0,0.0,1.0
3,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,6.0,0.0,2.0
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,5.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,5.0,5.0,1.0
282,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,4.0,4.0,1.0
283,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,5.0,5.0,1.0
284,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0


In [29]:
normalizer = StandardScaler()
X_normalized = normalizer.fit_transform(X_encoded)

In [30]:
y_encoded = LabelEncoder().fit_transform(y)

In [31]:
y_encoded

array([1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [32]:
val_size = 0.2
test_size = 0.125
random_state = 0
is_shuffle = True

X_train, X_val, y_train, y_val = train_test_split(
    X_normalized, y_encoded,
    test_size=val_size,
    shuffle=is_shuffle,
    random_state=random_state,
)

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=test_size,
    shuffle=is_shuffle,
    random_state=random_state,
)

In [33]:
print(f'Number of training samples: {X_train.shape[0]}')
print(f'Number of validation samples: {X_val.shape[0]}')
print(f'Number of test samples: {X_test.shape[0]}')

Number of training samples: 199
Number of validation samples: 58
Number of test samples: 29


In [34]:
classifier = SVC(
    random_state=random_state
)
classifier.fit(X_train, y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [35]:
y_val_pred = classifier.predict(X_val)
y_test_pred = classifier.predict(X_test)
val_scores = accuracy_score(y_val, y_val_pred)
test_scores = accuracy_score(y_test, y_test_pred)

print(f'Evaluation results on validation and test set:')
print(f'Val Accuracy: {val_scores}')
print(f'Test Accuracy: {test_scores}')

Evaluation results on validation and test set:
Val Accuracy: 0.7241379310344828
Test Accuracy: 0.7586206896551724
