## 0. Download dataset
**Note:** If you can't download using gdown due to limited number of downloads, please download it manually and upload it to your drive, then copy it from the drive to colab.
```python
from google.colab import drive

drive.mount('/content/drive')
!cp /path/to/dataset/on/your/drive .
```

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1. Import libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.preprocessing import (
    StandardScaler,
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder
)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 2. Load dataset

In [3]:
dataset_path = '/content/drive/Othercomputers/My Laptop/Advanced/Exercises/230927 - Support Vector Machine Solution - TA_Thang/breast-cancer.csv'
df = pd.read_csv(
    dataset_path,
    names=[
        'age',
        'meonpause',
        'tumor-size',
        'inv-nodes',
        'node-caps',
        'deg-malig',
        'breast',
        'breast-quad',
        'irradiat',
        'label'
    ]
)
df

Unnamed: 0,age,meonpause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,label
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'
...,...,...,...,...,...,...,...,...,...,...
281,'50-59','ge40','30-34','6-8','yes','2','left','left_low','no','no-recurrence-events'
282,'50-59','premeno','25-29','3-5','yes','2','left','left_low','yes','no-recurrence-events'
283,'30-39','premeno','30-34','6-8','yes','2','right','right_up','no','no-recurrence-events'
284,'50-59','premeno','15-19','0-2','no','2','right','left_low','no','no-recurrence-events'


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   meonpause    286 non-null    object
 2   tumor-size   286 non-null    object
 3   inv-nodes    286 non-null    object
 4   node-caps    278 non-null    object
 5   deg-malig    286 non-null    object
 6   breast       286 non-null    object
 7   breast-quad  285 non-null    object
 8   irradiat     286 non-null    object
 9   label        286 non-null    object
dtypes: object(10)
memory usage: 22.5+ KB


In [5]:
df.describe()

Unnamed: 0,age,meonpause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,label
count,286,286,286,286,278,286,286,285,286,286
unique,6,3,11,7,2,3,2,5,2,2
top,'50-59','premeno','30-34','0-2','no','2','left','left_low','no','no-recurrence-events'
freq,96,150,60,213,222,130,152,110,218,201


## 3. Preprocess dataset

### 3.1. Filling missing values

In [6]:
df['node-caps'] = df['node-caps'].fillna(df['node-caps'].mode()[0])
df['breast-quad'] = df['breast-quad'].fillna(df['breast-quad'].mode()[0])

In [7]:
df.describe()

Unnamed: 0,age,meonpause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,label
count,286,286,286,286,286,286,286,286,286,286
unique,6,3,11,7,2,3,2,5,2,2
top,'50-59','premeno','30-34','0-2','no','2','left','left_low','no','no-recurrence-events'
freq,96,150,60,213,230,130,152,111,218,201


### 3.2. Encode categorical features

In [8]:
for col_name in df.columns:
    n_uniques = df[col_name].unique()
    print(f'Unique values in {col_name}: {n_uniques}')

Unique values in age: ["'40-49'" "'50-59'" "'60-69'" "'30-39'" "'70-79'" "'20-29'"]
Unique values in meonpause: ["'premeno'" "'ge40'" "'lt40'"]
Unique values in tumor-size: ["'15-19'" "'35-39'" "'30-34'" "'25-29'" "'40-44'" "'10-14'" "'0-4'"
 "'20-24'" "'45-49'" "'50-54'" "'5-9'"]
Unique values in inv-nodes: ["'0-2'" "'3-5'" "'15-17'" "'6-8'" "'9-11'" "'24-26'" "'12-14'"]
Unique values in node-caps: ["'yes'" "'no'"]
Unique values in deg-malig: ["'3'" "'1'" "'2'"]
Unique values in breast: ["'right'" "'left'"]
Unique values in breast-quad: ["'left_up'" "'central'" "'left_low'" "'right_up'" "'right_low'"]
Unique values in irradiat: ["'no'" "'yes'"]
Unique values in label: ["'recurrence-events'" "'no-recurrence-events'"]


In [9]:
non_rank_features = ['meonpause', 'node-caps', 'breast', 'breast-quad', 'irradiat']
rank_features = ['age', 'tumor-size', 'inv-nodes', 'deg-malig']

y = df['label']
X = df.drop('label', axis=1)

In [10]:
transformer = ColumnTransformer(
    transformers=[
        ("OneHot", OneHotEncoder(drop='first'), non_rank_features),
        ("Ordinal", OrdinalEncoder(), rank_features)
    ],
    remainder='passthrough'
)
X_transformed = transformer.fit_transform(X)

onehot_features = transformer.named_transformers_['OneHot'].get_feature_names_out(non_rank_features)
all_features = onehot_features.tolist() + rank_features

X_encoded = pd.DataFrame(
    X_transformed,
    columns=all_features
)

In [11]:
X_encoded

Unnamed: 0,meonpause_'lt40',meonpause_'premeno',node-caps_'yes',breast_'right',breast-quad_'left_low',breast-quad_'left_up',breast-quad_'right_low',breast-quad_'right_up',irradiat_'yes',age,tumor-size,inv-nodes,deg-malig
0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,6.0,0.0,1.0
3,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,6.0,0.0,2.0
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,5.0,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,5.0,5.0,1.0
282,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,4.0,4.0,1.0
283,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,5.0,5.0,1.0
284,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0


### 3.3. Encode label

In [12]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

### 3.4. Normalization

In [13]:
normalizer = StandardScaler()
X_normalized = normalizer.fit_transform(X_encoded)

## 4. Train test split

In [14]:
test_size = 0.3
random_state = 1
is_shuffle = True
X_train, X_val, y_train, y_val = train_test_split(
    X_normalized, y_encoded,
    test_size=test_size,
    random_state=random_state,
    shuffle=is_shuffle
)

In [15]:
print(f'Number of training samples: {X_train.shape[0]}')
print(f'Number of val samples: {X_val.shape[0]}')

Number of training samples: 200
Number of val samples: 86


## 5. Training

In [16]:
classifier = SVC(
    random_state=random_state
)
classifier.fit(X_train, y_train)

## 6. Evaluation

In [17]:
y_pred = classifier.predict(X_val)
scores = accuracy_score(y_pred, y_val)

print('Evaluation results on validation set:')
print(f'Accuracy: {scores}')

Evaluation results on validation set:
Accuracy: 0.686046511627907
