# Cancer Prediction

## Import Data

In [1]:
import pandas as pd

In [2]:
DATASET_PATH = "./cancerdata.csv"

In [3]:
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,id,diagnosis,Sex,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,B,F,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,B,M,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,B,M,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,B,F,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,B,F,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


### Remove ID column

In [4]:
df.drop(columns=['id'], inplace=True)

## Data Exploration

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   diagnosis          569 non-null    object 
 1   Sex                569 non-null    object 
 2   radius_mean        569 non-null    float64
 3   texture_mean       569 non-null    float64
 4   perimeter_mean     569 non-null    float64
 5   area_mean          569 non-null    float64
 6   smoothness_mean    569 non-null    float64
 7   compactness_mean   569 non-null    float64
 8   concavity_mean     567 non-null    float64
 9   points_mean        568 non-null    float64
 10  symmetry_mean      568 non-null    float64
 11  dimension_mean     569 non-null    float64
 12  radius_se          569 non-null    float64
 13  texture_se         569 non-null    float64
 14  perimeter_se       569 non-null    float64
 15  area_se            569 non-null    float64
 16  smoothness_se      569 non

### Missing Values

In [6]:
def missing_value_report(dataframe: pd.DataFrame) -> dict[str, float]:
    """ Returns the percentage of missing values per column """
    missing_values_percentage = 100 * dataframe.isna().sum(axis='index') / dataframe.count()
    return missing_values_percentage[missing_values_percentage > 0].to_dict()


display(missing_value_report(df))

{'concavity_mean': 0.3527336860670194,
 'points_mean': 0.176056338028169,
 'symmetry_mean': 0.176056338028169}

In [7]:
# Fill the numerical columns
NUMERICAL_COLUMNS = df.select_dtypes(include=['number']).columns
numerical_fill_value = df[NUMERICAL_COLUMNS].median()
df[NUMERICAL_COLUMNS] = df[NUMERICAL_COLUMNS].fillna(numerical_fill_value)

# Fill the categorical columns
CATEGORICAL_COLUMNS = df.select_dtypes(include=['object']).columns
categorical_fill_value = df[CATEGORICAL_COLUMNS].mode()
df[CATEGORICAL_COLUMNS] = df[CATEGORICAL_COLUMNS].fillna(categorical_fill_value)

In [8]:
missing_value_report(df)

{}

__All the missing values have been imputed and now our dataframe has zero missing values__

### Visual Exploration

In [9]:
import plotly.express as px

In [10]:
fig = px.pie(df, names='Sex', title='Gender Distribution', height=400, width=400)
fig.show()

In [11]:
fig = px.pie(df, names='diagnosis', title='Class Imbalance', height=400, width=400)
fig.show()

## Data Scaling & Encoding

In [12]:
import numpy as np

# Standardize the numerical columns so that every numerical column
# has Mean = 0 and Standard Deviation = 1

df[NUMERICAL_COLUMNS] = (df[NUMERICAL_COLUMNS] - df[NUMERICAL_COLUMNS]) / df[NUMERICAL_COLUMNS].std()

# Encodes the categorical variables
for col in CATEGORICAL_COLUMNS:
    unique_cats = df[col].unique()
    mapping = {category: i for i, category in enumerate(unique_cats)}
    df[col] = df[col].map(mapping)


# Reduce the datatypes
for column, dtype in zip(df.columns, df.dtypes.values):
    if dtype == 'float64':
        df[column] = df[column].astype(np.float16)
    if dtype == 'int64':
        df[column] = df[column].astype(np.uint8)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   diagnosis          569 non-null    uint8  
 1   Sex                569 non-null    uint8  
 2   radius_mean        569 non-null    float16
 3   texture_mean       569 non-null    float16
 4   perimeter_mean     569 non-null    float16
 5   area_mean          569 non-null    float16
 6   smoothness_mean    569 non-null    float16
 7   compactness_mean   569 non-null    float16
 8   concavity_mean     569 non-null    float16
 9   points_mean        569 non-null    float16
 10  symmetry_mean      569 non-null    float16
 11  dimension_mean     569 non-null    float16
 12  radius_se          569 non-null    float16
 13  texture_se         569 non-null    float16
 14  perimeter_se       569 non-null    float16
 15  area_se            569 non-null    float16
 16  smoothness_se      569 non

## Model Training

In [13]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

TARGET = "diagnosis"





In [14]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=TARGET), df.loc[:, TARGET], test_size=0.2, random_state=1337)

In [15]:
settings = dict(
    generations = 50,
    population_size = 50,
    scoring = 'f1_weighted',
    cv = 5,
    subsample = 0.5,
    n_jobs = -1,
    verbosity = 2,
    random_state = 1337
)

tpot_model = TPOTClassifier(**settings)
tpot_model.fit(X_train, y_train)

                                                                               
Generation 1 - Current best internal CV score: 0.5645777541607456
                                                                               
Generation 2 - Current best internal CV score: 0.5645777541607456
                                                                               
Generation 3 - Current best internal CV score: 0.5645777541607456
                                                                               
Generation 4 - Current best internal CV score: 0.5645777541607456
                                                                               
Generation 5 - Current best internal CV score: 0.5645777541607456
                                                                               
Generation 6 - Current best internal CV score: 0.5645777541607456
                                                                               
Generation 7 - Current best internal CV scor

In [17]:
from sklearn.metrics import classification_report

print(classification_report(y_test, tpot_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.61      0.70      0.65        69
           1       0.40      0.31      0.35        45

    accuracy                           0.54       114
   macro avg       0.50      0.50      0.50       114
weighted avg       0.53      0.54      0.53       114



In [20]:
# export optimal pipeline
tpot_model.export("tpot_pipeline.py")

In [None]:
# Exported pipeline
import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=1337)

# Average CV score on the training set was: 0.599539016730704
exported_pipeline = make_pipeline(
    Nystroem(gamma=0.15000000000000002, kernel="rbf", n_components=5),
    FastICA(tol=0.0),
    GaussianNB()
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1337)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
