# 0. Import Packages

In [4]:
import numpy as np
import pandas as pd

# Never truncate columns when a wide DataFrame is displayed
pd.options.display.max_columns = None

In [5]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's "darkgrid" look for every figure drawn below
sns.set_style(style="darkgrid")

In [14]:
from imblearn.over_sampling import SMOTE

In [7]:
from scipy.stats import chi2_contingency, pointbiserialr

# 1. Load Dataset 

In [8]:
# Fetch the breast-cancer dataset that ships with scikit-learn
cancer = load_breast_cancer()

# One frame holding the feature matrix plus the class labels in a 'target' column
cancer_df = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(
    target=cancer.target
)

print("Shape of The Dataset : ", cancer_df.shape)
cancer_df.head(3)


Shape of The Dataset :  (569, 31)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


# 2. The Split

In [9]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
train, test = train_test_split(
    cancer_df,
    test_size=0.3,
    random_state=99,
)

print("Training data shape:", train.shape)
print("Testing data shape:", test.shape)


Training data shape: (398, 31)
Testing data shape: (171, 31)


# 3. Data Exploration

## 3.1 Target Split & Balance

In [12]:
# Number of training rows in each class
train_targets = train["target"]
train_targets.value_counts()

target
1    244
0    154
Name: count, dtype: int64

In [15]:
# Oversample the training split only, separating features from labels first
train_features = train.drop(columns="target")
train_labels = train["target"]

smote = SMOTE(random_state=99)
X_train, y_train = smote.fit_resample(train_features, train_labels)

# Class counts after resampling
y_train.value_counts()

target
1    244
0    244
Name: count, dtype: int64

## 3.2 Null Value Analysis

In [17]:
# Total number of missing cells across the whole resampled feature matrix
X_train.isna().sum().sum()

0

The resampled training features contain no missing values, so no null-value handling is required.

## 3.3 Important Features & P-Value

In [10]:
def point_biserial_coeff(df, binary_cat, continuous_cat):
    """Point-biserial correlation between a binary and a continuous column.

    The coefficient lies in [-1, 1]: -1 is a perfect negative association,
    1 a perfect positive association and 0 no association. It is read the
    same way as a Pearson correlation coefficient (strength and direction
    of a linear relationship).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    binary_cat : str
        Name of the dichotomous (binary) column.
    continuous_cat : str
        Name of the continuous column.

    Returns
    -------
    tuple
        ``(coefficient, p_value)`` as computed by
        ``scipy.stats.pointbiserialr``.
    """
    dichotomous = df[binary_cat]
    continuous = df[continuous_cat]
    coefficient, significance = pointbiserialr(dichotomous, continuous)
    return coefficient, significance

In [11]:
# Point-biserial correlation of every feature in the raw training split with the target
excluded = ('target', 'label')  # columns that are not features

association_mat = []
for feature in train.columns:
    if feature in excluded:
        continue
    corr, p_value = point_biserial_coeff(train, 'target', feature)
    association_mat.append([feature, corr, p_value])

association_pd = pd.DataFrame(
    association_mat,
    columns=["Feature", "Point-Biserial Coef.", "P-value"],
)
# Smallest p-values (strongest evidence of association) first
association_pd.sort_values(by="P-value")

Unnamed: 0,Feature,Point-Biserial Coef.,P-value
27,worst concave points,-0.790124,3.340373e-86
22,worst perimeter,-0.780101,1.143069e-82
20,worst radius,-0.772451,4.292594e-80
7,mean concave points,-0.769577,3.750747e-79
2,mean perimeter,-0.74062,2.245303e-70
0,mean radius,-0.728663,4.372977e-67
23,worst area,-0.727402,9.493605e-67
3,mean area,-0.708006,8.485544e-62
6,mean concavity,-0.69065,1.062156e-57
26,worst concavity,-0.643659,6.206875e-48


All features other than "texture error", "symmetry error" and "mean fractal dimension" have a statistically significant association with the target.

In [19]:
# log1p(x) == log(1 + x): the +1 shift keeps zero-valued features finite, and
# log1p stays numerically accurate for features whose values are close to zero.
train_log = np.log1p(train.drop(columns='target'))
# Re-attach the untransformed class labels (same index as `train`)
train_log["target"] = train["target"]
train_log.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,2.700225,2.985248,4.510703,6.384107,0.092348,0.099519,0.083982,0.048135,0.166619,0.060999,0.330601,0.764605,1.272084,3.445645,0.007067,0.025192,0.031188,0.011771,0.020677,0.003805,2.822912,3.251321,4.65094,6.636414,0.124586,0.221193,0.231054,0.10815,0.254511,0.080729,0.613065
std,0.22487,0.211701,0.250657,0.488315,0.01325,0.046757,0.069054,0.036142,0.023316,0.006707,0.172584,0.230515,0.406253,0.726889,0.003104,0.016867,0.02567,0.005756,0.00846,0.002454,0.265644,0.235086,0.293085,0.565601,0.02091,0.11818,0.154613,0.058669,0.047834,0.016831,0.487662
min,2.077064,2.431857,3.801985,4.97328,0.062308,0.02317,0.0,0.0,0.110378,0.048752,0.10571,0.307632,0.563608,2.107543,0.001712,0.003007,0.0,0.0,0.007851,0.000894,2.189416,2.601949,3.939833,5.226821,0.068752,0.026924,0.0,0.0,0.145398,0.053579,0.0
25%,2.542389,2.842144,4.334803,6.043464,0.082927,0.064865,0.029646,0.020533,0.150358,0.056113,0.210605,0.600209,0.958391,2.939155,0.005222,0.012785,0.015851,0.007978,0.015051,0.002235,2.6428,3.083285,4.445294,6.248091,0.11002,0.137782,0.11353,0.061596,0.223703,0.068817,0.0
50%,2.673459,2.977313,4.481306,6.327311,0.092055,0.091521,0.065095,0.036559,0.16543,0.05981,0.287845,0.725372,1.202058,3.246491,0.006342,0.020694,0.025931,0.011251,0.018851,0.003211,2.773838,3.260401,4.594615,6.542884,0.123588,0.196019,0.208314,0.097081,0.248148,0.077044,1.0
75%,2.848102,3.122145,4.6819,6.706493,0.100818,0.122726,0.12617,0.072941,0.178397,0.064296,0.402333,0.906139,1.476417,3.871346,0.008166,0.032837,0.040799,0.014832,0.023384,0.004549,3.013324,3.421733,4.863101,7.052937,0.138043,0.293863,0.328332,0.153386,0.277329,0.088196,1.0
max,3.371082,3.695855,5.244389,7.824846,0.151347,0.296691,0.355434,0.183321,0.265436,0.091439,1.354029,1.772407,3.134624,6.297478,0.030655,0.101112,0.265283,0.040086,0.075988,0.022603,3.611998,3.874529,5.530222,8.35585,0.20098,0.721735,0.811819,0.255417,0.455778,0.188552,1.0


In [None]:
# Recompute the feature/target association on the log-transformed training data.
# Reading `train` here would just reproduce the raw-feature table computed earlier.
association_mat = []
for feature in train_log.columns:
    if feature != 'target' and feature != 'label':
        corr, p_value = point_biserial_coeff(train_log, 'target', feature)
        association_mat.append([feature, corr, p_value])


association_pd = pd.DataFrame(association_mat, columns=["Feature","Point-Biserial Coef.", "P-value"])
# Smallest p-values (strongest evidence of association) first
association_pd.sort_values(by = "P-value")