# 0. Import Packages

In [7]:
import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [10]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "darkgrid" style to plots drawn after this point.
sns.set_style("darkgrid")

In [28]:
from scipy.stats import chi2_contingency, pointbiserialr

# 1. Load Dataset 

In [8]:
# Fetch the bundled breast-cancer data and wrap the feature matrix in a
# DataFrame, attaching the class labels as a trailing 'target' column.
cancer = load_breast_cancer()
cancer_df = pd.DataFrame(
    cancer.data,
    columns=cancer.feature_names,
).assign(target=cancer.target)

print("Shape of The Dataset : ", cancer_df.shape)
cancer_df.head(3)


Shape of The Dataset :  (569, 31)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


# 2. The Split

In [11]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# partition identical from run to run.
train, test = train_test_split(
    cancer_df,
    test_size=0.3,
    random_state=99,
)

# Report the resulting (rows, columns) of each partition.
print("Training data shape:", train.shape)
print("Testing data shape:", test.shape)


Training data shape: (398, 31)
Testing data shape: (171, 31)


# 3. Data Exploration

In [17]:
        
def feature_association(df, target, feature, bins=10):
    """Measure the association between a numeric feature and a categorical target.

    The feature is discretised with ``pd.cut`` and cross-tabulated against the
    target. A chi-square test of independence is run on that contingency table
    and Cramér's V is derived from the resulting statistic.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    target : str
        Name of the categorical column (rows of the contingency table).
    feature : str
        Name of the numeric column to bin (columns of the contingency table).
    bins : int or sequence, default 10
        Passed straight to ``pd.cut``; the default reproduces the previous
        hard-coded behaviour of ten equal-width intervals.

    Returns
    -------
    tuple
        ``(cramers_v, chi2, p, dof)``. ``cramers_v`` is ``nan`` when the
        contingency table has fewer than two rows or fewer than two columns,
        because the statistic is undefined there.
    """
    contingency_table = pd.crosstab(df[target], pd.cut(df[feature], bins=bins))
    # Only the first three results are needed; the expected-frequency table is discarded.
    chi2, p, dof, _ = chi2_contingency(contingency_table)

    n = contingency_table.sum().sum()
    smaller_dim = min(contingency_table.shape) - 1

    # A single-row or single-column table (e.g. only one class present) would
    # make the denominator zero; report "undefined" explicitly instead of
    # relying on a 0/0 floating-point division and its RuntimeWarning.
    if smaller_dim < 1:
        return np.nan, chi2, p, dof

    cramers_v = np.sqrt(chi2 / (n * smaller_dim))
    return cramers_v, chi2, p, dof


In [27]:
# Screen every predictor column of the training split against the target.
# Each row is [feature name, Cramer's V, chi-square statistic, p-value, dof].
non_feature_columns = ('target', 'label')
association_mat = [
    [feature, *feature_association(train, 'target', feature)]
    for feature in train.columns
    if feature not in non_feature_columns
]

result_columns = ["Feature", "Cramer's Value", "Chi Sq. Stat.", "P-value", "Degree of Freedom"]
association_pd = pd.DataFrame(association_mat, columns=result_columns)

# Smallest p-values (strongest evidence of association) first.
association_pd.sort_values(by="P-value")

Unnamed: 0,Feature,Cramer's Value,Chi Sq. Stat.,P-value,Degree of Freedom
27,worst concave points,0.854649,290.709296,2.4358259999999998e-57,9
22,worst perimeter,0.84262,282.583333,1.283475e-55,9
20,worst radius,0.826639,271.966248,2.270242e-53,9
7,mean concave points,0.82345,269.871993,6.297546e-53,9
23,worst area,0.800114,254.792667,1.659821e-50,8
2,mean perimeter,0.784901,245.195931,1.030151e-47,9
0,mean radius,0.765919,233.479525,3.042918e-45,9
6,mean concavity,0.763352,231.917026,6.493293e-45,9
3,mean area,0.752396,225.307895,2.908814e-44,8
26,worst concavity,0.737647,216.560928,1.106133e-41,9


All columns other than "texture error" and "smoothness error" show a significant association with the target.

In [29]:
# Calculate point-biserial correlation coefficient
point_biserial_corr, p_value = pointbiserialr(cancer_df['target'], cancer_df["worst concave points"])

print("Point-Biserial Correlation Coefficient:", point_biserial_corr)
print("P-value:", p_value)

Point-Biserial Correlation Coefficient: -0.7935660171412691
P-value: 1.9690997072187384e-124
