## In this notebook, we will perform some statistical tests on preprocessed breast cancer dataset

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import norm, shapiro, levene

In [2]:
# Load the dataset
file_path = '/kaggle/input/breastcancer-preprocessed/BreastCancer-Preprocessed.csv'
df = pd.read_csv(file_path)

##  Display the first few rows of the dataset to understand its structure

In [3]:
df.head()

Unnamed: 0,id,age,pathsize,lnpos,histgrad,er,pr,status,time,lnpos_YN,pathsize_Cat
0,1.0,60.0,,0.0,3.0,0.0,0.0,0.0,9.466667,No,
1,2.0,79.0,,0.0,,,,0.0,8.6,No,
2,3.0,82.0,,0.0,2.0,,,0.0,19.333333,No,
3,4.0,66.0,,0.0,2.0,1.0,1.0,0.0,16.333333,No,
4,5.0,52.0,,0.0,3.0,,,0.0,8.5,No,


## Understanding the Variables

The `lnpos_YN` and `status` are categorical variables that we want to test for independence.
Let's take a look at the unique values in these columns.

In [4]:
# Check the unique values in 'lnpos_YN' and 'status'
print("Unique values in 'lnpos_YN':", df['lnpos_YN'].unique())
print("Unique values in 'status':", df['status'].unique())


Unique values in 'lnpos_YN': ['No' 'Yes']
Unique values in 'status': [0. 1.]


## Creating a Contingency Table

We will create a contingency table to summarize the relationship between the two categorical variables.

In [5]:
# Create a contingency table
contingency_table = pd.crosstab(df['lnpos_YN'], df['status'])
print(contingency_table)


status    0.0  1.0
lnpos_YN          
No        887   42
Yes       248   30


In [6]:
row_percentage = contingency_table.apply(lambda x: x / x.sum() * 100, axis=1)
row_percentage

status,0.0,1.0
lnpos_YN,Unnamed: 1_level_1,Unnamed: 2_level_1
No,95.47901,4.52099
Yes,89.208633,10.791367


## Performing the Chi-Square Test for Independence

The Chi-Square Test for Independence will help us determine whether there is a significant association between `lnpos_YN` and `status`.


In [7]:
# Perform the Chi-Square Test for Independence
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print("Chi-Square Statistic:", chi2)
print("***********************************")
print("p-value:", p)
print("***********************************")
print("Degrees of Freedom:", dof)
print("***********************************")
print("Expected Frequencies Table:\n", expected)

Chi-Square Statistic: 13.900758213995182
***********************************
p-value: 0.00019272070767964805
***********************************
Degrees of Freedom: 1
***********************************
Expected Frequencies Table:
 [[873.58326429  55.41673571]
 [261.41673571  16.58326429]]


## Interpreting the Results

- **Chi-Square Statistic**: A measure of how much the observed counts deviate from the expected counts.
- **p-value**: The probability of obtaining a Chi-Square statistic at least as extreme as the one computed, assuming that the variables are independent.
- **Degrees of Freedom**: Number of values that are free to vary given the constraints.
- **Expected Frequencies Table**: The expected counts if the variables are truly independent.

If the p-value is less than our significance level (typically 0.05), we reject the null hypothesis and conclude that there is a significant association between `lnpos_YN` and `status`.


## Calculate the residuals

In [8]:
# Calculate the residuals
residuals = (contingency_table - expected)
print("Residuals:\n", residuals)

Residuals:
 status          0.0        1.0
lnpos_YN                      
No        13.416736 -13.416736
Yes      -13.416736  13.416736


## Interpreting the Residuals

Residuals show the difference between observed and expected frequencies, standardized by the expected frequency. Large residuals indicate cells that contribute significantly to the Chi-Square statistic.


## Calculating the Odds Ratio

The odds ratio will provide a measure of association between the two categorical variables. Note that the odds ratio is only appropriate for 2x2 tables.


In [9]:
   # Check if the table is 2x2
if contingency_table.shape == (2, 2):
    # Extract values from the contingency table
    a = contingency_table.iloc[0, 0]
    b = contingency_table.iloc[0, 1]
    c = contingency_table.iloc[1, 0]
    d = contingency_table.iloc[1, 1]
    
    # Calculate the Odds Ratio
    odds_ratio = (a / b) / (c / d)
    
    log_OddsRatio = np.log(odds_ratio)
    _log_or = np.sqrt(1/a + 1/b + 1/c + 1/d)
    
    # 95% Confidence interval for odds ratio
    z = norm.ppf(0.975)
    CI_lower = np.exp(log_OddsRatio - z * _log_or)
    CI_upper = np.exp(log_OddsRatio + z * _log_or)
    
    # Output results
    print(f"Odds Ratio: {odds_ratio:.2f}")
    print(f"95% Confidence Interval (CI): ({CI_lower:.4f}, {CI_upper:.4f})")
     

Odds Ratio: 2.55
95% Confidence Interval (CI): (1.5663, 4.1668)


## Interpreting the Results

- **Odds Ratio**: Measure of association between the two variables, applicable for 2x2 tables.
- **95% Confidence interval for odds ratio** : A confidence interval (CI) provides a range of values that is likely to contain the population parameter with a certain level of confidence (usually 95%). 


## The Fisher Exact Test 

In [10]:
# Fisher is typically used when sample sizes are small, especially when the expected frequency in any of the cells of a contingency table is less than 5
# Check if any expected frequency is less than 5

if (expected < 5).any():
    print("At least one expected frequency is less than 5. Performing Fisher Exact Test.")
    
    # Fisher Exact Test can only be performed on 2x2 tables
    if contingency_table.shape == (2, 2):
        oddsratio, p_fisher = stats.fisher_exact(contingency_table)
        print("Fisher Exact Test p-value:", p_fisher)
    else:
        print("Fisher Exact Test is not applicable for tables larger than 2x2.")
else:
    print("All expected frequencies are 5 or greater. Performing Chi-Square Test for Independence.")


All expected frequencies are 5 or greater. Performing Chi-Square Test for Independence.


## Independent t test

In [11]:
df.lnpos_YN.value_counts()


lnpos_YN
No     929
Yes    278
Name: count, dtype: int64

In [12]:
df[df['lnpos_YN'] == 'Yes']['time']

8        27.633333
9        11.133333
10       11.066667
20      107.800000
27       48.666667
           ...    
1200     33.700000
1201     32.366667
1202      6.866667
1204     15.166667
1205     45.133333
Name: time, Length: 278, dtype: float64

In [13]:
groupY =df[df['lnpos_YN'] == 'Yes']['time']
groupN = df[df['lnpos_YN'] == 'No']['time']

**Check normality**

In [14]:
sh_lnpose_Y = stats.shapiro(groupY)
sh_lnpose_N = stats.shapiro(groupN)
print(sh_lnpose_N)
print(sh_lnpose_Y)

ShapiroResult(statistic=0.958610475063324, pvalue=1.531057368424826e-15)
ShapiroResult(statistic=0.9339127540588379, pvalue=8.318342126223399e-10)


In [15]:
levene1 = levene(groupN,groupY)
levene1

LeveneResult(statistic=1.1169212722022501, pvalue=0.29079417684064907)

In [16]:
if levene1.pvalue > 0.05 :
    t_statistics , p_value = stats.ttest_ind(groupN,groupY,equal_var=True)
    print('H0 is acceptible and variances are equal in lnpose_YN and Time')
else:
    t_statistics , p_value = stats.ttest_ind(groupN,groupY, equal_var=False)
    print('H1 is acceptible and variances are not equal in lnpose_YN and Time')

H0 is acceptible and variances are equal in lnpose_YN and Time


In [17]:
pathsize_groups = df['pathsize_Cat'].dropna().unique()
pathsize_groups

array(['<= 2', '2 <  <= 5', '> 5'], dtype=object)

In [18]:
df[df['pathsize_Cat'] == pathsize_groups[0]]

Unnamed: 0,id,age,pathsize,lnpos,histgrad,er,pr,status,time,lnpos_YN,pathsize_Cat
86,14.0,44.0,0.10,1.0,,,,0.0,108.900000,Yes,<= 2
87,15.0,60.0,0.15,0.0,1.0,1.0,1.0,0.0,16.366667,No,<= 2
88,16.0,49.0,0.20,0.0,1.0,,,0.0,94.333333,No,<= 2
89,17.0,41.0,0.20,0.0,1.0,,,0.0,87.500000,No,<= 2
90,18.0,39.0,0.26,0.0,2.0,,,0.0,73.033333,No,<= 2
...,...,...,...,...,...,...,...,...,...,...,...
907,1249.0,38.0,2.00,0.0,2.0,,,1.0,25.433333,No,<= 2
908,1252.0,29.0,1.50,1.0,3.0,,0.0,0.0,35.666667,Yes,<= 2
909,1256.0,59.0,1.20,0.0,,1.0,0.0,0.0,133.800000,No,<= 2
910,1261.0,41.0,1.20,0.0,2.0,1.0,1.0,0.0,90.166667,No,<= 2


In [19]:
for level, group in df.groupby('pathsize_Cat'):
     groupN = group[group['lnpos_YN'] == 'No']['time']
     groupY = group[group['lnpos_YN'] == 'Yes']['time']
 

In [20]:
shapiro_pathsize_group1 = shapiro(groupN)
shapiro_pathsize_group2 = shapiro(groupY)

# Check homogeneity of variances for pathsize_Cat groups
levene_pathsize = levene(groupN, groupY)

# Perform pairwise t-tests based on Levene's test result
if levene_pathsize.pvalue > 0.05:
    # Use equal variances
    t_statistic_1, p_value_1 = stats.ttest_ind(groupN, groupY, equal_var=True)
else:
    # Use unequal variances
    t_statistic_1, p_value_1 = stats.ttest_ind(groupN, groupY, equal_var=False)

# Print results


print(f'Shapiro test for pathsize_Cat group1: {shapiro_pathsize_group1}')
print(f'Shapiro test for pathsize_Cat group2: {shapiro_pathsize_group2}')
print(f'Levene test for pathsize_Cat: {levene_pathsize}')
print(f'T-Test for pathsize_Cat: t_statistic = {t_statistic_1}, p_value = {p_value_1}')

Shapiro test for pathsize_Cat group1: ShapiroResult(statistic=0.9486790895462036, pvalue=0.7078753113746643)
Shapiro test for pathsize_Cat group2: ShapiroResult(statistic=0.9288694262504578, pvalue=0.5058420300483704)
Levene test for pathsize_Cat: LeveneResult(statistic=0.05583725989193762, pvalue=0.8179702620924999)
T-Test for pathsize_Cat: t_statistic = -0.3014836920469003, p_value = 0.7692217601243535
