# Import Data

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import chi2_contingency
from scipy import stats
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/Breast Cancer Classification/t-Test-PCA-Rfe/'
data = pd.read_csv(file_path + 'wdbc.csv')
data.columns

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave_points_worst',
       'symmetry_worst', 'fractal_dimenstion_worst', 'Class'],
      dtype='object')

In [4]:
data.shape[1]

31

# Feature Selection and Extraction

## t-Test

In [5]:
# Separate the features and class label
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Normalize the features using MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)


# Calculate the T-Test p-values for each feature
t_values, p_values = np.zeros(X_normalized.shape[1]), np.zeros(X_normalized.shape[1])
for i in range(X_normalized.shape[1]):
    t_values[i], p_values[i] = stats.ttest_ind(X_normalized[y == 0, i], X_normalized[y == 1, i], equal_var=False)

# Rank the features based on their T-Test p-values
sorted_indices = np.argsort(p_values)
feature_ranks = X.columns[sorted_indices]


feature_ranks

Index(['radius_mean', 'concave_points_worst', 'concavity_worst',
       'compactness_worst', 'smoothness_worst', 'area_worst',
       'perimeter_worst', 'texture_worst', 'radius_worst',
       'fractal_dimension_se', 'symmetry_se', 'concave_points_se',
       'concavity_se', 'compactness_se', 'smoothness_se', 'area_se',
       'perimeter_se', 'texture_se', 'radius_se', 'fractal_dimension_mean',
       'symmetry_mean', 'concave_points_mean', 'concavity_mean',
       'compactness_mean', 'smoothness_mean', 'area_mean', 'perimeter_mean',
       'texture_mean', 'symmetry_worst', 'fractal_dimenstion_worst'],
      dtype='object')

In [6]:
# Save the feature ranks to a CSV file
ranked_features_df = pd.DataFrame({'Feature': feature_ranks})
ranked_features_df.to_csv('feature_ranks_t_test.csv', index=False)

# Rearrange the dataset based on the feature ranks
rearranged_data = data[feature_ranks.tolist() + ['Class']]

# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_t_test.csv', index=False)


In [7]:
ranked_features_df

Unnamed: 0,Feature
0,radius_mean
1,concave_points_worst
2,concavity_worst
3,compactness_worst
4,smoothness_worst
5,area_worst
6,perimeter_worst
7,texture_worst
8,radius_worst
9,fractal_dimension_se


In [8]:
# Showing the dataset
df = pd.read_csv('/content/rearranged_data_t_test.csv')
df

Unnamed: 0,radius_mean,concave_points_worst,concavity_worst,compactness_worst,smoothness_worst,area_worst,perimeter_worst,texture_worst,radius_worst,fractal_dimension_se,...,concave_points_mean,concavity_mean,compactness_mean,smoothness_mean,area_mean,perimeter_mean,texture_mean,symmetry_worst,fractal_dimenstion_worst,Class
0,17.99,0.2654,0.7119,0.66560,0.16220,2019.0,184.60,17.33,25.380,0.006193,...,0.14710,0.30010,0.27760,0.11840,1001.0,122.80,10.38,0.4601,0.11890,M
1,20.57,0.1860,0.2416,0.18660,0.12380,1956.0,158.80,23.41,24.990,0.003532,...,0.07017,0.08690,0.07864,0.08474,1326.0,132.90,17.77,0.2750,0.08902,M
2,19.69,0.2430,0.4504,0.42450,0.14440,1709.0,152.50,25.53,23.570,0.004571,...,0.12790,0.19740,0.15990,0.10960,1203.0,130.00,21.25,0.3613,0.08758,M
3,11.42,0.2575,0.6869,0.86630,0.20980,567.7,98.87,26.50,14.910,0.009208,...,0.10520,0.24140,0.28390,0.14250,386.1,77.58,20.38,0.6638,0.17300,M
4,20.29,0.1625,0.4000,0.20500,0.13740,1575.0,152.20,16.67,22.540,0.005115,...,0.10430,0.19800,0.13280,0.10030,1297.0,135.10,14.34,0.2364,0.07678,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,0.2216,0.4107,0.21130,0.14100,2027.0,166.10,26.40,25.450,0.004239,...,0.13890,0.24390,0.11590,0.11100,1479.0,142.00,22.39,0.2060,0.07115,M
565,20.13,0.1628,0.3215,0.19220,0.11660,1731.0,155.00,38.25,23.690,0.002498,...,0.09791,0.14400,0.10340,0.09780,1261.0,131.20,28.25,0.2572,0.06637,M
566,16.60,0.1418,0.3403,0.30940,0.11390,1124.0,126.70,34.12,18.980,0.003892,...,0.05302,0.09251,0.10230,0.08455,858.1,108.30,28.08,0.2218,0.07820,M
567,20.60,0.2650,0.9387,0.86810,0.16500,1821.0,184.60,39.42,25.740,0.006185,...,0.15200,0.35140,0.27700,0.11780,1265.0,140.10,29.33,0.4087,0.12400,M


## PCA

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [10]:
# Load the dataset
data_PCA = pd.read_csv('rearranged_data_t_test.csv')
data_PCA

Unnamed: 0,radius_mean,concave_points_worst,concavity_worst,compactness_worst,smoothness_worst,area_worst,perimeter_worst,texture_worst,radius_worst,fractal_dimension_se,...,concave_points_mean,concavity_mean,compactness_mean,smoothness_mean,area_mean,perimeter_mean,texture_mean,symmetry_worst,fractal_dimenstion_worst,Class
0,17.99,0.2654,0.7119,0.66560,0.16220,2019.0,184.60,17.33,25.380,0.006193,...,0.14710,0.30010,0.27760,0.11840,1001.0,122.80,10.38,0.4601,0.11890,M
1,20.57,0.1860,0.2416,0.18660,0.12380,1956.0,158.80,23.41,24.990,0.003532,...,0.07017,0.08690,0.07864,0.08474,1326.0,132.90,17.77,0.2750,0.08902,M
2,19.69,0.2430,0.4504,0.42450,0.14440,1709.0,152.50,25.53,23.570,0.004571,...,0.12790,0.19740,0.15990,0.10960,1203.0,130.00,21.25,0.3613,0.08758,M
3,11.42,0.2575,0.6869,0.86630,0.20980,567.7,98.87,26.50,14.910,0.009208,...,0.10520,0.24140,0.28390,0.14250,386.1,77.58,20.38,0.6638,0.17300,M
4,20.29,0.1625,0.4000,0.20500,0.13740,1575.0,152.20,16.67,22.540,0.005115,...,0.10430,0.19800,0.13280,0.10030,1297.0,135.10,14.34,0.2364,0.07678,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,0.2216,0.4107,0.21130,0.14100,2027.0,166.10,26.40,25.450,0.004239,...,0.13890,0.24390,0.11590,0.11100,1479.0,142.00,22.39,0.2060,0.07115,M
565,20.13,0.1628,0.3215,0.19220,0.11660,1731.0,155.00,38.25,23.690,0.002498,...,0.09791,0.14400,0.10340,0.09780,1261.0,131.20,28.25,0.2572,0.06637,M
566,16.60,0.1418,0.3403,0.30940,0.11390,1124.0,126.70,34.12,18.980,0.003892,...,0.05302,0.09251,0.10230,0.08455,858.1,108.30,28.08,0.2218,0.07820,M
567,20.60,0.2650,0.9387,0.86810,0.16500,1821.0,184.60,39.42,25.740,0.006185,...,0.15200,0.35140,0.27700,0.11780,1265.0,140.10,29.33,0.4087,0.12400,M


In [11]:
# Separate the features and class label
X = data_PCA.iloc[:, :-1]
y = data_PCA.iloc[:, -1]

# Normalize the features using StandardScaler
scaler = StandardScaler()

X_normalized = scaler.fit_transform(X)
X_normalized = X
X_normalized = X_normalized.fillna(0)

In [12]:
# Apply PCA to the normalized features
pca = PCA()
pca.fit(X_normalized)

In [13]:
# Calculate the variance of all principal components
variance = pca.explained_variance_ratio_

# Rank the principal components based on their variance
sorted_indices = sorted(range(len(variance)), key=lambda i: variance[i], reverse=True)

# Save the variance values of all principal components to a CSV file
variance_df = pd.DataFrame({'Principal Component': range(1, len(variance)+1), 'Variance': variance[sorted_indices]})
variance_df.to_csv('pca_variance.csv', index=False)

variance_df

Unnamed: 0,Principal Component,Variance
0,1,0.9820447
1,2,0.01617649
2,3,0.001557511
3,4,0.000120932
4,5,8.827245e-05
5,6,6.64884e-06
6,7,4.017137e-06
7,8,8.220172e-07
8,9,3.441353e-07
9,10,1.860187e-07


In [14]:
# Rearrange the dataset based on the variance of the principal components
X_transformed = pca.transform(X_normalized)[:, sorted_indices]
rearranged_data = pd.DataFrame(X_transformed, columns=['PCA ' + str(i+1) for i in range(X_transformed.shape[1])])
rearranged_data['Class'] = y

rearranged_data

Unnamed: 0,PCA 1,PCA 2,PCA 3,PCA 4,PCA 5,PCA 6,PCA 7,PCA 8,PCA 9,PCA 10,...,PCA 22,PCA 23,PCA 24,PCA 25,PCA 26,PCA 27,PCA 28,PCA 29,PCA 30,Class
0,1160.142574,-293.917544,48.578398,-8.711975,32.000486,1.265415,0.931337,0.148167,0.745463,0.589359,...,0.000241,0.002528,0.011560,0.005773,0.001377,-0.001982,0.001293,0.001989,0.000704,M
1,1269.122443,15.630182,-35.394534,17.861283,-4.334874,-0.225872,-0.046037,0.200804,-0.485828,-0.084035,...,0.021069,0.001565,0.006968,-0.006978,0.001411,-0.000083,-0.001347,0.000686,-0.001061,M
2,995.793889,39.156743,-1.709753,4.199340,-0.466529,-2.652811,-0.779745,-0.274026,-0.173874,-0.186994,...,-0.002394,-0.004125,-0.004007,0.000709,-0.003781,0.000178,0.000018,-0.000775,0.000405,M
3,-407.180803,-67.380320,8.672848,-11.759867,7.115461,1.299436,-1.267304,-0.060555,-0.330639,-0.144155,...,0.007063,0.001537,0.007003,-0.010261,-0.002899,0.000016,0.001369,-0.002139,-0.001657,M
4,930.341180,189.340742,1.374801,8.499183,7.613289,1.021160,-0.335522,0.289109,0.036087,-0.138502,...,0.010269,0.002204,0.002764,0.002455,0.001665,0.003290,0.000273,0.001783,0.000327,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1414.126684,110.222492,40.065944,6.562240,-5.102856,-0.395424,-0.786751,0.037082,-0.452530,-0.235185,...,0.007864,-0.002317,-0.002384,-0.003637,-0.008211,0.002418,0.001234,-0.000078,-0.000455,M
565,1045.018854,77.057589,0.036669,-4.753245,-12.417863,-0.059637,0.449831,0.509154,-0.449986,0.493247,...,-0.001905,-0.003028,-0.007931,0.002905,-0.002519,0.000212,0.001006,-0.000621,-0.000741,M
566,314.501756,47.553525,-10.442407,-9.771881,-6.156213,-0.870726,-2.166493,-0.442279,-0.097398,-0.144667,...,-0.002249,-0.001248,-0.003927,-0.000921,0.000573,-0.001325,0.000025,0.000484,-0.000285,M
567,1124.858115,34.129225,-19.742087,-23.660881,3.565133,4.086390,-1.705401,-0.359964,0.385030,0.615467,...,-0.010804,0.005841,0.001127,-0.002646,0.001862,0.002698,0.001235,-0.000809,0.001217,M


In [15]:
# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_pca.csv', index=False)

## RFE

In [16]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


In [17]:
# Load the dataset
data_RFE = pd.read_csv('rearranged_data_pca.csv')
data_RFE

Unnamed: 0,PCA 1,PCA 2,PCA 3,PCA 4,PCA 5,PCA 6,PCA 7,PCA 8,PCA 9,PCA 10,...,PCA 22,PCA 23,PCA 24,PCA 25,PCA 26,PCA 27,PCA 28,PCA 29,PCA 30,Class
0,1160.142574,-293.917544,48.578398,-8.711975,32.000486,1.265415,0.931337,0.148167,0.745463,0.589359,...,0.000241,0.002528,0.011560,0.005773,0.001377,-0.001982,0.001293,0.001989,0.000704,M
1,1269.122443,15.630182,-35.394534,17.861283,-4.334874,-0.225872,-0.046037,0.200804,-0.485828,-0.084035,...,0.021069,0.001565,0.006968,-0.006978,0.001411,-0.000083,-0.001347,0.000686,-0.001061,M
2,995.793889,39.156743,-1.709753,4.199340,-0.466529,-2.652811,-0.779745,-0.274026,-0.173874,-0.186994,...,-0.002394,-0.004125,-0.004007,0.000709,-0.003781,0.000178,0.000018,-0.000775,0.000405,M
3,-407.180803,-67.380320,8.672848,-11.759867,7.115461,1.299436,-1.267304,-0.060555,-0.330639,-0.144155,...,0.007063,0.001537,0.007003,-0.010261,-0.002899,0.000016,0.001369,-0.002139,-0.001657,M
4,930.341180,189.340742,1.374801,8.499183,7.613289,1.021160,-0.335522,0.289109,0.036087,-0.138502,...,0.010269,0.002204,0.002764,0.002455,0.001665,0.003290,0.000273,0.001783,0.000327,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1414.126684,110.222492,40.065944,6.562240,-5.102856,-0.395424,-0.786751,0.037082,-0.452530,-0.235185,...,0.007864,-0.002317,-0.002384,-0.003637,-0.008211,0.002418,0.001234,-0.000078,-0.000455,M
565,1045.018854,77.057589,0.036669,-4.753245,-12.417863,-0.059637,0.449831,0.509154,-0.449986,0.493247,...,-0.001905,-0.003028,-0.007931,0.002905,-0.002519,0.000212,0.001006,-0.000621,-0.000741,M
566,314.501756,47.553525,-10.442407,-9.771881,-6.156213,-0.870726,-2.166493,-0.442279,-0.097398,-0.144667,...,-0.002249,-0.001248,-0.003927,-0.000921,0.000573,-0.001325,0.000025,0.000484,-0.000285,M
567,1124.858115,34.129225,-19.742087,-23.660881,3.565133,4.086390,-1.705401,-0.359964,0.385030,0.615467,...,-0.010804,0.005841,0.001127,-0.002646,0.001862,0.002698,0.001235,-0.000809,0.001217,M


In [18]:
# Separate the features and class label
X = data_RFE.iloc[:, :-1]
y = data_RFE.iloc[:, -1]

# Normalize the features using MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Define the random forest classifier
rf = RandomForestClassifier(n_estimators=100)

# Apply RFE with cross-validation to rank the features
rfe = RFE(estimator=rf, n_features_to_select=1, step=1)
rfe.fit(X_normalized, y)

# Rank the features based on their importance scores
feature_ranks = X.columns[rfe.ranking_ - 1]

# Save the feature ranks to a CSV file
ranked_features_df = pd.DataFrame({'Feature': feature_ranks})
ranked_features_df.to_csv('feature_ranks_rfe_rf.csv', index=False)

ranked_features_df

Unnamed: 0,Feature
0,PCA 1
1,PCA 3
2,PCA 4
3,PCA 2
4,PCA 8
5,PCA 10
6,PCA 11
7,PCA 14
8,PCA 6
9,PCA 23


In [19]:
# Rearrange the dataset based on the feature ranks
rearranged_data = data_RFE[feature_ranks.tolist() + ['Class']]

# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_rfe_rf.csv', index=False)

In [20]:
rearranged_data

Unnamed: 0,PCA 1,PCA 3,PCA 4,PCA 2,PCA 8,PCA 10,PCA 11,PCA 14,PCA 6,PCA 23,...,PCA 25,PCA 17,PCA 28,PCA 18,PCA 24,PCA 21,PCA 27,PCA 22,PCA 29,Class
0,1160.142574,48.578398,-8.711975,-293.917544,0.148167,0.589359,-0.307804,0.065069,1.265415,0.002528,...,0.005773,0.018300,0.001293,0.010263,0.011560,0.021189,-0.001982,0.000241,0.001989,M
1,1269.122443,-35.394534,17.861283,15.630182,0.200804,-0.084035,0.080642,-0.005534,-0.225872,0.001565,...,-0.006978,0.012371,-0.001347,-0.006009,0.006968,0.005237,-0.000083,0.021069,0.000686,M
2,995.793889,-1.709753,4.199340,39.156743,-0.274026,-0.186994,0.279174,0.024824,-2.652811,-0.004125,...,0.000709,0.008218,0.000018,-0.028044,-0.004007,-0.009865,0.000178,-0.002394,-0.000775,M
3,-407.180803,8.672848,-11.759867,-67.380320,-0.060555,-0.144155,0.927471,0.080057,1.299436,0.001537,...,-0.010261,0.033742,0.001369,-0.016965,0.007003,0.011169,0.000016,0.007063,-0.002139,M
4,930.341180,1.374801,8.499183,189.340742,0.289109,-0.138502,0.042228,0.002274,1.021160,0.002204,...,0.002455,-0.019201,0.000273,0.004024,0.002764,-0.009916,0.003290,0.010269,0.001783,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1414.126684,40.065944,6.562240,110.222492,0.037082,-0.235185,0.163649,-0.015211,-0.395424,-0.002317,...,-0.003637,-0.004829,0.001234,-0.011515,-0.002384,-0.017214,0.002418,0.007864,-0.000078,M
565,1045.018854,0.036669,-4.753245,77.057589,0.509154,0.493247,0.007625,0.009985,-0.059637,-0.003028,...,0.002905,0.005197,0.001006,0.002106,-0.007931,0.011219,0.000212,-0.001905,-0.000621,M
566,314.501756,-10.442407,-9.771881,47.553525,-0.442279,-0.144667,-0.109147,-0.055285,-0.870726,-0.001248,...,-0.000921,0.007866,0.000025,-0.004484,-0.003927,-0.003362,-0.001325,-0.002249,0.000484,M
567,1124.858115,-19.742087,-23.660881,34.129225,-0.359964,0.615467,0.307166,-0.037742,4.086390,0.005841,...,-0.002646,0.015243,0.001235,0.043651,0.001127,-0.006130,0.002698,-0.010804,-0.000809,M


# Save Data

In [21]:
rearranged_data.to_csv(file_path + 'rearranged_data_rfe_rf.csv', index=False)