# Import Data

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import kruskal

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/Breast Cancer Classification/Kruskal-PCA-Rfe/'
data = pd.read_csv(file_path + 'wdbc.csv')
data.columns

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave_points_worst',
       'symmetry_worst', 'fractal_dimenstion_worst', 'Class'],
      dtype='object')

In [4]:
data.shape[1]

31

# Feature Selection and Extraction

## Kruskal

In [5]:
# Separate the features and class label
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Normalize the features using MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)


# Calculate the Kruskal-Wallis H-test p-values for each feature
p_values = [kruskal(X_normalized[y == k, i], X_normalized[y != k, i]).pvalue for i in range(X_normalized.shape[1]) for k in set(y)]

# Calculate the average p-value for each feature
avg_p_values = [sum(p_values[i:i+len(set(y))]) / len(set(y)) for i in range(0, len(p_values), len(set(y)))]


# Rank the features based on their average p-values
sorted_indices = sorted(range(len(avg_p_values)), key=lambda i: avg_p_values[i])
feature_ranks = X.columns[sorted_indices]


feature_ranks

Index(['perimeter_worst', 'radius_worst', 'area_worst', 'concave_points_worst',
       'concave_points_mean', 'perimeter_mean', 'area_mean', 'concavity_mean',
       'radius_mean', 'area_se', 'concavity_worst', 'perimeter_se',
       'radius_se', 'compactness_mean', 'compactness_worst',
       'concave_points_se', 'texture_worst', 'concavity_se', 'texture_mean',
       'smoothness_worst', 'symmetry_worst', 'compactness_se',
       'smoothness_mean', 'symmetry_mean', 'fractal_dimenstion_worst',
       'fractal_dimension_se', 'symmetry_se', 'smoothness_se',
       'fractal_dimension_mean', 'texture_se'],
      dtype='object')

In [6]:
# Save the feature ranks to a CSV file
ranked_features_df = pd.DataFrame({'Feature': feature_ranks})
ranked_features_df.to_csv('feature_ranks_kruskal_wallis.csv', index=False)

# Rearrange the dataset based on the feature ranks
rearranged_data = data[feature_ranks.tolist() + ['Class']]

# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_kruskal_wallis.csv', index=False)

In [7]:
ranked_features_df

Unnamed: 0,Feature
0,perimeter_worst
1,radius_worst
2,area_worst
3,concave_points_worst
4,concave_points_mean
5,perimeter_mean
6,area_mean
7,concavity_mean
8,radius_mean
9,area_se


In [8]:
# Showing the dataset
df = pd.read_csv('/content/rearranged_data_kruskal_wallis.csv')
df

Unnamed: 0,perimeter_worst,radius_worst,area_worst,concave_points_worst,concave_points_mean,perimeter_mean,area_mean,concavity_mean,radius_mean,area_se,...,compactness_se,smoothness_mean,symmetry_mean,fractal_dimenstion_worst,fractal_dimension_se,symmetry_se,smoothness_se,fractal_dimension_mean,texture_se,Class
0,184.60,25.380,2019.0,0.2654,0.14710,122.80,1001.0,0.30010,17.99,153.40,...,0.04904,0.11840,0.2419,0.11890,0.006193,0.03003,0.006399,0.07871,0.9053,M
1,158.80,24.990,1956.0,0.1860,0.07017,132.90,1326.0,0.08690,20.57,74.08,...,0.01308,0.08474,0.1812,0.08902,0.003532,0.01389,0.005225,0.05667,0.7339,M
2,152.50,23.570,1709.0,0.2430,0.12790,130.00,1203.0,0.19740,19.69,94.03,...,0.04006,0.10960,0.2069,0.08758,0.004571,0.02250,0.006150,0.05999,0.7869,M
3,98.87,14.910,567.7,0.2575,0.10520,77.58,386.1,0.24140,11.42,27.23,...,0.07458,0.14250,0.2597,0.17300,0.009208,0.05963,0.009110,0.09744,1.1560,M
4,152.20,22.540,1575.0,0.1625,0.10430,135.10,1297.0,0.19800,20.29,94.44,...,0.02461,0.10030,0.1809,0.07678,0.005115,0.01756,0.011490,0.05883,0.7813,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,166.10,25.450,2027.0,0.2216,0.13890,142.00,1479.0,0.24390,21.56,158.70,...,0.02891,0.11100,0.1726,0.07115,0.004239,0.01114,0.010300,0.05623,1.2560,M
565,155.00,23.690,1731.0,0.1628,0.09791,131.20,1261.0,0.14400,20.13,99.04,...,0.02423,0.09780,0.1752,0.06637,0.002498,0.01898,0.005769,0.05533,2.4630,M
566,126.70,18.980,1124.0,0.1418,0.05302,108.30,858.1,0.09251,16.60,48.55,...,0.03731,0.08455,0.1590,0.07820,0.003892,0.01318,0.005903,0.05648,1.0750,M
567,184.60,25.740,1821.0,0.2650,0.15200,140.10,1265.0,0.35140,20.60,86.22,...,0.06158,0.11780,0.2397,0.12400,0.006185,0.02324,0.006522,0.07016,1.5950,M


In [9]:
# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_kruskal_wallis.csv', index=False)

## PCA

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [11]:
# Load the dataset
data_PCA = pd.read_csv('rearranged_data_kruskal_wallis.csv')
data_PCA

Unnamed: 0,perimeter_worst,radius_worst,area_worst,concave_points_worst,concave_points_mean,perimeter_mean,area_mean,concavity_mean,radius_mean,area_se,...,compactness_se,smoothness_mean,symmetry_mean,fractal_dimenstion_worst,fractal_dimension_se,symmetry_se,smoothness_se,fractal_dimension_mean,texture_se,Class
0,184.60,25.380,2019.0,0.2654,0.14710,122.80,1001.0,0.30010,17.99,153.40,...,0.04904,0.11840,0.2419,0.11890,0.006193,0.03003,0.006399,0.07871,0.9053,M
1,158.80,24.990,1956.0,0.1860,0.07017,132.90,1326.0,0.08690,20.57,74.08,...,0.01308,0.08474,0.1812,0.08902,0.003532,0.01389,0.005225,0.05667,0.7339,M
2,152.50,23.570,1709.0,0.2430,0.12790,130.00,1203.0,0.19740,19.69,94.03,...,0.04006,0.10960,0.2069,0.08758,0.004571,0.02250,0.006150,0.05999,0.7869,M
3,98.87,14.910,567.7,0.2575,0.10520,77.58,386.1,0.24140,11.42,27.23,...,0.07458,0.14250,0.2597,0.17300,0.009208,0.05963,0.009110,0.09744,1.1560,M
4,152.20,22.540,1575.0,0.1625,0.10430,135.10,1297.0,0.19800,20.29,94.44,...,0.02461,0.10030,0.1809,0.07678,0.005115,0.01756,0.011490,0.05883,0.7813,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,166.10,25.450,2027.0,0.2216,0.13890,142.00,1479.0,0.24390,21.56,158.70,...,0.02891,0.11100,0.1726,0.07115,0.004239,0.01114,0.010300,0.05623,1.2560,M
565,155.00,23.690,1731.0,0.1628,0.09791,131.20,1261.0,0.14400,20.13,99.04,...,0.02423,0.09780,0.1752,0.06637,0.002498,0.01898,0.005769,0.05533,2.4630,M
566,126.70,18.980,1124.0,0.1418,0.05302,108.30,858.1,0.09251,16.60,48.55,...,0.03731,0.08455,0.1590,0.07820,0.003892,0.01318,0.005903,0.05648,1.0750,M
567,184.60,25.740,1821.0,0.2650,0.15200,140.10,1265.0,0.35140,20.60,86.22,...,0.06158,0.11780,0.2397,0.12400,0.006185,0.02324,0.006522,0.07016,1.5950,M


In [12]:
# Separate the features and class label
X = data_PCA.iloc[:, :-1]
y = data_PCA.iloc[:, -1]

# Normalize the features using StandardScaler
scaler = StandardScaler()

X_normalized = scaler.fit_transform(X)
X_normalized = X
X_normalized = X_normalized.fillna(0)

In [13]:
# Apply PCA to the normalized features
pca = PCA()
pca.fit(X_normalized)

In [14]:
# Calculate the variance of all principal components
variance = pca.explained_variance_ratio_

# Rank the principal components based on their variance
sorted_indices = sorted(range(len(variance)), key=lambda i: variance[i], reverse=True)

# Save the variance values of all principal components to a CSV file
variance_df = pd.DataFrame({'Principal Component': range(1, len(variance)+1), 'Variance': variance[sorted_indices]})
variance_df.to_csv('pca_variance.csv', index=False)

variance_df

Unnamed: 0,Principal Component,Variance
0,1,0.9820447
1,2,0.01617649
2,3,0.001557511
3,4,0.000120932
4,5,8.827245e-05
5,6,6.64884e-06
6,7,4.017137e-06
7,8,8.220172e-07
8,9,3.441353e-07
9,10,1.860187e-07


In [15]:
# Rearrange the dataset based on the variance of the principal components
X_transformed = pca.transform(X_normalized)[:, sorted_indices]
rearranged_data = pd.DataFrame(X_transformed, columns=['PCA ' + str(i+1) for i in range(X_transformed.shape[1])])
rearranged_data['Class'] = y

rearranged_data

Unnamed: 0,PCA 1,PCA 2,PCA 3,PCA 4,PCA 5,PCA 6,PCA 7,PCA 8,PCA 9,PCA 10,...,PCA 22,PCA 23,PCA 24,PCA 25,PCA 26,PCA 27,PCA 28,PCA 29,PCA 30,Class
0,1160.142574,-293.917544,48.578398,-8.711975,32.000486,1.265415,0.931337,0.148167,0.745463,0.589359,...,0.000241,0.002528,0.011560,0.005773,0.001377,-0.001982,0.001293,0.001989,0.000704,M
1,1269.122443,15.630182,-35.394534,17.861283,-4.334874,-0.225872,-0.046037,0.200804,-0.485828,-0.084035,...,0.021069,0.001565,0.006968,-0.006978,0.001411,-0.000083,-0.001347,0.000686,-0.001061,M
2,995.793889,39.156743,-1.709753,4.199340,-0.466529,-2.652811,-0.779745,-0.274026,-0.173874,-0.186994,...,-0.002394,-0.004125,-0.004007,0.000709,-0.003781,0.000178,0.000018,-0.000775,0.000405,M
3,-407.180803,-67.380320,8.672848,-11.759867,7.115461,1.299436,-1.267304,-0.060555,-0.330639,-0.144155,...,0.007063,0.001537,0.007003,-0.010261,-0.002899,0.000016,0.001369,-0.002139,-0.001657,M
4,930.341180,189.340742,1.374801,8.499183,7.613289,1.021160,-0.335522,0.289109,0.036087,-0.138502,...,0.010269,0.002204,0.002764,0.002455,0.001665,0.003290,0.000273,0.001783,0.000327,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1414.126684,110.222492,40.065944,6.562240,-5.102856,-0.395424,-0.786751,0.037082,-0.452530,-0.235185,...,0.007864,-0.002317,-0.002384,-0.003637,-0.008211,0.002418,0.001234,-0.000078,-0.000455,M
565,1045.018854,77.057589,0.036669,-4.753245,-12.417863,-0.059637,0.449831,0.509154,-0.449986,0.493247,...,-0.001905,-0.003028,-0.007931,0.002905,-0.002519,0.000212,0.001006,-0.000621,-0.000741,M
566,314.501756,47.553525,-10.442407,-9.771881,-6.156213,-0.870726,-2.166493,-0.442279,-0.097398,-0.144667,...,-0.002249,-0.001248,-0.003927,-0.000921,0.000573,-0.001325,0.000025,0.000484,-0.000285,M
567,1124.858115,34.129225,-19.742087,-23.660881,3.565133,4.086390,-1.705401,-0.359964,0.385030,0.615467,...,-0.010804,0.005841,0.001127,-0.002646,0.001862,0.002698,0.001235,-0.000809,0.001217,M


In [16]:
# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_pca.csv', index=False)

## RFE

In [17]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


In [18]:
# Load the dataset
data_RFE = pd.read_csv('rearranged_data_pca.csv')
data_RFE

Unnamed: 0,PCA 1,PCA 2,PCA 3,PCA 4,PCA 5,PCA 6,PCA 7,PCA 8,PCA 9,PCA 10,...,PCA 22,PCA 23,PCA 24,PCA 25,PCA 26,PCA 27,PCA 28,PCA 29,PCA 30,Class
0,1160.142574,-293.917544,48.578398,-8.711975,32.000486,1.265415,0.931337,0.148167,0.745463,0.589359,...,0.000241,0.002528,0.011560,0.005773,0.001377,-0.001982,0.001293,0.001989,0.000704,M
1,1269.122443,15.630182,-35.394534,17.861283,-4.334874,-0.225872,-0.046037,0.200804,-0.485828,-0.084035,...,0.021069,0.001565,0.006968,-0.006978,0.001411,-0.000083,-0.001347,0.000686,-0.001061,M
2,995.793889,39.156743,-1.709753,4.199340,-0.466529,-2.652811,-0.779745,-0.274026,-0.173874,-0.186994,...,-0.002394,-0.004125,-0.004007,0.000709,-0.003781,0.000178,0.000018,-0.000775,0.000405,M
3,-407.180803,-67.380320,8.672848,-11.759867,7.115461,1.299436,-1.267304,-0.060555,-0.330639,-0.144155,...,0.007063,0.001537,0.007003,-0.010261,-0.002899,0.000016,0.001369,-0.002139,-0.001657,M
4,930.341180,189.340742,1.374801,8.499183,7.613289,1.021160,-0.335522,0.289109,0.036087,-0.138502,...,0.010269,0.002204,0.002764,0.002455,0.001665,0.003290,0.000273,0.001783,0.000327,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1414.126684,110.222492,40.065944,6.562240,-5.102856,-0.395424,-0.786751,0.037082,-0.452530,-0.235185,...,0.007864,-0.002317,-0.002384,-0.003637,-0.008211,0.002418,0.001234,-0.000078,-0.000455,M
565,1045.018854,77.057589,0.036669,-4.753245,-12.417863,-0.059637,0.449831,0.509154,-0.449986,0.493247,...,-0.001905,-0.003028,-0.007931,0.002905,-0.002519,0.000212,0.001006,-0.000621,-0.000741,M
566,314.501756,47.553525,-10.442407,-9.771881,-6.156213,-0.870726,-2.166493,-0.442279,-0.097398,-0.144667,...,-0.002249,-0.001248,-0.003927,-0.000921,0.000573,-0.001325,0.000025,0.000484,-0.000285,M
567,1124.858115,34.129225,-19.742087,-23.660881,3.565133,4.086390,-1.705401,-0.359964,0.385030,0.615467,...,-0.010804,0.005841,0.001127,-0.002646,0.001862,0.002698,0.001235,-0.000809,0.001217,M


In [19]:
# Separate the features and class label
X = data_RFE.iloc[:, :-1]
y = data_RFE.iloc[:, -1]

# Normalize the features using MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Define the random forest classifier
rf = RandomForestClassifier(n_estimators=100)

# Apply RFE with cross-validation to rank the features
rfe = RFE(estimator=rf, n_features_to_select=1, step=1)
rfe.fit(X_normalized, y)

# Rank the features based on their importance scores
feature_ranks = X.columns[rfe.ranking_ - 1]

# Save the feature ranks to a CSV file
ranked_features_df = pd.DataFrame({'Feature': feature_ranks})
ranked_features_df.to_csv('feature_ranks_rfe_rf.csv', index=False)

ranked_features_df

Unnamed: 0,Feature
0,PCA 1
1,PCA 3
2,PCA 4
3,PCA 2
4,PCA 8
5,PCA 9
6,PCA 11
7,PCA 12
8,PCA 6
9,PCA 21


In [20]:
# Rearrange the dataset based on the feature ranks
rearranged_data = data_RFE[feature_ranks.tolist() + ['Class']]

# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_rfe_rf.csv', index=False)

In [21]:
rearranged_data

Unnamed: 0,PCA 1,PCA 3,PCA 4,PCA 2,PCA 8,PCA 9,PCA 11,PCA 12,PCA 6,PCA 21,...,PCA 28,PCA 19,PCA 25,PCA 22,PCA 29,PCA 20,PCA 27,PCA 23,PCA 24,Class
0,1160.142574,48.578398,-8.711975,-293.917544,0.148167,0.745463,-0.307804,0.043452,1.265415,0.021189,...,0.001293,-0.009363,0.005773,0.000241,0.001989,-0.047383,-0.001982,0.002528,0.011560,M
1,1269.122443,-35.394534,17.861283,15.630182,0.200804,-0.485828,0.080642,0.033042,-0.225872,0.005237,...,-0.001347,0.016707,-0.006978,0.021069,0.000686,0.020823,-0.000083,0.001565,0.006968,M
2,995.793889,-1.709753,4.199340,39.156743,-0.274026,-0.173874,0.279174,-0.020464,-2.652811,-0.009865,...,0.000018,0.004857,0.000709,-0.002394,-0.000775,-0.014681,0.000178,-0.004125,-0.004007,M
3,-407.180803,8.672848,-11.759867,-67.380320,-0.060555,-0.330639,0.927471,-0.174720,1.299436,0.011169,...,0.001369,-0.046247,-0.010261,0.007063,-0.002139,-0.036236,0.000016,0.001537,0.007003,M
4,930.341180,1.374801,8.499183,189.340742,0.289109,0.036087,0.042228,-0.062721,1.021160,-0.009916,...,0.000273,-0.004134,0.002455,0.010269,0.001783,0.010930,0.003290,0.002204,0.002764,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1414.126684,40.065944,6.562240,110.222492,0.037082,-0.452530,0.163649,0.052543,-0.395424,-0.017214,...,0.001234,0.003392,-0.003637,0.007864,-0.000078,-0.024062,0.002418,-0.002317,-0.002384,M
565,1045.018854,0.036669,-4.753245,77.057589,0.509154,-0.449986,0.007625,0.055832,-0.059637,0.011219,...,0.001006,0.003541,0.002905,-0.001905,-0.000621,-0.002071,0.000212,-0.003028,-0.007931,M
566,314.501756,-10.442407,-9.771881,47.553525,-0.442279,-0.097398,-0.109147,0.076263,-0.870726,-0.003362,...,0.000025,0.007275,-0.000921,-0.002249,0.000484,0.013697,-0.001325,-0.001248,-0.003927,M
567,1124.858115,-19.742087,-23.660881,34.129225,-0.359964,0.385030,0.307166,-0.028224,4.086390,-0.006130,...,0.001235,0.012420,-0.002646,-0.010804,-0.000809,-0.041891,0.002698,0.005841,0.001127,M


# Save Data

In [22]:
rearranged_data.to_csv(file_path + 'rearranged_data_rfe_rf.csv', index=False)