# Import Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import chi2_contingency

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/Breast Cancer Classification/Chi-PCA-Rfe/'
data = pd.read_csv(file_path + 'wdbc.csv')
data.columns

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave_points_worst',
       'symmetry_worst', 'fractal_dimenstion_worst', 'Class'],
      dtype='object')

In [4]:
data.shape[1]

31

# Feature Selection and Extraction

## Chi-squared

In [5]:
# Separate the features and class label
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Normalize the features using MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Check for NaN values in the dataset
# print(np.isnan(X_normalized).sum())
# print(y.isnull().sum())

# Convert the normalized features back to a dataframe
X_normalized = pd.DataFrame(X_normalized, columns=X.columns)
X = X_normalized

# Compute the chi-square scores and p-values for each feature
scores, p_values = [], []
for i in range(X.shape[1]):
    obs = pd.crosstab(X.iloc[:, i], y)
    score, p_value, _, _ = chi2_contingency(obs)
    scores.append(score)
    p_values.append(p_value)


# Rank the features based on their chi-square scores
ranked_features = pd.DataFrame({'Feature': X.columns, 'Score': scores})
sorted_ranked_features = ranked_features.sort_values('Score', ascending=False).reset_index(drop=True)


sorted_ranked_features

Unnamed: 0,Feature,Score
0,concave_points_mean,562.5833
1,smoothness_se,558.305501
2,concavity_mean,558.305501
3,area_worst,558.305501
4,radius_se,557.592534
5,perimeter_se,556.166601
6,concavity_worst,555.453634
7,area_mean,554.027701
8,concavity_se,554.027701
9,perimeter_mean,553.314734


In [6]:
# Save the feature ranks to a CSV file
sorted_ranked_features.to_csv('feature_ranks_chi_square.csv', index=False)

In [7]:
# Showing the dataset
df = pd.read_csv('/content/feature_ranks_chi_square.csv')
df

Unnamed: 0,Feature,Score
0,concave_points_mean,562.5833
1,smoothness_se,558.305501
2,concavity_mean,558.305501
3,area_worst,558.305501
4,radius_se,557.592534
5,perimeter_se,556.166601
6,concavity_worst,555.453634
7,area_mean,554.027701
8,concavity_se,554.027701
9,perimeter_mean,553.314734


In [8]:
# Rearrange the dataset based on the feature ranks
rearranged_data = data[sorted_ranked_features['Feature'].tolist() + ['Class']]

rearranged_data

Unnamed: 0,concave_points_mean,smoothness_se,concavity_mean,area_worst,radius_se,perimeter_se,concavity_worst,area_mean,concavity_se,perimeter_mean,...,radius_mean,concave_points_se,symmetry_worst,texture_mean,fractal_dimension_mean,symmetry_se,smoothness_mean,symmetry_mean,smoothness_worst,Class
0,0.14710,0.006399,0.30010,2019.0,1.0950,8.589,0.7119,1001.0,0.05373,122.80,...,17.99,0.01587,0.4601,10.38,0.07871,0.03003,0.11840,0.2419,0.16220,M
1,0.07017,0.005225,0.08690,1956.0,0.5435,3.398,0.2416,1326.0,0.01860,132.90,...,20.57,0.01340,0.2750,17.77,0.05667,0.01389,0.08474,0.1812,0.12380,M
2,0.12790,0.006150,0.19740,1709.0,0.7456,4.585,0.4504,1203.0,0.03832,130.00,...,19.69,0.02058,0.3613,21.25,0.05999,0.02250,0.10960,0.2069,0.14440,M
3,0.10520,0.009110,0.24140,567.7,0.4956,3.445,0.6869,386.1,0.05661,77.58,...,11.42,0.01867,0.6638,20.38,0.09744,0.05963,0.14250,0.2597,0.20980,M
4,0.10430,0.011490,0.19800,1575.0,0.7572,5.438,0.4000,1297.0,0.05688,135.10,...,20.29,0.01885,0.2364,14.34,0.05883,0.01756,0.10030,0.1809,0.13740,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0.13890,0.010300,0.24390,2027.0,1.1760,7.673,0.4107,1479.0,0.05198,142.00,...,21.56,0.02454,0.2060,22.39,0.05623,0.01114,0.11100,0.1726,0.14100,M
565,0.09791,0.005769,0.14400,1731.0,0.7655,5.203,0.3215,1261.0,0.03950,131.20,...,20.13,0.01678,0.2572,28.25,0.05533,0.01898,0.09780,0.1752,0.11660,M
566,0.05302,0.005903,0.09251,1124.0,0.4564,3.425,0.3403,858.1,0.04730,108.30,...,16.60,0.01557,0.2218,28.08,0.05648,0.01318,0.08455,0.1590,0.11390,M
567,0.15200,0.006522,0.35140,1821.0,0.7260,5.772,0.9387,1265.0,0.07117,140.10,...,20.60,0.01664,0.4087,29.33,0.07016,0.02324,0.11780,0.2397,0.16500,M


In [9]:
# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_chi_square.csv', index=False)

## PCA

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [11]:
# Load the dataset
data_PCA = pd.read_csv('rearranged_data_chi_square.csv')
data_PCA

Unnamed: 0,concave_points_mean,smoothness_se,concavity_mean,area_worst,radius_se,perimeter_se,concavity_worst,area_mean,concavity_se,perimeter_mean,...,radius_mean,concave_points_se,symmetry_worst,texture_mean,fractal_dimension_mean,symmetry_se,smoothness_mean,symmetry_mean,smoothness_worst,Class
0,0.14710,0.006399,0.30010,2019.0,1.0950,8.589,0.7119,1001.0,0.05373,122.80,...,17.99,0.01587,0.4601,10.38,0.07871,0.03003,0.11840,0.2419,0.16220,M
1,0.07017,0.005225,0.08690,1956.0,0.5435,3.398,0.2416,1326.0,0.01860,132.90,...,20.57,0.01340,0.2750,17.77,0.05667,0.01389,0.08474,0.1812,0.12380,M
2,0.12790,0.006150,0.19740,1709.0,0.7456,4.585,0.4504,1203.0,0.03832,130.00,...,19.69,0.02058,0.3613,21.25,0.05999,0.02250,0.10960,0.2069,0.14440,M
3,0.10520,0.009110,0.24140,567.7,0.4956,3.445,0.6869,386.1,0.05661,77.58,...,11.42,0.01867,0.6638,20.38,0.09744,0.05963,0.14250,0.2597,0.20980,M
4,0.10430,0.011490,0.19800,1575.0,0.7572,5.438,0.4000,1297.0,0.05688,135.10,...,20.29,0.01885,0.2364,14.34,0.05883,0.01756,0.10030,0.1809,0.13740,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0.13890,0.010300,0.24390,2027.0,1.1760,7.673,0.4107,1479.0,0.05198,142.00,...,21.56,0.02454,0.2060,22.39,0.05623,0.01114,0.11100,0.1726,0.14100,M
565,0.09791,0.005769,0.14400,1731.0,0.7655,5.203,0.3215,1261.0,0.03950,131.20,...,20.13,0.01678,0.2572,28.25,0.05533,0.01898,0.09780,0.1752,0.11660,M
566,0.05302,0.005903,0.09251,1124.0,0.4564,3.425,0.3403,858.1,0.04730,108.30,...,16.60,0.01557,0.2218,28.08,0.05648,0.01318,0.08455,0.1590,0.11390,M
567,0.15200,0.006522,0.35140,1821.0,0.7260,5.772,0.9387,1265.0,0.07117,140.10,...,20.60,0.01664,0.4087,29.33,0.07016,0.02324,0.11780,0.2397,0.16500,M


In [12]:
# Separate the features and class label
X = data_PCA.iloc[:, :-1]
y = data_PCA.iloc[:, -1]

# Normalize the features using StandardScaler
scaler = StandardScaler()

X_normalized = scaler.fit_transform(X)
X_normalized = X
X_normalized = X_normalized.fillna(0)

In [13]:
# Apply PCA to the normalized features
pca = PCA()
pca.fit(X_normalized)

In [14]:
# Calculate the variance of all principal components
variance = pca.explained_variance_ratio_

# Rank the principal components based on their variance
sorted_indices = sorted(range(len(variance)), key=lambda i: variance[i], reverse=True)

# Save the variance values of all principal components to a CSV file
variance_df = pd.DataFrame({'Principal Component': range(1, len(variance)+1), 'Variance': variance[sorted_indices]})
variance_df.to_csv('pca_variance.csv', index=False)

variance_df

Unnamed: 0,Principal Component,Variance
0,1,0.9820447
1,2,0.01617649
2,3,0.001557511
3,4,0.000120932
4,5,8.827245e-05
5,6,6.64884e-06
6,7,4.017137e-06
7,8,8.220172e-07
8,9,3.441353e-07
9,10,1.860187e-07


In [15]:
# Rearrange the dataset based on the variance of the principal components
X_transformed = pca.transform(X_normalized)[:, sorted_indices]
rearranged_data = pd.DataFrame(X_transformed, columns=['PCA ' + str(i+1) for i in range(X_transformed.shape[1])])
rearranged_data['Class'] = y

rearranged_data

Unnamed: 0,PCA 1,PCA 2,PCA 3,PCA 4,PCA 5,PCA 6,PCA 7,PCA 8,PCA 9,PCA 10,...,PCA 22,PCA 23,PCA 24,PCA 25,PCA 26,PCA 27,PCA 28,PCA 29,PCA 30,Class
0,1160.142574,-293.917544,48.578398,-8.711975,32.000486,1.265415,0.931337,0.148167,0.745463,0.589359,...,0.000241,0.002528,0.011560,0.005773,0.001377,-0.001982,0.001293,0.001989,0.000704,M
1,1269.122443,15.630182,-35.394534,17.861283,-4.334874,-0.225872,-0.046037,0.200804,-0.485828,-0.084035,...,0.021069,0.001565,0.006968,-0.006978,0.001411,-0.000083,-0.001347,0.000686,-0.001061,M
2,995.793889,39.156743,-1.709753,4.199340,-0.466529,-2.652811,-0.779745,-0.274026,-0.173874,-0.186994,...,-0.002394,-0.004125,-0.004007,0.000709,-0.003781,0.000178,0.000018,-0.000775,0.000405,M
3,-407.180803,-67.380320,8.672848,-11.759867,7.115461,1.299436,-1.267304,-0.060555,-0.330639,-0.144155,...,0.007063,0.001537,0.007003,-0.010261,-0.002899,0.000016,0.001369,-0.002139,-0.001657,M
4,930.341180,189.340742,1.374801,8.499183,7.613289,1.021160,-0.335522,0.289109,0.036087,-0.138502,...,0.010269,0.002204,0.002764,0.002455,0.001665,0.003290,0.000273,0.001783,0.000327,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1414.126684,110.222492,40.065944,6.562240,-5.102856,-0.395424,-0.786751,0.037082,-0.452530,-0.235185,...,0.007864,-0.002317,-0.002384,-0.003637,-0.008211,0.002418,0.001234,-0.000078,-0.000455,M
565,1045.018854,77.057589,0.036669,-4.753245,-12.417863,-0.059637,0.449831,0.509154,-0.449986,0.493247,...,-0.001905,-0.003028,-0.007931,0.002905,-0.002519,0.000212,0.001006,-0.000621,-0.000741,M
566,314.501756,47.553525,-10.442407,-9.771881,-6.156213,-0.870726,-2.166493,-0.442279,-0.097398,-0.144667,...,-0.002249,-0.001248,-0.003927,-0.000921,0.000573,-0.001325,0.000025,0.000484,-0.000285,M
567,1124.858115,34.129225,-19.742087,-23.660881,3.565133,4.086390,-1.705401,-0.359964,0.385030,0.615467,...,-0.010804,0.005841,0.001127,-0.002646,0.001862,0.002698,0.001235,-0.000809,0.001217,M


In [16]:
# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_pca.csv', index=False)

## RFE

In [17]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


In [18]:
# Load the dataset
data_RFE = pd.read_csv('rearranged_data_pca.csv')
data_RFE

Unnamed: 0,PCA 1,PCA 2,PCA 3,PCA 4,PCA 5,PCA 6,PCA 7,PCA 8,PCA 9,PCA 10,...,PCA 22,PCA 23,PCA 24,PCA 25,PCA 26,PCA 27,PCA 28,PCA 29,PCA 30,Class
0,1160.142574,-293.917544,48.578398,-8.711975,32.000486,1.265415,0.931337,0.148167,0.745463,0.589359,...,0.000241,0.002528,0.011560,0.005773,0.001377,-0.001982,0.001293,0.001989,0.000704,M
1,1269.122443,15.630182,-35.394534,17.861283,-4.334874,-0.225872,-0.046037,0.200804,-0.485828,-0.084035,...,0.021069,0.001565,0.006968,-0.006978,0.001411,-0.000083,-0.001347,0.000686,-0.001061,M
2,995.793889,39.156743,-1.709753,4.199340,-0.466529,-2.652811,-0.779745,-0.274026,-0.173874,-0.186994,...,-0.002394,-0.004125,-0.004007,0.000709,-0.003781,0.000178,0.000018,-0.000775,0.000405,M
3,-407.180803,-67.380320,8.672848,-11.759867,7.115461,1.299436,-1.267304,-0.060555,-0.330639,-0.144155,...,0.007063,0.001537,0.007003,-0.010261,-0.002899,0.000016,0.001369,-0.002139,-0.001657,M
4,930.341180,189.340742,1.374801,8.499183,7.613289,1.021160,-0.335522,0.289109,0.036087,-0.138502,...,0.010269,0.002204,0.002764,0.002455,0.001665,0.003290,0.000273,0.001783,0.000327,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1414.126684,110.222492,40.065944,6.562240,-5.102856,-0.395424,-0.786751,0.037082,-0.452530,-0.235185,...,0.007864,-0.002317,-0.002384,-0.003637,-0.008211,0.002418,0.001234,-0.000078,-0.000455,M
565,1045.018854,77.057589,0.036669,-4.753245,-12.417863,-0.059637,0.449831,0.509154,-0.449986,0.493247,...,-0.001905,-0.003028,-0.007931,0.002905,-0.002519,0.000212,0.001006,-0.000621,-0.000741,M
566,314.501756,47.553525,-10.442407,-9.771881,-6.156213,-0.870726,-2.166493,-0.442279,-0.097398,-0.144667,...,-0.002249,-0.001248,-0.003927,-0.000921,0.000573,-0.001325,0.000025,0.000484,-0.000285,M
567,1124.858115,34.129225,-19.742087,-23.660881,3.565133,4.086390,-1.705401,-0.359964,0.385030,0.615467,...,-0.010804,0.005841,0.001127,-0.002646,0.001862,0.002698,0.001235,-0.000809,0.001217,M


In [19]:
# Separate the features and class label
X = data_RFE.iloc[:, :-1]
y = data_RFE.iloc[:, -1]

# Normalize the features using MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Define the random forest classifier
rf = RandomForestClassifier(n_estimators=100)

# Apply RFE with cross-validation to rank the features
rfe = RFE(estimator=rf, n_features_to_select=1, step=1)
rfe.fit(X_normalized, y)

# Rank the features based on their importance scores
feature_ranks = X.columns[rfe.ranking_ - 1]

# Save the feature ranks to a CSV file
ranked_features_df = pd.DataFrame({'Feature': feature_ranks})
ranked_features_df.to_csv('feature_ranks_rfe_rf.csv', index=False)

ranked_features_df

Unnamed: 0,Feature
0,PCA 1
1,PCA 3
2,PCA 4
3,PCA 2
4,PCA 8
5,PCA 10
6,PCA 11
7,PCA 12
8,PCA 6
9,PCA 21


In [20]:
# Rearrange the dataset based on the feature ranks
rearranged_data = data_RFE[feature_ranks.tolist() + ['Class']]

# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_rfe_rf.csv', index=False)

In [21]:
rearranged_data

Unnamed: 0,PCA 1,PCA 3,PCA 4,PCA 2,PCA 8,PCA 10,PCA 11,PCA 12,PCA 6,PCA 21,...,PCA 24,PCA 15,PCA 25,PCA 18,PCA 30,PCA 22,PCA 28,PCA 27,PCA 20,Class
0,1160.142574,48.578398,-8.711975,-293.917544,0.148167,0.589359,-0.307804,0.043452,1.265415,0.021189,...,0.011560,-0.012934,0.005773,0.010263,0.000704,0.000241,0.001293,-0.001982,-0.047383,M
1,1269.122443,-35.394534,17.861283,15.630182,0.200804,-0.084035,0.080642,0.033042,-0.225872,0.005237,...,0.006968,0.021368,-0.006978,-0.006009,-0.001061,0.021069,-0.001347,-0.000083,0.020823,M
2,995.793889,-1.709753,4.199340,39.156743,-0.274026,-0.186994,0.279174,-0.020464,-2.652811,-0.009865,...,-0.004007,-0.026887,0.000709,-0.028044,0.000405,-0.002394,0.000018,0.000178,-0.014681,M
3,-407.180803,8.672848,-11.759867,-67.380320,-0.060555,-0.144155,0.927471,-0.174720,1.299436,0.011169,...,0.007003,0.043201,-0.010261,-0.016965,-0.001657,0.007063,0.001369,0.000016,-0.036236,M
4,930.341180,1.374801,8.499183,189.340742,0.289109,-0.138502,0.042228,-0.062721,1.021160,-0.009916,...,0.002764,-0.019548,0.002455,0.004024,0.000327,0.010269,0.000273,0.003290,0.010930,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1414.126684,40.065944,6.562240,110.222492,0.037082,-0.235185,0.163649,0.052543,-0.395424,-0.017214,...,-0.002384,-0.061390,-0.003637,-0.011515,-0.000455,0.007864,0.001234,0.002418,-0.024062,M
565,1045.018854,0.036669,-4.753245,77.057589,0.509154,0.493247,0.007625,0.055832,-0.059637,0.011219,...,-0.007931,0.003312,0.002905,0.002106,-0.000741,-0.001905,0.001006,0.000212,-0.002071,M
566,314.501756,-10.442407,-9.771881,47.553525,-0.442279,-0.144667,-0.109147,0.076263,-0.870726,-0.003362,...,-0.003927,-0.012459,-0.000921,-0.004484,-0.000285,-0.002249,0.000025,-0.001325,0.013697,M
567,1124.858115,-19.742087,-23.660881,34.129225,-0.359964,0.615467,0.307166,-0.028224,4.086390,-0.006130,...,0.001127,-0.031873,-0.002646,0.043651,0.001217,-0.010804,0.001235,0.002698,-0.041891,M


# Save Data

In [22]:
rearranged_data.to_csv(file_path + 'rearranged_data_rfe_rf.csv', index=False)