Preprocessing golub and prostate data

In [30]:
import pandas as pd
import numpy as np

In [10]:
def remove_collinear_features(df, threshold=0.9):
    """
    Remove highly collinear features - correlation above the given threshold.
    Returns the modified DataFrame.
    """
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]

    df.drop(columns=collinear_features, inplace=True)
    df = df.copy()  # Defragment DataFrame to improve performance

    print(f"Number of features after removing collinearity: {df.shape[1] - 1}")

    return df

# Prostate

In [8]:
df = pd.read_csv('/content/prostate_dataF.csv')

num_rows, num_cols = df.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")

df = df.rename(columns={'class_index': 'target'})

df.head()

Number of rows: 102
Number of columns: 6034


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V6025,V6026,V6027,V6028,V6029,V6030,V6031,V6032,V6033,target
0,-0.927178,-0.740039,-0.532016,-1.097891,-0.986673,0.018752,-0.863326,3.959997,-0.398219,-1.097891,...,-1.097891,-1.097891,-0.894733,0.85027,-1.097891,-0.302063,-0.767685,-0.452616,0.194001,0
1,-0.835899,-0.835899,-0.585647,-0.835899,-0.329768,-0.835899,-0.835899,4.037788,-0.835899,-0.835899,...,-0.835899,-0.835899,-0.835899,0.03809,-0.835899,-0.835899,-0.835899,-0.835899,0.0755,0
2,0.236073,0.252645,-1.154351,-0.372372,-0.3389,1.253467,0.506325,3.868961,0.201932,-1.154351,...,-1.154351,-1.154351,-0.372372,0.899781,-0.8672,0.360426,-1.154351,-1.154351,-1.154351,0
3,-0.748623,-0.439165,0.790953,-1.033876,0.241115,1.216007,0.106282,3.976272,0.081755,-1.121534,...,-1.121534,-1.121534,-0.812076,1.230378,-0.355497,0.030647,-0.27881,-1.121534,-1.121534,0
4,0.101239,-0.298285,-1.121519,-0.957714,0.342258,1.182937,0.029325,4.575871,-0.196466,-1.121519,...,-1.121519,-1.121519,-0.454932,0.891627,-0.593427,0.306316,-0.076496,-0.13448,-1.121519,0


In [11]:
df_new = remove_collinear_features(df)

Number of features after removing collinearity: 4870


In [12]:
df_new.head()
num_rows, num_cols = df.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")

Number of rows: 102
Number of columns: 4871


In [13]:
df_new.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V6022,V6024,V6026,V6027,V6029,V6030,V6031,V6032,V6033,target
0,-0.927178,-0.740039,-0.532016,-1.097891,-0.986673,0.018752,-0.863326,3.959997,-0.398219,-1.097891,...,-0.810672,-0.029602,-1.097891,-0.894733,-1.097891,-0.302063,-0.767685,-0.452616,0.194001,0
1,-0.835899,-0.835899,-0.585647,-0.835899,-0.329768,-0.835899,-0.835899,4.037788,-0.835899,-0.835899,...,-0.835899,-0.835899,-0.835899,-0.835899,-0.835899,-0.835899,-0.835899,-0.835899,0.0755,0
2,0.236073,0.252645,-1.154351,-0.372372,-0.3389,1.253467,0.506325,3.868961,0.201932,-1.154351,...,0.388876,0.860446,-1.154351,-0.372372,-0.8672,0.360426,-1.154351,-1.154351,-1.154351,0
3,-0.748623,-0.439165,0.790953,-1.033876,0.241115,1.216007,0.106282,3.976272,0.081755,-1.121534,...,-0.484038,0.614026,-1.121534,-0.812076,-0.355497,0.030647,-0.27881,-1.121534,-1.121534,0
4,0.101239,-0.298285,-1.121519,-0.957714,0.342258,1.182937,0.029325,4.575871,-0.196466,-1.121519,...,0.517731,0.639511,-1.121519,-0.454932,-0.593427,0.306316,-0.076496,-0.13448,-1.121519,0


In [16]:
df_new.to_csv('prostate_data_no_colinear1.csv', index=False)

# Golub

In [22]:
df = pd.read_csv('/content/golub_data_with_target2.csv')

num_rows, num_cols = df.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")

df.head()

Number of rows: 38
Number of columns: 3052


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V3043,V3044,V3045,V3046,V3047,V3048,V3049,V3050,V3051,target
0,-1.45769,-0.75161,0.45695,3.13533,2.76569,2.64342,3.16885,2.8886,3.22372,3.22372,...,-0.69342,-0.30476,-0.21661,1.08935,0.04695,-0.20467,0.45231,-0.3592,-0.86079,ALL
1,-1.3942,-1.26278,-0.09654,0.21415,-1.27045,1.01416,3.09954,2.95355,3.09954,3.09954,...,-0.80743,-0.72056,-0.65287,0.22701,0.48704,-0.07832,0.42686,-0.43633,-1.3942,ALL
2,-1.42779,-0.09052,0.90325,2.08754,1.60433,1.70477,2.99977,2.99977,2.99977,2.99977,...,-0.51414,-0.11296,0.27332,0.31272,0.7217,-1.00615,0.67579,0.34031,-0.73766,ALL
3,-1.40715,-0.99596,-0.07194,2.23467,1.53182,1.63845,3.28898,3.03972,3.34097,3.35455,...,-1.17554,-0.25346,-0.35475,0.4745,0.58403,-0.88748,0.31524,-0.9093,-1.19031,ALL
4,-1.42668,-1.24245,0.03232,0.93811,1.63728,-0.36075,3.19368,3.21721,3.27515,3.27515,...,-1.42668,-0.99706,-0.89248,0.27257,0.306,0.07175,-0.57779,-0.36663,-1.42668,ALL


In [24]:
df = pd.read_csv('/content/golub_data_with_target2.csv')
df['target'] = df['target'].replace({'ALL': 0, 'AML': 1})
df.head()

  df['target'] = df['target'].replace({'ALL': 0, 'AML': 1})


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V3043,V3044,V3045,V3046,V3047,V3048,V3049,V3050,V3051,target
0,-1.45769,-0.75161,0.45695,3.13533,2.76569,2.64342,3.16885,2.8886,3.22372,3.22372,...,-0.69342,-0.30476,-0.21661,1.08935,0.04695,-0.20467,0.45231,-0.3592,-0.86079,0
1,-1.3942,-1.26278,-0.09654,0.21415,-1.27045,1.01416,3.09954,2.95355,3.09954,3.09954,...,-0.80743,-0.72056,-0.65287,0.22701,0.48704,-0.07832,0.42686,-0.43633,-1.3942,0
2,-1.42779,-0.09052,0.90325,2.08754,1.60433,1.70477,2.99977,2.99977,2.99977,2.99977,...,-0.51414,-0.11296,0.27332,0.31272,0.7217,-1.00615,0.67579,0.34031,-0.73766,0
3,-1.40715,-0.99596,-0.07194,2.23467,1.53182,1.63845,3.28898,3.03972,3.34097,3.35455,...,-1.17554,-0.25346,-0.35475,0.4745,0.58403,-0.88748,0.31524,-0.9093,-1.19031,0
4,-1.42668,-1.24245,0.03232,0.93811,1.63728,-0.36075,3.19368,3.21721,3.27515,3.27515,...,-1.42668,-0.99706,-0.89248,0.27257,0.306,0.07175,-0.57779,-0.36663,-1.42668,0


In [25]:
df_new = remove_collinear_features(df)

Number of features after removing collinearity: 2982


In [26]:
df.head()

Unnamed: 0,V1,V2,V3,V4,V7,V11,V12,V13,V15,V16,...,V3042,V3043,V3044,V3045,V3046,V3047,V3048,V3049,V3050,target
0,-1.45769,-0.75161,0.45695,3.13533,3.16885,-0.56223,-1.45769,-0.45048,2.40116,0.80633,...,-0.31273,-0.69342,-0.30476,-0.21661,1.08935,0.04695,-0.20467,0.45231,-0.3592,0
1,-1.3942,-1.26278,-0.09654,0.21415,3.09954,0.05358,-1.01765,-0.46687,1.83222,0.26994,...,-0.45147,-0.80743,-0.72056,-0.65287,0.22701,0.48704,-0.07832,0.42686,-0.43633,0
2,-1.42779,-0.09052,0.90325,2.08754,2.99977,0.12612,-1.46227,-0.91195,1.62478,0.49549,...,-0.76907,-0.51414,-0.11296,0.27332,0.31272,0.7217,-1.00615,0.67579,0.34031,0
3,-1.40715,-0.99596,-0.07194,2.23467,3.28898,-0.84016,-1.40715,-0.51066,1.59089,0.15222,...,-0.03863,-1.17554,-0.25346,-0.35475,0.4745,0.58403,-0.88748,0.31524,-0.9093,0
4,-1.42668,-1.24245,0.03232,0.93811,3.19368,-0.4371,-1.42668,-0.44992,0.757,-0.03737,...,-0.99706,-1.42668,-0.99706,-0.89248,0.27257,0.306,0.07175,-0.57779,-0.36663,0


In [27]:
df.to_csv('golub_data_no_colinear.csv', index=False)