## Feature Selection

In [4]:
import pandas as pd

# Load the merged dataset. low_memory=False makes pandas parse each column in
# a single pass rather than in chunks, which avoids chunk-wise dtype guessing.
dataFrame: pd.DataFrame = pd.read_csv('./DatasetParser/Dataset/ProcessedFiles/merged_data.csv', low_memory=False)

# Keep one row per case (the first occurrence of every case_id).
dataFrame = dataFrame.drop_duplicates(subset=['case_id'])

# Gene columns: every column whose name contains 'unstranded'
# (the substring test also matches the 'tpm_unstranded' columns).
geneColumns = [col for col in dataFrame.columns if 'unstranded' in col]

# Columns this notebook works with: the case identifier, the two histology
# columns and all gene columns.
selectedColumns = ['case_id', 'histological_type', 'icd_o_3_histology'] + geneColumns

# Restrict the frame to those columns ...
dataFrame = dataFrame[selectedColumns]

# ... and discard every row that has a NaN in any of them.
dataFrame = dataFrame.dropna()

# Identify columns whose values are of more than one Python type.
# Only object-dtype columns can hold heterogeneous values, so the (slow,
# Python-level) per-value type scan is limited to them.
object_columns = dataFrame.select_dtypes(include='object').columns
mixed_type_columns = [
    col for col in object_columns
    if dataFrame[col].map(type).nunique() > 1
]

# Drop the mixed-type columns (reassigning instead of mutating in place).
dataFrame = dataFrame.drop(columns=mixed_type_columns)

# Keep geneColumns in sync with the frame: a gene column removed above must
# not stay in the list, otherwise dataFrame[geneColumns] raises a KeyError.
dropped_columns = set(mixed_type_columns)
geneColumns = [col for col in geneColumns if col not in dropped_columns]


In [5]:
from sklearn.feature_selection import VarianceThreshold

# Variance cut-off handed to VarianceThreshold; gene columns whose variance
# does not exceed it are removed.
VARIANCE_THRESHOLD = 10  # TODO find logical value

# Only work with gene columns that are still present in dataFrame, so a column
# removed by an earlier cleaning step cannot trigger a KeyError below.
geneColumns = [col for col in geneColumns if col in dataFrame.columns]

# Step 1: Select only the feature columns (rows with NaN were dropped earlier)
threshHoldDataFrame: pd.DataFrame = dataFrame[geneColumns].copy()

# Step 2: Apply VarianceThreshold
selector = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
X_selected = selector.fit_transform(threshHoldDataFrame)

# Step 3: Get selected column names.
# Boolean indexing on .columns yields a pandas Index (not a list); it is
# converted with .tolist() further down.
selectedColumns: pd.Index = threshHoldDataFrame.columns[selector.get_support()]

# Step 4: Create a new DataFrame with selected features, reusing the original
# row index so the join in step 5 lines the rows up again.
filterdDataFrame = pd.DataFrame(X_selected, columns=selectedColumns, index=dataFrame.index)

# Step 5: Drop original gene columns and add the reduced set
dataFrame = dataFrame.drop(columns=geneColumns).join(filterdDataFrame)

# Updating the geneColumns to reflect the reduced set
geneColumns = selectedColumns.tolist()

print(dataFrame)


                                   case_id  \
0     b40d0849-f4ef-4a14-9732-9beb708cb46b   
1     0d66bf6c-eed0-4726-bd5b-3bf6d610b4e0   
2     193201a3-1447-47b1-bdf1-11ae0eb3b2f3   
3     21fb46f9-4bbb-441c-af19-a687e9138344   
5     4d51ee44-e6f4-4bcb-be28-e9df54b39a8d   
...                                    ...   
1157  ad98977b-e159-410a-b8c2-f4e8a07f9784   
1158  3f1b4356-0b53-48cd-8938-96fc786c9b63   
1159  a9d7adec-3849-40e2-a2a2-f43443ec43bb   
1160  8be10ce9-220d-4816-a575-8e8e7f041114   
1161  d49b0369-905c-4608-96a9-cc854980fc4c   

                                      histological_type icd_o_3_histology  \
0     Lung Adenocarcinoma- Not Otherwise Specified (...            8140/3   
1     Lung Adenocarcinoma- Not Otherwise Specified (...            8140/3   
2     Lung Adenocarcinoma- Not Otherwise Specified (...            8140/3   
3                          Mucinous (Colloid) Carcinoma            8480/3   
5     Lung Adenocarcinoma- Not Otherwise Specified (...       

In [6]:
import numpy as np

# Pairwise correlation matrix of the remaining gene columns
# (DataFrame.corr() with its default method).
correlationMatrix = dataFrame[geneColumns].corr()

threshold = 0.5

# A column is marked for removal when its absolute correlation with ANY column
# that appears before it exceeds the threshold. Only the strictly lower
# triangle (row i, column j < i) is inspected, so the first column of every
# correlated pair is never marked because of that pair.
# NaN correlations compare as False and therefore never mark a column.
# NOTE(review): the "earlier" column may itself be marked and dropped, so this
# greedy rule can remove more columns than strictly necessary.
lower_abs_corr = np.tril(correlationMatrix.abs().to_numpy(), k=-1)
is_redundant = (lower_abs_corr > threshold).any(axis=1)

# Marked columns as a list, in column order, so the printed result is
# identical on every run (iterating a set of strings is not order-stable).
selectedColumns = correlationMatrix.columns[is_redundant].tolist()
# Printing the selected columns that are highly correlated
print(f"Columns with correlation above {threshold}: {selectedColumns}")

# Dropping the selected columns from the original DataFrame
dataFrame = dataFrame.drop(columns=selectedColumns)

# Keep geneColumns in sync with the frame so later dataFrame[geneColumns]
# look-ups do not reference columns that were just removed.
removed_columns = set(selectedColumns)
geneColumns = [col for col in geneColumns if col not in removed_columns]

print(dataFrame)

Columns with correlation above 0.5: ['AL442067.1_unstranded', 'AC244197.2_unstranded', 'SNHG6_unstranded', 'AC117464.1_unstranded', 'AC148477.3_unstranded', 'AL133477.1_unstranded', 'AC253576.2_unstranded', 'AL031775.1_unstranded', 'AC024060.2_tpm_unstranded', 'AC025810.1_unstranded', 'AC009264.1_unstranded', 'MRGPRF-AS1_unstranded', 'AC105046.2_unstranded', 'AL592424.1_unstranded', 'VCAN-AS1_unstranded', 'AC009159.3_unstranded', 'EIF1AX-AS1_unstranded', 'ARHGEF38-IT1_unstranded', 'AC131956.2_unstranded', 'AC012236.1_tpm_unstranded', 'AC025430.1_unstranded', 'PP7080_unstranded', 'LINC00472_unstranded', 'AL391840.3_unstranded', 'AP000851.1_unstranded', 'AC012500.1_unstranded', 'AC008629.1_unstranded', 'AL731533.3_unstranded', 'AC012494.2_unstranded', 'BZW1-AS1_unstranded', 'RFX5-AS1_unstranded', 'AC087854.1_unstranded', 'AL031775.2_unstranded', 'AC015871.6_unstranded', 'MALAT1_tpm_unstranded', 'AL135818.3_unstranded', 'AC009271.2_unstranded', 'AP003170.3_unstranded', 'AC138305.1_unstran