In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [26]:
##################################

# BASIC PREPROCESSING

##################################

In [27]:
# data cleaning
def dataClean(df, set = 1):
  """Report missing values, interpolate them, and drop dataset-1 helper columns.

  Args:
    df: DataFrame to clean. The input is not modified; a new frame is returned.
    set: Dataset identifier. When it equals 1, the "id" and "Unnamed: 32"
      columns are removed if they are present. (The name shadows the builtin
      `set`, but is kept so existing keyword callers keep working.)

  Returns:
    The interpolated DataFrame, without the helper columns when `set == 1`.
  """
  numNANs = df.isna().sum()
  print("Number of NANs in data set", set, ":", numNANs)
  print("Preparing to interpolate NANs...")
  # limit_direction="both" also fills gaps at the very start/end of a column.
  df = df.interpolate(limit_direction = "both")
  print("Number of NANs now: ", df.isna().sum())

  if set == 1:
    # NOTE: the extra trailing column only exists in the first dataset ("data.csv").
    # errors="ignore" drops whichever of the two columns exist. The previous
    # bare `except: pass` silently kept BOTH columns if either one was missing,
    # and also hid any unrelated failure.
    df = df.drop(columns=["id", "Unnamed: 32"], errors="ignore")
    # other than this, there is no more data cleaning to do.

  return df

In [28]:
# First we isolate the classification column of the dataset ('diagnosis').
# Because 'diagnosis' uses M & B as labels and CFS uses numbers, we map
# M (or 1') and B (or -1') to 1 and 0, respectively.
def predClassMapping(df, set = 1):
  """Recode the 'diagnosis' label column to 1/0 for the given dataset.

  Dataset 1 maps 'M' -> 1 and 'B' -> 0; dataset 2 maps "1'" -> 1 and
  "-1'" -> 0. Any other `set` value leaves the frame untouched.

  The column is overwritten on the frame that was passed in (the caller's
  object is modified), and that same object is returned. Labels that are not
  in the mapping become NaN, so calling this twice on the same frame wipes
  the column.
  """
  labelMaps = (
    (1, {'M': 1, 'B': 0}),
    (2, {"1'": 1, "-1'": 0}),
  )
  for datasetId, mapping in labelMaps:
    if set == datasetId:
      df['diagnosis'] = df.diagnosis.map(mapping)
      break
  return df

In [29]:
##################################

# FEATURE SELECTION

##################################

In [30]:
#12/24
# CORRELATION FEATURE SELECTION
# Module-level placeholder. corrFeatureSelection does NOT update it: the old
# `R_XY = targetCorr` inside the function only bound a local name, so it has
# been removed. Compute df.corr()['diagnosis'] explicitly where it is needed.
R_XY = None
# tauRed = 0.8, k = 6
def corrFeatureSelection(df, k = 10, tauRedundancy = 0.8):
  """Greedy correlation-based feature selection against the 'diagnosis' label.

  Features are visited from the highest to the lowest absolute Pearson
  correlation with 'diagnosis'. A feature is kept unless its absolute
  correlation with an already-kept feature exceeds `tauRedundancy`.
  Selection stops as soon as `k` features have been kept.

  Args:
    df: DataFrame containing a 'diagnosis' column plus candidate features.
    k: Maximum number of features to keep. `k <= 0` keeps none.
    tauRedundancy: Absolute-correlation threshold above which a candidate is
      considered redundant with an already-selected feature.

  Returns:
    dict with keys:
      "sfCorrMatrix": correlation matrix of 'diagnosis' + selected features,
      "selectedDF": df restricted to 'diagnosis' + selected features,
      "selectedFeatures": kept feature names, in selection order,
      "rejectedFeatures": features examined and rejected as redundant
        (features never reached because `k` was hit are in neither list).
  """
  # One pairwise correlation matrix serves both the relevancy ranking and
  # every redundancy check below, instead of recomputing per feature pair.
  corrMatrix = df.corr()

  # 1) Sort features by absolute correlation with the label (descending)
  targetCorr = corrMatrix['diagnosis'].abs().sort_values(ascending=False)

  # 2) Pick features one by one from the most strongly correlated to the
  #    least, skipping any feature that is "too correlated" with one we kept.
  selectedFeatures = []
  rejectedFeatures = []

  for feature in targetCorr.index:
    # Stop once we have k features (>= also covers k <= 0, which previously
    # never matched `== k` and therefore selected every feature).
    if len(selectedFeatures) >= k:
      break

    if feature == 'diagnosis':
      continue  # Skip the label itself

    # NaN correlations (e.g. constant columns) compare False, i.e. "not redundant".
    isRedundant = any(
      abs(corrMatrix.loc[feature, alreadySelected]) > tauRedundancy
      for alreadySelected in selectedFeatures
    )

    if isRedundant:
      rejectedFeatures.append(feature)
    else:
      selectedFeatures.append(feature)

  print("Selected features:", selectedFeatures)
  print("Num features:", len(selectedFeatures))
  # This gives up to k features that are:
  # - highly correlated with the label (because we started with that sorted list),
  # - but have low correlation with each other (due to our threshold check).

  selectedDF = df[['diagnosis'] + selectedFeatures]

  return { "sfCorrMatrix": selectedDF.corr(), "selectedDF": selectedDF,"selectedFeatures": selectedFeatures, "rejectedFeatures": rejectedFeatures }

In [31]:
"""
# RELEVANCY GRAPH

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Load and process data
df = pd.read_csv("data2.csv")
df = dataClean(df, set = 2)
df = predClassMapping(df, set = 2)

# Compute correlations and perform CFS
R_XY = df.corr()['diagnosis'].abs().sort_values(ascending=False)
# Remove the 'diagnosis' entry from the correlation Series
R_XY = R_XY.drop('diagnosis')

res = corrFeatureSelection(df, k=6, tauRedundancy=0.8)

# Extract features and their correlation values (without 'diagnosis')
features = R_XY.index
correlations = R_XY.values

# Create a color list for each feature based on its status:
colors = []
for feature in features:
    if feature in res["selectedFeatures"]:
        colors.append("green")
    elif feature in res["rejectedFeatures"]:
        colors.append("red")
    else:
        # Implicitly rejected (not processed) features
        colors.append("blue")

# Plot the bar chart with the custom colors
plt.figure(figsize=(12, 6))
plt.bar(features, correlations, color=colors)
plt.title('Relevancy of each feature', pad=20)
plt.ylabel('Pearson Correlation Coefficient (0 to 1)')
plt.xticks(rotation=90)

# Create a custom legend to clarify the colors
selected_patch = mpatches.Patch(color="green", label="Selected Features")
rejected_patch = mpatches.Patch(color="red", label="Rejected Features")
implicit_patch = mpatches.Patch(color="blue", label="Implicitly Rejected Features")
plt.legend(handles=[selected_patch, rejected_patch, implicit_patch])

plt.show()
"""

'\n# RELEVANCY GRAPH\n\nimport matplotlib.pyplot as plt\nimport matplotlib.patches as mpatches\n\n# Load and process data\ndf = pd.read_csv("data2.csv")\ndf = dataClean(df, set = 2)\ndf = predClassMapping(df, set = 2)\n\n# Compute correlations and perform CFS\nR_XY = df.corr()[\'diagnosis\'].abs().sort_values(ascending=False)\n# Remove the \'diagnosis\' entry from the correlation Series\nR_XY = R_XY.drop(\'diagnosis\')\n\nres = corrFeatureSelection(df, k=6, tauRedundancy=0.8)\n\n# Extract features and their correlation values (without \'diagnosis\')\nfeatures = R_XY.index\ncorrelations = R_XY.values\n\n# Create a color list for each feature based on its status:\ncolors = []\nfor feature in features:\n    if feature in res["selectedFeatures"]:\n        colors.append("green")\n    elif feature in res["rejectedFeatures"]:\n        colors.append("red")\n    else:\n        # Implicitly rejected (not processed) features\n        colors.append("blue")\n\n# Plot the bar chart with the custom c

In [32]:
# 10/24
# SPLIT DATA SET
import sklearn as skl
from sklearn.model_selection import train_test_split


def splitData(df, testSize = 0.20, randomState = 23):
  """Separate the 'diagnosis' label from the features and optionally split train/test.

  Args:
    df: DataFrame containing a 'diagnosis' column.
    testSize: Passed to train_test_split as `test_size`. A value of 0.0 skips
      the train/test split entirely (used when running domain adaptation).
    randomState: Seed passed to train_test_split as `random_state`. Defaults
      to 23, the value that was previously hard-coded.

  Returns:
    [X, Y] when `testSize` is 0.0, otherwise [Xtrain, Xtest, Ytrain, Ytest].
  """
  # Split the label column from the features
  Y = df.loc[:, 'diagnosis']
  X = df.loc[:, df.columns != 'diagnosis']

  if testSize == 0.0:
    # No train/test split requested: hand back the full feature/label pair.
    return [X, Y]

  # Split the X and Y datasets into train/test (0.8/0.2 split by default)
  Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=testSize, random_state=randomState)
  return [Xtrain, Xtest, Ytrain, Ytest]

In [33]:
# MEAN CENTERING
def centreMean(df: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of `df` with each column shifted to have zero mean.

  Args:
    df: Feature-only DataFrame. It must not contain the 'diagnosis' column,
      since centring the class labels would corrupt them.

  Returns:
    A new DataFrame of the same shape where every value has its column mean
    subtracted. The input frame is left unchanged.

  Raises:
    ValueError: if `df` contains a 'diagnosis' column.
  """
  if "diagnosis" in df.columns:
    raise ValueError("Cannot zero-mean a dataset with the classifications.")

  # Vectorised column-wise subtraction: the Series of column means is aligned
  # on the column labels. This replaces a Python loop over every cell that
  # also wrote float results into a copy that could have integer dtype.
  return df - df.mean(axis=0)

In [34]:
##################################

# DEBIASING

##################################

In [35]:
from ipynb.fs.full.debiasing import XYsplit, XYmerge, rndSample, smoteSample, adasynSample
def debiasData(df: pd.DataFrame, technique = "smote") -> pd.DataFrame:
  X, Y = XYsplit(df)
  newX, newY = None, None
  match(technique):
    case "over":
      newX, newY = rndSample(X, Y, mode = "over")
    case "under":
      newX, newY = rndSample(X, Y, mode = "under")
    case "smote":
      newX, newY = smoteSample(X, Y)
    case "adasyn":
      newX, newY = adasynSample(X, Y)
    case _: 
      raise Exception("technique parameter is empty. must be one of: ['over', 'under', 'smote', 'adasyn']")
  
  df = XYmerge(newX, newY)
  return df


In [36]:
# 01/02/2025
# This is the data preprocessing handler. Can be used to execute any one or multiple forms of data preprocessing (cleaning, mapping, feature selection, data splitting).
# NOTE: If data set splitting and CFS are to be done in the same call, the CFS option MUST be first in the `processes` array (so that the dataset is feature selected, and THEN split)
def dataPreProcessing(
    dsFile = "../data/data.csv", Dset = 1, df = None, processes = ["clean", "predMap", "CFS", "centreMean", "splitSet", "debiasing"], 
    kFeatures = 10, tauRedundancy = 0.8, testSize = 0.20, debiasTechnique = "smote"
):
  X, Y = None, None
  returnVars = {} #{"df": None, "CFS Corr Matrix": None, "Xtrain": None, "Xtest": None, "Ytrain": None, "Ytest": None}
  if df is None:
    df = pd.read_csv(dsFile + '.csv')
  for process in processes:
    match process:
      case "clean":
        df = dataClean(df, Dset)
      case "predMap":
        df = predClassMapping(df, Dset)
      case "CFS":
        res = corrFeatureSelection(df, kFeatures, tauRedundancy)
        returnVars["CFS Corr Matrix"] = res["sfCorrMatrix"]
        df = res["selectedDF"]
      case "centreMean": # USED WITH DOMAIN ADAPTATION
        classes = df.loc[:, "diagnosis"]
        df = df.drop(["diagnosis"], axis = 1)
        df = centreMean(df)
        df["diagnosis"] = classes
      case "splitSet":
        if testSize == 0.0:
          X, Y = splitData(df, testSize)
          returnVars["X"], returnVars["Y"] = X, Y
        else:
          Xtrain, Xtest, Ytrain, Ytest = splitData(df, testSize)
          returnVars["Xtrain"] = Xtrain
          returnVars["Xtest"] = Xtest
          returnVars["Ytrain"] = Ytrain
          returnVars["Ytest"] = Ytest
      case "debiasing":

        if testSize != 0.0: # if NOT doing domain adaptation
          X, Y = Xtrain.copy(), Ytrain.copy() # we want to rename Xtrain and Ytrain so it fits with the following code (i.e. generalise it)
        # Convert them both to dataframes and combine them

        df = X.copy()
        df["diagnosis"] = Y
        df = debiasData(df, technique =  debiasTechnique)

        # Split the debiased dataframe
        dfY = df['diagnosis']
        dfX = df.drop('diagnosis', axis = 1)

        label_counts = dfY.value_counts()

        # Print the counts
        print("DEBIASING USING:", debiasTechnique)
        for label, count in label_counts.items():
            print(f"{label}: {count} rows")
        # Now add them back to `returnVars` under the correct key (depending on if we're doing DA or not).
        if testSize != 0.0:
          returnVars["Xtrain"], returnVars["Ytrain"] = dfX, dfY
        else:
          returnVars["X"], returnVars["Y"] = dfX, dfY
      case _:
        raise Exception("Processes param is empty. Options: ['clean', 'predMap', 'CFS', 'splitSet']")

  returnVars["df"] = df
  return returnVars