In [6]:
##################################

# DOMAIN ADAPTATION
# 25/03/2025 & 26/03/2025

##################################

In [7]:
import numpy as np
import pandas as pd
import scipy.linalg as scila
from ipynb.fs.full.preprocessing import dataPreProcessing
 

In [8]:
def CORAL(dfS: pd.DataFrame, dfT: pd.DataFrame, λ: float = 0.00001, sourLabels = None) -> pd.DataFrame:
  """
  Implementation of the CORAL (CORrelation ALignment) domain adaptation approach.

  The source features are whitened with the inverse square root of the
  regularised source covariance and then re-coloured with the square root of
  the regularised target covariance:

      D_SEnc = D_S @ (C_S + λI)^(-1/2) @ (C_T + λI)^(1/2)

  Parameters
  ----------
  dfS : Source-domain dataset (rows are samples, columns are features).
  dfT : Target-domain dataset. Must NOT contain a 'diagnosis' column and must
        have the same number of columns as `dfS`.
  λ : Regularisation parameter added to the diagonal of both covariance
      matrices before taking matrix powers. ADAPT documentation says
      λ = 0.00001 is a good default value.
  sourLabels : Labels of the source samples; forwarded to `convToDF`, which
               stores them in the 'diagnosis' column of the result.

  Returns
  -------
  DataFrame holding the encoded source data under the column names of `dfT`,
  plus the 'diagnosis' column added by `convToDF`.

  Raises
  ------
  ValueError (a subclass of Exception) if `dfT` is labelled, if the feature
  counts differ, or if either dataset has fewer than two samples.
  """
  if 'diagnosis' in dfT.columns:
    raise ValueError("`dfT` (target domain dataset) should be UNLABELLED.")
  if dfS.shape[1] != dfT.shape[1]:
    raise ValueError("Both datasets must have the same number of features (p). This can be done by applying CFS in the `dataPreProcessing` function with `k = p`.")
  # np.cov divides by (n - 1); with fewer than two rows it silently yields NaN.
  if dfS.shape[0] < 2 or dfT.shape[0] < 2:
    raise ValueError("Both datasets need at least two samples to estimate a covariance matrix.")

  D_S, D_T = dfS.to_numpy(), dfT.to_numpy()
  Ip = np.eye(D_S.shape[1])
  # rowvar = False: columns are the variables, rows are the observations.
  C_S = np.cov(D_S, rowvar = False) + λ*Ip
  C_T = np.cov(D_T, rowvar = False) + λ*Ip

  whiten = scila.fractional_matrix_power(C_S, -0.5)
  recolour = scila.fractional_matrix_power(C_T, 0.5)
  # The regularised covariances are symmetric positive definite, so their
  # fractional powers are real in exact arithmetic. Any imaginary part that
  # SciPy returns is round-off noise; drop it so the encoded data stays real.
  if np.iscomplexobj(whiten): whiten = np.real(whiten)
  if np.iscomplexobj(recolour): recolour = np.real(recolour)

  D_SEnc = D_S @ whiten @ recolour
  dfSEnc = convToDF(colNames = dfT.columns.to_list(), data = D_SEnc, labels = sourLabels)

  return dfSEnc

def convToDF(colNames: list, data: np.ndarray, labels: list) -> pd.DataFrame:
  """
  Wrap a 2-D array in a DataFrame and append the labels as a 'diagnosis' column.

  Parameters
  ----------
  colNames : Column names, one per column of `data`.
  data : 2-D array; row i becomes row i of the returned frame (index 0..n-1).
  labels : Labels for the rows of `data`, in row order. A pandas Series is
           assigned by POSITION (its index is ignored), so labels taken from a
           frame whose index is not 0..n-1 still line up with the rows.
           `None` yields a 'diagnosis' column filled with None.

  Returns
  -------
  DataFrame with the columns `colNames` followed by 'diagnosis'.

  Raises
  ------
  ValueError (a subclass of Exception) if the number of column names does not
  match the number of columns in `data`; pandas raises ValueError if the
  number of labels does not match the number of rows.
  """
  if len(colNames) != data.shape[1]: raise ValueError("Number of column names should be equal to number of columns in data.")
  # Assigning a Series to a column aligns on the index, which would silently
  # shift or blank (NaN) the labels whenever the Series index is not 0..n-1.
  if isinstance(labels, pd.Series):
    labels = labels.to_numpy()
  df = pd.DataFrame(data = data, columns = list(colNames))
  df["diagnosis"] = labels
  return df


In [9]:
# 26/03/2025
# Driver Code: pre-process both datasets, CORAL-encode the source set and save it.
source = pd.read_csv("../data/data.csv")
target = pd.read_csv("../data/data2.csv")

# The source pipeline additionally runs the "CFS" step with kFeatures = 6,
# presumably so that it ends up with as many features as the target
# (CORAL rejects datasets whose feature counts differ) -- TODO confirm.
varsSource = dataPreProcessing(
  Dset = 1,
  df = source,
  processes = ["clean", "predMap", "CFS", "centreMean"],
  kFeatures = 6,
  tauRedundancy = 0.8,
)
varsTar = dataPreProcessing(
  Dset = 2,
  df = target,
  processes = ["clean", "predMap", "centreMean"],
)

# From here on `source` / `target` refer to the processed frames, not the raw CSVs.
source = varsSource["df"]
target = varsTar["df"]

# Split labels from features; `targetLabels` is not used further in this cell.
targetLabels = target["diagnosis"]
sourceLabels = source["diagnosis"]
sourceNoLabel = source.drop(columns = ["diagnosis"])
targetNoLabel = target.drop(columns = ["diagnosis"])

dfSEnc = CORAL(dfS = sourceNoLabel, dfT = targetNoLabel, sourLabels = sourceLabels)
dfSEnc.to_csv("../data/data1Encoded.csv", index=False)


grjtgj
Number of NANs in data set 1 : id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
f

  "\n",


grjtgj
Number of NANs in data set 2 : Area                 0
Grey Level           0
Gradient Strength    0
Noise Fluctuation    0
Contrast             0
Shape Descriptor     0
diagnosis            0
dtype: int64
Preparing to interpolate NANs...
Number of NANs now:  Area                 0
Grey Level           0
Gradient Strength    0
Noise Fluctuation    0
Contrast             0
Shape Descriptor     0
diagnosis            0
dtype: int64
grjtgj
grjtgj


  "\n",
