In [1]:
import pandas as pd
import json
import os
import matplotlib.pyplot as plt

### Definition of input folder

In [2]:
# Root folder of the study and the sub-folder that holds the statistics files
folder_path = "../study"
stats_path = f"{folder_path}/stats"

### Concatenation of all patients' metrics

In [149]:
## Read the list of subjects and load the metrics table of each subject
def _read_subject_metrics(study_folder, subject_codes):
    """Return the list of per-subject metrics data frames.

    For every code in ``subject_codes`` the file
    ``<study_folder>/subjects/<code>/dMRI/microstructure/<code>_metrics.csv``
    is read with ``pd.read_csv``. Subjects whose file does not exist are
    reported on stdout and skipped, so the returned list may be shorter than
    ``subject_codes`` (and is empty when no file is found).
    """
    frames = []
    for code in subject_codes:
        metric_file = "%s/subjects/%s/dMRI/microstructure/%s_metrics.csv" % (study_folder, code, code)
        if not os.path.exists(metric_file):
            print(metric_file, "doesn't exists")
            continue
        frames.append(pd.read_csv(metric_file))
    return frames


dest_success = folder_path + "/subjects/subj_list.json"
with open(dest_success, 'r') as file:
    patient_list = json.load(file)
del file

dfs = _read_subject_metrics(folder_path, patient_list)
if not dfs:
    # pd.concat would fail with a generic message; say what is actually missing
    raise ValueError("No metrics file found for the subjects listed in %s" % dest_success)

df = pd.concat(dfs, ignore_index=True)
del dfs

# Check that there is the expected number of columns:
#   23 metrics x 4 measures per metric x (5 tracts + 4 ROIs) x 2 sides (right and left)
#   + the number of streamlines found for each tract (5 tracts x 2 sides)
#   + the ID column
assert df.columns.size == 23 * 4 * (5 + 4) * 2 + (5 * 2) + 1, \
    "unexpected number of columns: %d" % df.columns.size

info_df = pd.read_csv(stats_path + "/info.csv")
non_dMRIfeatures = info_df.columns.drop(["ID"])

# The merge below is an inner join on "ID": report the subjects it would drop
# instead of losing them silently.
unmatched_ids = df.loc[~df["ID"].isin(info_df["ID"]), "ID"].tolist()
if unmatched_ids:
    print("Subjects without an entry in info.csv (dropped by the merge):", unmatched_ids)
del unmatched_ids

df = pd.merge(info_df, df, on="ID")
del info_df

# stats_path already holds info.csv at this point, so this is only a safeguard
os.makedirs(stats_path, exist_ok=True)

df.to_csv("%s/dataset.csv" % stats_path, index=False)


# Analysis

In [162]:
# Columns of the dataset that are not dMRI metrics
non_dMRIfeatures = [
    # response labels (binary and three-class encodings)
    'NR-RP/R',
    'NR-RP-R',
    # subject information columns
    'age',
    'therapy_duration',
    'sex',
    'AEDs',
    'benzo',
    'epilepsy_onset_age',
    'epilepsy_type',
    'epilepsy_duration',
]

In [190]:
# Load the full dataset from the stats folder, indexed by subject ID
df = pd.read_csv("%s/dataset.csv" % stats_path, index_col="ID")

# Names of the columns ending in "_nTracts" (number of tracts found)
nTract_col_name = df.columns.str.extractall(r'(^.*_nTracts$)')[0]

# Split the dataset into three views:
#   - df_nTracts: only the number-of-tracts columns
#   - df_info:    only the non-dMRI information columns
#   - df_dMRI:    everything else, i.e. the dMRI metrics
df_nTracts = df[nTract_col_name]
df_info = df[non_dMRIfeatures]
df_dMRI = df.drop(columns=nTract_col_name.to_list() + list(non_dMRIfeatures))

### Check for NaN values

In [193]:
# Locate missing values once, over the whole dataset
nan_mask = df.isnull()
features_with_nan = nan_mask.any(axis=0)  # only the features with NaN values
subjects_with_nan = nan_mask.any(axis=1)  # only the subjects with NaN values

# dMRI metrics restricted to the subjects/features that contain at least one NaN.
# Kept in a variable because only the last expression of a cell is displayed.
df_dMRI_nan = df_dMRI.loc[subjects_with_nan, features_with_nan]

# dMRI metrics restricted to the subjects/features that contain at least one
# value exactly equal to 0 (displayed as the cell output)
zero_mask = df_dMRI == 0
df_dMRI.loc[zero_mask.any(axis=1), zero_mask.any(axis=0)]



Unnamed: 0_level_0,left-fornix_FA_std,left-fornix_FA_skew,left-fornix_FA_kurt,right-fornix_FA_std,right-fornix_FA_skew,right-fornix_FA_kurt,right-thalamus-AntCingCtx_FA_std,right-thalamus-AntCingCtx_FA_skew,right-thalamus-AntCingCtx_FA_kurt,left-fornix_AD_std,...,right-thalamus-AntCingCtx_frac_f1_kurt,left-fornix_frac_csf_mf_std,left-fornix_frac_csf_mf_skew,left-fornix_frac_csf_mf_kurt,right-fornix_frac_csf_mf_std,right-fornix_frac_csf_mf_skew,right-fornix_frac_csf_mf_kurt,right-thalamus-AntCingCtx_frac_csf_mf_std,right-thalamus-AntCingCtx_frac_csf_mf_skew,right-thalamus-AntCingCtx_frac_csf_mf_kurt
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VNSLC_03,0.0,0.0,0.0,0.0,0.0,0.0,0.186111,-0.000744,0.002571,0.0,...,0.003037,0.0,0.0,0.0,0.0,0.0,0.0,0.130877,0.005469,0.00309
VNSLC_17,0.225142,0.002757,0.004963,0.0,0.0,0.0,0.0,0.0,0.0,0.000247,...,0.0,0.272125,0.002817,0.012154,0.0,0.0,0.0,0.0,0.0,0.0


### Analysis of the number of streamlines per tract

We used 2M seeds to generate the streamlines of each ROI tract. As a result, for some patients the 2M seeds in the fornix tract produced about 4k streamlines, while for others they produced 0 streamlines.

This value depends on the physical structure of the tract in the patient. A small value suggests a damaged or missing structure, because few or no streamlines composing the ROI were found.

In [54]:
# Collect the names of the "_nTracts" columns, then show the table of those columns
nTract_col_name = df.columns.str.extractall(r'(^.*_nTracts$)')[0]
df_nTracts = df.loc[:, nTract_col_name]
df_nTracts

Unnamed: 0_level_0,left-inf-longi-fasci_nTracts,left-fornix_nTracts,right-inf-longi-fasci_nTracts,left-sup-longi-fasci_nTracts,right-fornix_nTracts,left-thalamus-Insula_nTracts,right-thalamus-AntCingCtx_nTracts,right-sup-longi-fasci_nTracts,right-thalamus-Insula_nTracts,left-thalamus-AntCingCtx_nTracts
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
VNSLC_01,42903,32,66834,28058,28,16187,194,19376,36875,235
VNSLC_02,63466,1720,96044,32742,2145,21647,523,29357,6458,2346
VNSLC_03,40162,0,65551,22072,0,33874,253,22044,13176,1
VNSLC_04,96262,2,89939,17173,4601,23391,105,31010,29012,1
VNSLC_06,54671,37,67813,16033,34,30317,145,14812,29050,1366
VNSLC_07,31257,1915,59925,18419,43,6541,48,25904,13774,2249
VNSLC_09,119642,291,57338,22827,100,14495,1737,22654,8915,230
VNSLC_10,45948,1044,48804,21129,657,80509,60,21113,14074,24
VNSLC_11,61950,2914,59449,22187,3964,43390,2265,20561,21880,408
VNSLC_12,25198,68,30518,22624,21,81894,1429,4020,139816,6733


In [70]:
# For every subject, count how many ROI tracts have zero streamlines
missingTracts4subject = df_nTracts.eq(0).sum(axis=1)
# Show only the subjects with at least one missing tract
df_nTracts.loc[missingTracts4subject.ne(0)]

Unnamed: 0_level_0,left-inf-longi-fasci_nTracts,left-fornix_nTracts,right-inf-longi-fasci_nTracts,left-sup-longi-fasci_nTracts,right-fornix_nTracts,left-thalamus-Insula_nTracts,right-thalamus-AntCingCtx_nTracts,right-sup-longi-fasci_nTracts,right-thalamus-Insula_nTracts,left-thalamus-AntCingCtx_nTracts
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
VNSLC_03,40162,0,65551,22072,0,33874,253,22044,13176,1
VNSLC_17,81081,79,95533,18835,0,47258,0,25588,41427,103


### Normalization

Unnamed: 0_level_0,age,therapy_duration,sex,AEDs,benzo,epilepsy_onset_age,epilepsy_type,epilepsy_duration,left-inf-longi-fasci_nTracts,left-inf-longi-fasci_FA_mean,...,right-hippocampus_frac_csf_mf_skew,right-hippocampus_frac_csf_mf_kurt,right-amygdala_frac_csf_mf_mean,right-amygdala_frac_csf_mf_std,right-amygdala_frac_csf_mf_skew,right-amygdala_frac_csf_mf_kurt,right-accumbens-area_frac_csf_mf_mean,right-accumbens-area_frac_csf_mf_std,right-accumbens-area_frac_csf_mf_skew,right-accumbens-area_frac_csf_mf_kurt
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VNSLC_01,36,93,1,3,0,5,1,31,42903,0.541066,...,0.023179,0.018156,0.137069,0.138784,0.0069,0.00393,0.103991,0.118798,0.003639,0.001688
VNSLC_02,29,13,2,2,0,14,1,15,63466,0.590096,...,0.060498,0.049116,0.218807,0.29568,0.048082,0.039077,0.087669,0.115187,0.011297,0.010479
VNSLC_03,65,234,1,3,0,15,1,50,40162,0.521688,...,0.038515,0.031084,0.267605,0.248778,0.024306,0.019431,0.123216,0.046279,-5.9e-05,2.5e-05
VNSLC_04,34,11,2,2,0,8,2,26,96262,0.490483,...,0.04991,0.040012,0.137749,0.19797,0.024946,0.021099,0.076892,0.136489,0.014051,0.012932
VNSLC_06,62,136,2,2,0,36,2,26,54671,0.54952,...,0.043707,0.035947,0.155287,0.243089,0.04069,0.034923,0.049738,0.068137,0.000819,0.000304
VNSLC_07,46,216,2,2,0,5,2,41,31257,0.56278,...,0.044037,0.035542,0.163955,0.229123,0.033496,0.028361,0.109023,0.159611,0.01904,0.017348
VNSLC_09,47,13,2,3,0,2,2,45,119642,0.579634,...,0.049708,0.040811,0.1884,0.278626,0.046461,0.038423,0.039018,0.051299,0.000775,0.000436
VNSLC_10,21,30,1,4,1,6,2,15,45948,0.591488,...,0.039784,0.032312,0.148032,0.207837,0.027697,0.023624,0.113611,0.186089,0.025344,0.022526
VNSLC_11,46,6,2,3,0,0,2,46,61950,0.534228,...,0.011445,0.023089,0.575384,0.363802,-0.000202,0.025311,0.083526,0.144889,0.014706,0.013351
VNSLC_12,31,134,1,3,0,18,2,13,25198,0.533906,...,0.031408,0.024625,0.152687,0.211416,0.022127,0.017,0.088558,0.204791,0.035422,0.032747


# Division between features and label

In this first analysis we consider a binary classification problem. Therefore, we keep the NR-RP/R labels, in which the Partial responders are considered as Responders.

*In a later analysis we will consider a multiclass classification problem, trying to classify all three response types.*

In [11]:
# Target: the binary response label (partial responders grouped with responders)
y = df.loc[:, "NR-RP/R"]
# Features: every remaining column, with both response encodings removed
X = df.drop(columns=["NR-RP/R", "NR-RP-R"])

Here is the explanation of some features:

NR-RP/R:

- NR-RP/R == 0 => Non Responder
- NR-RP/R == 1 => Partial responder or Responder

NR-RP-R:

- NR-RP-R == 0 => Non Responder
- NR-RP-R == 1 => Partial Responder
- NR-RP-R == 2 => Responder

epilepsy_type:

- epilepsy_type == 1 => Generalized
- epilepsy_type == 2 => Focal

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of every feature column
X.describe()

Unnamed: 0,age,therapy_duration,sex,AEDs,benzo,epilepsy_onset_age,epilepsy_type,epilepsy_duration,left-inf-longi-fasci_nTracts,left-inf-longi-fasci_FA_mean,...,right-hippocampus_frac_csf_mf_skew,right-hippocampus_frac_csf_mf_kurt,right-amygdala_frac_csf_mf_mean,right-amygdala_frac_csf_mf_std,right-amygdala_frac_csf_mf_skew,right-amygdala_frac_csf_mf_kurt,right-accumbens-area_frac_csf_mf_mean,right-accumbens-area_frac_csf_mf_std,right-accumbens-area_frac_csf_mf_skew,right-accumbens-area_frac_csf_mf_kurt
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,...,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,38.666667,90.611111,1.555556,2.833333,0.111111,12.722222,1.833333,28.055556,59257.055556,0.543739,...,0.036395,0.030623,0.206393,0.241785,0.028967,0.025422,0.113486,0.152597,0.018556,0.016452
std,12.078177,74.54816,0.51131,0.785905,0.323381,9.086548,0.383482,12.06586,23980.872553,0.039047,...,0.014571,0.011314,0.114068,0.078855,0.016642,0.013184,0.062781,0.087362,0.018028,0.016063
min,21.0,6.0,1.0,2.0,0.0,0.0,1.0,9.0,25198.0,0.424548,...,0.002495,0.001317,0.114145,0.083668,-0.000202,0.000411,0.039018,0.046279,-5.9e-05,2.5e-05
25%,30.25,13.5,1.0,2.0,0.0,6.0,2.0,18.75,43664.25,0.533987,...,0.027815,0.02319,0.146565,0.208731,0.022672,0.019668,0.078893,0.096344,0.003654,0.002096
50%,35.0,91.5,2.0,3.0,0.0,12.0,2.0,26.0,53947.5,0.54521,...,0.040102,0.032842,0.163038,0.236106,0.029069,0.025069,0.101168,0.145072,0.014379,0.013142
75%,46.0,133.5,2.0,3.0,0.0,18.0,2.0,36.0,71056.75,0.563859,...,0.044145,0.03925,0.213759,0.275783,0.042991,0.035843,0.132212,0.189972,0.027846,0.024605
max,65.0,234.0,2.0,4.0,1.0,36.0,2.0,50.0,119642.0,0.591488,...,0.060498,0.049116,0.575384,0.404089,0.054867,0.045849,0.326714,0.42394,0.068698,0.060285


In [12]:
# Class sizes for the binary label, then for the three-class label
print(df["NR-RP/R"].value_counts(), df["NR-RP-R"].value_counts(), sep="\n")

NR-RP/R
1    12
0     6
Name: count, dtype: int64
NR-RP-R
2    8
0    6
1    4
Name: count, dtype: int64


In our case we have few examples (18 patients), so the model cannot be expected to be very accurate. Moreover, the two binary classes are imbalanced (12 responders vs. 6 non-responders), so the model will tend to be biased toward the majority class.

In contrast, when responders, partial responders and non-responders are kept as separate classes, the number of samples per class is closer to balanced (8, 4 and 6 respectively).

In [14]:
# Mean of every feature within each class of the binary label y (NR-RP/R)
X.groupby(y).mean()

Unnamed: 0_level_0,age,therapy_duration,sex,AEDs,benzo,epilepsy_onset_age,epilepsy_type,epilepsy_duration,left-inf-longi-fasci_nTracts,left-inf-longi-fasci_FA_mean,...,right-hippocampus_frac_csf_mf_skew,right-hippocampus_frac_csf_mf_kurt,right-amygdala_frac_csf_mf_mean,right-amygdala_frac_csf_mf_std,right-amygdala_frac_csf_mf_skew,right-amygdala_frac_csf_mf_kurt,right-accumbens-area_frac_csf_mf_mean,right-accumbens-area_frac_csf_mf_std,right-accumbens-area_frac_csf_mf_skew,right-accumbens-area_frac_csf_mf_kurt
NR-RP/R,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,37.166667,120.5,1.5,3.166667,0.166667,16.5,2.0,27.0,60182.0,0.554742,...,0.035396,0.030051,0.208328,0.267605,0.035387,0.029991,0.129025,0.169892,0.021464,0.018953
1,39.416667,75.666667,1.583333,2.666667,0.083333,10.833333,1.75,28.583333,58794.583333,0.538238,...,0.036895,0.030908,0.205425,0.228874,0.025758,0.023137,0.105716,0.14395,0.017103,0.015201
