# **Preprocessing & Merging: Proteomics and MRI Data**

This notebook preprocesses **proteomics and MRI data**, ensuring proper alignment, cleaning, and integration of both datasets for downstream analysis.


In [86]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore

In [2]:
# load proteomics data
# NOTE(review): absolute, user-specific Windows path — this cell only runs on the
# author's machine; consider a configurable data directory.
proteomics_file = r"C:\Users\Romina\OneDrive\Desktop\Project\big_dataset_cols_needed.csv"
# NOTE(review): the saved output echoes the line below, which looks like the tail of a
# pandas warning (possibly a mixed-dtype DtypeWarning) — confirm and set dtypes if so.
proteins = pd.read_csv(proteomics_file)

  proteins = pd.read_csv(proteomics_file)


In [3]:
# Preview the raw proteomics table (pandas truncates the rendering of wide/long frames).
proteins

Unnamed: 0,EID,Sex,Ethnic_background,Age_at_recruitment,Diagnoses_main_ICD10,Diagnoses_main_ICD10_1,Diagnoses_main_ICD10_2,Diagnoses_main_ICD10_3,Diagnoses_main_ICD10_4,Diagnoses_main_ICD10_5,...,ZNF75D,ZNF830,ZNRD2,ZNRF4,ZP3,ZP4,ZPR1,PlateID,plate,Instance
0,1000024,0,1001.0,67,F019,G309,I48,I620,I639,M169,...,0.2032,0.14900,-1.06535,-0.5158,2.3663,-0.2875,0.9014,890000000000,135,0
1,1000043,1,1001.0,65,,,,,,,...,-0.5470,-0.11440,1.42705,0.8249,-5.1230,-0.1256,0.1586,890000000000,76,0
2,1000156,0,1001.0,62,E871,H258,H269,R074,,,...,-0.3359,0.63980,-0.58285,0.3991,0.8371,0.3501,2.3894,890000000000,243,0
3,1000217,1,1003.0,63,C060,I269,R509,R69,,,...,,,,,2.9152,-0.2739,,890000000000,155,0
4,1000309,1,4002.0,60,,,,,,,...,,,,,,,,890000000000,590,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52695,6023140,0,1001.0,55,H264,H269,H521,I259,I319,M201,...,-0.0990,-0.20920,0.00375,-0.7296,-0.5126,-0.0534,-0.2568,890000000000,491,0
52696,6023206,1,2004.0,64,C447,D509,E831,G562,I839,K219,...,,0.10650,1.22565,0.5148,-4.6300,-0.0323,-0.2688,890000000000,237,0
52697,6023457,1,1001.0,48,D125,I841,I848,K621,K640,Q433,...,-0.6969,-0.04040,0.02195,0.3686,0.6616,0.2474,0.0603,890000000000,316,0
52698,6023548,1,1001.0,62,C155,C159,C160,C675,C679,K918,...,0.3210,0.89850,0.63595,-0.2934,-5.0217,0.1652,-0.7583,890000000000,481,0


In [7]:
# Plate/instance bookkeeping columns are not needed downstream; names that are
# already absent are tolerated so the cell can be re-run safely.
unwanted_cols = ['PlateID', 'plate', 'Instance']
proteins = proteins.drop(columns=unwanted_cols, errors='ignore')

In [None]:
# List every remaining column name.
# NOTE(review): the saved output begins with a heading this cell does not print, so it
# appears to come from an earlier version of the cell — re-run to refresh it.
print(proteins.columns.tolist())

Columns after dropping unwanted ones:
['EID', 'Sex', 'Ethnic_background', 'Age_at_recruitment', 'Diagnoses_main_ICD10', 'Diagnoses_main_ICD10_1', 'Diagnoses_main_ICD10_2', 'Diagnoses_main_ICD10_3', 'Diagnoses_main_ICD10_4', 'Diagnoses_main_ICD10_5', 'Diagnoses_main_ICD10_6', 'Diagnoses_main_ICD10_7', 'Diagnoses_main_ICD10_8', 'Diagnoses_main_ICD10_9', 'Diagnoses_main_ICD10_10', 'Diagnoses_main_ICD10_11', 'Diagnoses_main_ICD10_12', 'Diagnoses_main_ICD10_13', 'Diagnoses_main_ICD10_14', 'Diagnoses_main_ICD10_15', 'Diagnoses_main_ICD10_16', 'Diagnoses_main_ICD10_17', 'Diagnoses_main_ICD10_18', 'Diagnoses_main_ICD10_19', 'Diagnoses_main_ICD10_20', 'Diagnoses_main_ICD10_21', 'Diagnoses_main_ICD10_22', 'Diagnoses_main_ICD10_23', 'Diagnoses_main_ICD10_24', 'Diagnoses_main_ICD10_25', 'Diagnoses_main_ICD10_26', 'Diagnoses_main_ICD10_27', 'Diagnoses_main_ICD10_28', 'Diagnoses_main_ICD10_29', 'Diagnoses_main_ICD10_30', 'Diagnoses_main_ICD10_31', 'Diagnoses_main_ICD10_32', 'Diagnoses_main_ICD10_33'

In [9]:
# The code treats every column from 'A1BG' onward as protein data and every
# column before it as participant metadata.
start_prot_idx = list(proteins.columns).index('A1BG')
metadata_cols = proteins.columns[:start_prot_idx]
protein_columns_all = proteins.columns[start_prot_idx:]

# Leave out the companion '_LOD' columns (presumably limit-of-detection values —
# TODO confirm) so only the measurement columns remain.
prot_cols = [name for name in protein_columns_all if not ('_LOD' in name)]
metadata_df = proteins.loc[:, metadata_cols].copy()

In [10]:
# Sanity-check the split: metadata column names, then the first ten protein
# column names followed by the total number of protein columns.
print("Metadata columns:", metadata_df.columns.tolist())
print("Protein columns (no '_LOD'):", prot_cols[:10], "...", len(prot_cols), "total")

Metadata columns: ['EID', 'Sex', 'Ethnic_background', 'Age_at_recruitment', 'Diagnoses_main_ICD10', 'Diagnoses_main_ICD10_1', 'Diagnoses_main_ICD10_2', 'Diagnoses_main_ICD10_3', 'Diagnoses_main_ICD10_4', 'Diagnoses_main_ICD10_5', 'Diagnoses_main_ICD10_6', 'Diagnoses_main_ICD10_7', 'Diagnoses_main_ICD10_8', 'Diagnoses_main_ICD10_9', 'Diagnoses_main_ICD10_10', 'Diagnoses_main_ICD10_11', 'Diagnoses_main_ICD10_12', 'Diagnoses_main_ICD10_13', 'Diagnoses_main_ICD10_14', 'Diagnoses_main_ICD10_15', 'Diagnoses_main_ICD10_16', 'Diagnoses_main_ICD10_17', 'Diagnoses_main_ICD10_18', 'Diagnoses_main_ICD10_19', 'Diagnoses_main_ICD10_20', 'Diagnoses_main_ICD10_21', 'Diagnoses_main_ICD10_22', 'Diagnoses_main_ICD10_23', 'Diagnoses_main_ICD10_24', 'Diagnoses_main_ICD10_25', 'Diagnoses_main_ICD10_26', 'Diagnoses_main_ICD10_27', 'Diagnoses_main_ICD10_28', 'Diagnoses_main_ICD10_29', 'Diagnoses_main_ICD10_30', 'Diagnoses_main_ICD10_31', 'Diagnoses_main_ICD10_32', 'Diagnoses_main_ICD10_33', 'Diagnoses_main_IC

In [11]:
# Threshold outlier removal
# Any protein measurement whose magnitude exceeds the threshold is blanked to NaN;
# the participant row itself is kept.
threshold_value = 5
for prot in prot_cols:
    measurements = proteins[prot]
    # Series.mask replaces the flagged entries with NaN and leaves the rest untouched.
    proteins[prot] = measurements.mask(measurements.abs() > threshold_value)

In [13]:
# Summary statistics for the first ten protein columns after thresholding
# (transposed so that each protein is one row).
print(proteins[prot_cols].describe().T.head(10))

            count      mean       std      min       25%       50%       75%  \
A1BG      44542.0 -0.004431  0.190406 -1.24600 -0.116000  0.000000  0.116575   
AAMDC     43296.0  0.021994  0.548569 -1.97485 -0.341050  0.003350  0.360050   
AARSD1    51291.0  0.019698  0.645765 -4.52005 -0.396450  0.000000  0.406575   
ABCA2     43014.0  0.012461  0.373312 -1.59610 -0.216138  0.001725  0.216400   
ABHD14B   50965.0  0.017392  0.645501 -4.71210 -0.398000  0.000000  0.403250   
ABL1      51285.0  0.072772  0.772170 -2.67535 -0.473700 -0.007500  0.525900   
ABO       44510.0 -0.166062  1.495067 -4.83500 -1.557600  0.000000  0.749700   
ABRAXAS2  43014.0  0.041498  0.683817 -2.41280 -0.429900 -0.000300  0.466175   
ACAA1     50301.0  0.066382  0.926460 -4.26980 -0.548450 -0.009000  0.585000   
ACADM     43553.0  0.120402  0.654902 -2.50780 -0.249100 -0.001300  0.299600   

              max  
A1BG      1.47870  
AAMDC     3.93705  
AARSD1    4.04045  
ABCA2     4.58580  
ABHD14B   4.20725  

In [15]:
# Independent copy of just the protein measurement columns, so later steps do
# not modify `proteins`.
prot_df = proteins.loc[:, prot_cols].copy()
prot_df

Unnamed: 0,A1BG,AAMDC,AARSD1,ABCA2,ABHD14B,ABL1,ABO,ABRAXAS2,ACAA1,ACADM,...,ZFYVE19,ZHX2,ZNF174,ZNF75D,ZNF830,ZNRD2,ZNRF4,ZP3,ZP4,ZPR1
0,0.0759,0.04665,-1.13740,-0.20055,-0.42250,-0.80600,-0.4521,-1.0093,-0.26450,-0.4324,...,-1.0956,-0.1025,-0.0143,0.2032,0.14900,-1.06535,-0.5158,2.3663,-0.2875,0.9014
1,-0.1648,0.91945,1.21405,0.15040,1.52505,0.71830,-2.8851,0.7212,0.66275,-0.1997,...,1.4548,-0.0661,0.3957,-0.5470,-0.11440,1.42705,0.8249,,-0.1256,0.1586
2,0.1087,0.31715,0.26120,-0.31015,-0.79480,-0.18310,0.4929,0.0694,-0.50230,0.3186,...,-0.1806,0.0838,-0.0295,-0.3359,0.63980,-0.58285,0.3991,0.8371,0.3501,2.3894
3,-0.1850,0.14295,-0.36740,0.50395,0.06640,0.57910,-0.3682,0.6413,0.37140,,...,,0.1480,0.2104,,,,,2.9152,-0.2739,
4,,,0.34685,,-0.36535,0.06215,,,-0.30340,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52695,0.1993,-0.39425,-0.54145,-0.10320,-0.25920,-0.35800,-1.0371,-0.5437,0.25230,-0.0479,...,-0.5600,0.3048,-0.2222,-0.0990,-0.20920,0.00375,-0.7296,-0.5126,-0.0534,-0.2568
52696,0.0341,0.86505,0.89795,-0.27340,0.07450,0.32140,-2.1897,0.8779,-0.06540,,...,0.8236,0.3024,0.4840,,0.10650,1.22565,0.5148,-4.6300,-0.0323,-0.2688
52697,-0.0031,-0.04665,0.49065,-0.06470,0.58825,-0.05300,0.1868,0.2847,-0.20530,0.0903,...,0.3370,-0.0885,,-0.6969,-0.04040,0.02195,0.3686,0.6616,0.2474,0.0603
52698,0.0531,-0.12765,0.29745,-0.01050,-0.06790,-0.63820,-1.6101,-0.9090,-0.23760,-0.4812,...,-1.0266,0.0499,0.0342,0.3210,0.89850,0.63595,-0.2934,,0.1652,-0.7583


In [16]:
# impute missing values
# Column-wise mean imputation of the protein matrix; the NaNs include the
# out-of-range measurements blanked by the threshold step.
imputer = SimpleImputer(strategy="mean")
imputed_values = imputer.fit_transform(prot_df)
# Carry over prot_df's row index: the scaled frame inherits this index and is later
# concatenated with metadata_df along axis=1, which aligns rows by index. Rebuilding
# the frame with a fresh 0..n-1 index would silently misalign rows whenever the
# source frame's index is not the default one.
prot_imputed_df = pd.DataFrame(
    imputed_values,
    columns=prot_df.columns,
    index=prot_df.index,
)
print("Imputed shape:", prot_imputed_df.shape)
print(prot_imputed_df.isnull().sum().sum(), "missing values remain")

Imputed shape: (52700, 2923)
0 missing values remain


In [17]:
# Scale the protein features
# Standardise every protein column after imputation. Entries that were filled
# with the column mean end up at (numerically) zero, as the displayed output shows.
scaler = StandardScaler()
scaler.fit(prot_imputed_df)
scaled_array = scaler.transform(prot_imputed_df)
scaled_proteins_df = pd.DataFrame(
    scaled_array,
    index=prot_imputed_df.index,
    columns=prot_imputed_df.columns,
)

scaled_proteins_df

Unnamed: 0,A1BG,AAMDC,AARSD1,ABCA2,ABHD14B,ABL1,ABO,ABRAXAS2,ACAA1,ACADM,...,ZFYVE19,ZHX2,ZNF174,ZNF75D,ZNF830,ZNRD2,ZNRF4,ZP3,ZP4,ZPR1
0,4.589104e-01,0.049589,-1.816289,-6.315904e-01,-0.692984,-1.153658,-2.081826e-01,-1.700924e+00,-0.365568,-9.285266e-01,...,-1.292154e+00,-0.450031,-1.884634e-01,0.301836,-0.136744,-1.723948e+00,-1.069217,1.512075,-1.092510e+00,9.121869e-01
1,-9.161484e-01,1.804964,1.874766,4.089987e-01,2.375087,0.847455,-1.978955e+00,1.100232e+00,0.658884,-5.376665e-01,...,1.689146e+00,-0.337896,1.010863e+00,-1.086189,-0.489580,2.008019e+00,1.567717,0.000000,-5.385477e-01,-3.585828e-02
2,6.462886e-01,0.593618,0.379084,-9.565613e-01,-1.279487,-0.335910,4.796019e-01,4.516504e-02,-0.628296,3.329085e-01,...,-2.225614e-01,0.123892,-2.329263e-01,-0.695610,0.520704,-1.001482e+00,0.730239,0.646381,1.089124e+00,2.811340e+00
3,-1.031546e+00,0.243267,-0.607625,1.457297e+00,0.077204,0.664712,-1.471190e-01,9.708984e-01,0.336992,2.331016e-17,...,6.083435e-18,0.321668,4.688260e-01,0.000000,0.000000,2.077975e-17,0.000000,1.822812,-1.045976e+00,3.542480e-17
4,9.910041e-18,0.000000,0.513528,5.143565e-18,-0.602953,-0.013944,2.020091e-17,-2.246395e-17,-0.408546,2.331016e-17,...,6.083435e-18,0.000000,-2.029756e-17,0.000000,0.000000,2.077975e-17,0.000000,0.000000,2.374236e-17,3.542480e-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52695,1.163864e+00,-0.837149,-0.880830,-3.429415e-01,-0.435730,-0.565520,-6.339540e-01,-9.472587e-01,0.205407,-2.826920e-01,...,-6.660624e-01,0.804712,-7.966097e-01,-0.257296,-0.616569,-1.231435e-01,-1.489726,-0.117697,-2.915057e-01,-5.660386e-01
52696,2.201175e-01,1.695554,1.378586,-8.475953e-01,0.089964,0.326401,-1.472833e+00,1.353882e+00,-0.145597,2.331016e-17,...,9.513021e-01,0.797318,1.269157e+00,0.000000,-0.193675,1.706455e+00,0.957802,-2.448594,-2.193092e-01,-5.813544e-01
52697,7.603195e-03,-0.138056,0.739250,-2.287865e-01,0.899300,-0.165114,2.568179e-01,3.936707e-01,-0.300162,-5.056104e-02,...,3.824892e-01,-0.406902,-2.029756e-17,-1.363535,-0.390454,-9.589190e-02,0.670251,0.547028,7.377222e-01,-1.613198e-01
52698,3.286597e-01,-0.300964,0.435985,-6.808006e-02,-0.134365,-0.933369,-1.050992e+00,-1.538569e+00,-0.335848,-1.010495e+00,...,-1.211496e+00,0.019458,-4.659188e-02,0.519791,0.867244,8.234741e-01,-0.631793,0.000000,4.564638e-01,-1.206109e+00


In [23]:
# Use the lower-case id column name that the MRI table uses, so both tables can
# be matched on 'eid'. A missing 'EID' column is tolerated.
metadata_df = metadata_df.rename(columns={"EID": "eid"}, errors='ignore')

In [24]:
# Put metadata and scaled protein columns side by side. concat with axis=1 aligns
# rows on the index, so both frames must share the same row index.
final_proteomics = pd.concat([metadata_df, scaled_proteins_df], axis=1)
final_proteomics

Unnamed: 0,eid,Sex,Ethnic_background,Age_at_recruitment,Diagnoses_main_ICD10,Diagnoses_main_ICD10_1,Diagnoses_main_ICD10_2,Diagnoses_main_ICD10_3,Diagnoses_main_ICD10_4,Diagnoses_main_ICD10_5,...,ZFYVE19,ZHX2,ZNF174,ZNF75D,ZNF830,ZNRD2,ZNRF4,ZP3,ZP4,ZPR1
0,1000024,0,1001.0,67,F019,G309,I48,I620,I639,M169,...,-1.292154e+00,-0.450031,-1.884634e-01,0.301836,-0.136744,-1.723948e+00,-1.069217,1.512075,-1.092510e+00,9.121869e-01
1,1000043,1,1001.0,65,,,,,,,...,1.689146e+00,-0.337896,1.010863e+00,-1.086189,-0.489580,2.008019e+00,1.567717,0.000000,-5.385477e-01,-3.585828e-02
2,1000156,0,1001.0,62,E871,H258,H269,R074,,,...,-2.225614e-01,0.123892,-2.329263e-01,-0.695610,0.520704,-1.001482e+00,0.730239,0.646381,1.089124e+00,2.811340e+00
3,1000217,1,1003.0,63,C060,I269,R509,R69,,,...,6.083435e-18,0.321668,4.688260e-01,0.000000,0.000000,2.077975e-17,0.000000,1.822812,-1.045976e+00,3.542480e-17
4,1000309,1,4002.0,60,,,,,,,...,6.083435e-18,0.000000,-2.029756e-17,0.000000,0.000000,2.077975e-17,0.000000,0.000000,2.374236e-17,3.542480e-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52695,6023140,0,1001.0,55,H264,H269,H521,I259,I319,M201,...,-6.660624e-01,0.804712,-7.966097e-01,-0.257296,-0.616569,-1.231435e-01,-1.489726,-0.117697,-2.915057e-01,-5.660386e-01
52696,6023206,1,2004.0,64,C447,D509,E831,G562,I839,K219,...,9.513021e-01,0.797318,1.269157e+00,0.000000,-0.193675,1.706455e+00,0.957802,-2.448594,-2.193092e-01,-5.813544e-01
52697,6023457,1,1001.0,48,D125,I841,I848,K621,K640,Q433,...,3.824892e-01,-0.406902,-2.029756e-17,-1.363535,-0.390454,-9.589190e-02,0.670251,0.547028,7.377222e-01,-1.613198e-01
52698,6023548,1,1001.0,62,C155,C159,C160,C675,C679,K918,...,-1.211496e+00,0.019458,-4.659188e-02,0.519791,0.867244,8.234741e-01,-0.631793,0.000000,4.564638e-01,-1.206109e+00


In [25]:
# Write the combined table to the current working directory, without the row index.
# NOTE(review): a later cell reads proteomics_preprocessed.csv from an absolute folder —
# confirm this relative path resolves to that same file.
final_proteomics.to_csv("proteomics_preprocessed.csv", index=False)
print("File saved.")

File saved.


**Preprocess MRI Data for Shared Participants**
- **Filtered the MRI dataset** to the **4,698** participants (out of the 41,305 shared with the proteomics data) who actually have MRI measurements.
- **Dropped columns with >50% missing values**.
- **Removed outliers using Z-score** (values beyond ±3 standard deviations were set to `NaN`).
- **Imputed missing values using mean** to fill gaps in MRI data.

In [1]:
# load imaging data
# NOTE(review): absolute, user-specific path; the following cells read this file as
# tab-separated text.
mri_file = r"c:\Users\Romina\Downloads\ukb674360 (1).txt"

In [4]:
# Read only the first five rows; the next cell uses this frame's column names to
# decide which columns to load from the full file.
mri_preview = pd.read_csv(mri_file, nrows=5, sep="\t")

In [5]:
# Columns to load: the participant id, fields 31 and 34, plus every header whose
# *string* sorts between '25011-2.0' and '30000-0.0'.
# NOTE(review): this comparison is lexicographic, not numeric, so shorter field ids
# such as '2966-0.0' also pass (they appear in the loaded frame and are dropped by
# name further down). The selection is deliberately left as-is: the later positional
# column re-labelling depends on exactly which columns are loaded.
imaging_cols = list(
    filter(lambda header: '25011-2.0' <= header <= '30000-0.0', mri_preview.columns)
)
columns_to_include = ['eid', '31-0.0', '34-0.0'] + imaging_cols

In [10]:
# Load only the selected columns from the tab-separated file.
# on_bad_lines='skip' drops malformed rows without reporting them.
# NOTE(review): the displayed result has two unnamed index levels and values that sit
# one column away from their labels (e.g. 'eid' shows 1.0) — header and data rows appear
# misaligned; verify against the raw file.
mri_data = pd.read_csv( mri_file, usecols=columns_to_include, sep="\t", on_bad_lines='skip')

In [11]:
# Inspect the frame exactly as loaded (before any index/column repair).
mri_data

Unnamed: 0,Unnamed: 1,eid,31-0.0,34-0.0,2966-0.0,2966-1.0,2966-2.0,2966-3.0,2976-0.0,2976-1.0,2976-2.0,...,27324-3.0,27325-2.0,27325-3.0,27326-2.0,27326-3.0,27327-2.0,27327-3.0,27328-2.0,27328-3.0,30000-0.0
,1000018,1.0,1954,1045.0,,,,,,,,...,,,,,,,,,5.22,
,1000024,1.0,1940,1051.0,,,,,,,,...,,,,,,,,,7.50,
,1000031,1.0,1948,1058.0,,,,,,,,...,,,,,,,,,8.27,
,1000043,1.0,1945,2817.0,,,,,,,,...,16217.0,,11242.0,,915.0,,6353.0,,5.22,
,1000059,1.0,1952,1045.0,,,,,,,,...,,,,,,,,,7.70,
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,4946719,1.0,1942,2815.0,,,,,,,,...,,,,,,,,,5.19,
,4946727,1.0,1943,156.0,,,,,,,,...,,,,,,,,,6.19,
,4946736,1.0,1949,2816.0,,,,57.0,,,,...,,,,,,,,,4.57,
,4946742,1.0,1958,2817.0,,,,,,,,...,,,,,,,,,5.80,


In [12]:
# If the read produced a MultiIndex, move its levels into ordinary columns
# (they show up as 'level_0' and 'level_1' in the next display).
if isinstance(mri_data.index, pd.MultiIndex):
    mri_data = mri_data.reset_index()

In [13]:
# Inspect the frame after the index levels were turned into columns.
mri_data

Unnamed: 0,level_0,level_1,eid,31-0.0,34-0.0,2966-0.0,2966-1.0,2966-2.0,2966-3.0,2976-0.0,...,27324-3.0,27325-2.0,27325-3.0,27326-2.0,27326-3.0,27327-2.0,27327-3.0,27328-2.0,27328-3.0,30000-0.0
0,,1000018,1.0,1954,1045.0,,,,,,...,,,,,,,,,5.22,
1,,1000024,1.0,1940,1051.0,,,,,,...,,,,,,,,,7.50,
2,,1000031,1.0,1948,1058.0,,,,,,...,,,,,,,,,8.27,
3,,1000043,1.0,1945,2817.0,,,,,,...,16217.0,,11242.0,,915.0,,6353.0,,5.22,
4,,1000059,1.0,1952,1045.0,,,,,,...,,,,,,,,,7.70,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394663,,4946719,1.0,1942,2815.0,,,,,,...,,,,,,,,,5.19,
394664,,4946727,1.0,1943,156.0,,,,,,...,,,,,,,,,6.19,
394665,,4946736,1.0,1949,2816.0,,,,57.0,,...,,,,,,,,,4.57,
394666,,4946742,1.0,1958,2817.0,,,,,,...,,,,,,,,,5.80,


In [14]:
# Discard the first former index level (it is empty in every row shown above).
# Raises KeyError if 'level_0' is absent, e.g. when this cell is run twice.
mri_data = mri_data.drop(columns='level_0')

In [16]:
# The second former index level carries the participant ids; give it a temporary
# name until the column labels are shifted into place.
mri_data = mri_data.rename(columns={'level_1': 'temp_eid'})

In [22]:
# Re-label the columns by position: the first column becomes "eid", every following
# column takes the label that previously sat one position to its right, and the last
# column (left without a label by the shift) is called "unknown".
# NOTE(review): this assumes every loaded column is off by exactly one position —
# confirm against the raw file, especially where the selected fields are not adjacent.
old_cols = mri_data.columns.tolist()
num_cols = len(old_cols)

new_cols = [None] * num_cols
new_cols[0] = "eid"
new_cols[1:-1] = old_cols[2:]
new_cols[-1] = "unknown"

mri_data.columns = new_cols

In [39]:
# Columns removed before analysis: the placeholder column, fields 31/34, and a set
# of field ids listed by name; names that are not present are ignored.
columns_to_drop = (
    ["unknown", "31-0.0", "34-0.0"]
    # fields 2966 / 2976 / 2986, instances 0-3
    + [f"{field}-{instance}.0" for field in (2966, 2976, 2986) for instance in range(4)]
    # fields 26302 / 26306, instances 2 and 3
    + [f"{field}-{instance}.0" for field in (26302, 26306) for instance in (2, 3)]
    # fields 26410 .. 26434, instance 0
    + [f"{field}-0.0" for field in range(26410, 26435)]
    + ["30000-0.0"]
)
mri_data = mri_data.drop(columns=columns_to_drop, errors='ignore')

In [40]:
# Show the column labels that remain after the drop, as a plain list.
col_list = mri_data.columns.tolist()
print(col_list)

['eid', '25011-2.0', '25011-3.0', '25012-2.0', '25012-3.0', '25013-2.0', '25013-3.0', '25014-2.0', '25014-3.0', '25015-2.0', '25015-3.0', '25016-2.0', '25016-3.0', '25017-2.0', '25017-3.0', '25018-2.0', '25018-3.0', '25019-2.0', '25019-3.0', '25020-2.0', '25020-3.0', '25021-2.0', '25021-3.0', '25022-2.0', '25022-3.0', '25023-2.0', '25023-3.0', '25024-2.0', '25024-3.0', '25781-2.0', '25781-3.0', '26501-2.0', '26501-3.0', '26502-2.0', '26502-3.0', '26503-2.0', '26503-3.0', '26504-2.0', '26504-3.0', '26505-2.0', '26505-3.0', '26506-2.0', '26506-3.0', '26507-2.0', '26507-3.0', '26508-2.0', '26508-3.0', '26509-2.0', '26509-3.0', '26510-2.0', '26510-3.0', '26511-2.0', '26511-3.0', '26512-2.0', '26512-3.0', '26513-2.0', '26513-3.0', '26514-2.0', '26514-3.0', '26515-2.0', '26515-3.0', '26516-2.0', '26516-3.0', '26517-2.0', '26517-3.0', '26518-2.0', '26518-3.0', '26519-2.0', '26519-3.0', '26520-2.0', '26520-3.0', '26521-2.0', '26521-3.0', '26522-2.0', '26522-3.0', '26523-2.0', '26523-3.0', '265

In [41]:
# Inspect the re-labelled, trimmed MRI table.
mri_data

Unnamed: 0,eid,25011-2.0,25011-3.0,25012-2.0,25012-3.0,25013-2.0,25013-3.0,25014-2.0,25014-3.0,25015-2.0,...,27324-2.0,27324-3.0,27325-2.0,27325-3.0,27326-2.0,27326-3.0,27327-2.0,27327-3.0,27328-2.0,27328-3.0
0,1000018,,,,,,,,,,...,,,,,,,,,,
1,1000024,,,,,,,,,,...,,,,,,,,,,
2,1000031,,,,,,,,,,...,,,,,,,,,,
3,1000043,,,7232.0,,3342.0,,3595.0,,5108.0,...,11672.0,,16217.0,,11242.0,,915.0,,6353.0,
4,1000059,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394663,4946719,,,,,,,,,,...,,,,,,,,,,
394664,4946727,,,,,,,,,,...,,,,,,,,,,
394665,4946736,,,,,,,,,,...,,,,,,,,,,
394666,4946742,,,,,,,,,,...,,,,,,,,,,


In [42]:
# Persist the re-labelled MRI table to the current working directory, without the
# row index.
mri_data.to_csv("correct but original.csv", index=False)
print("File saved.")

File saved.


In [76]:
# Reload the preprocessed proteomics table from disk.
# NOTE(review): absolute, user-specific path, and it reuses the name proteomics_file
# that earlier pointed at the raw CSV. The line echoed in the saved output looks like
# the tail of a pandas warning — confirm.
proteomics_file = r"C:\Users\Romina\OneDrive\Desktop\Project\Clean Notebooks and files\proteomics_preprocessed.csv"
proteomics_df = pd.read_csv(proteomics_file)

  proteomics_df = pd.read_csv(proteomics_file)


In [75]:
# Reload the MRI table from CSV (presumably the file written by the earlier to_csv
# cell — verify the folder). The path is passed directly so `mri_data` only ever
# refers to the DataFrame.
mri_data = pd.read_csv(
    r"C:\Users\Romina\OneDrive\Desktop\Project\Clean Notebooks and files\correct but original.csv"
)

In [77]:
# keep only shared participants
# Participant ids that occur in both the proteomics table and the MRI table.
shared_eids = set(proteomics_df['eid']) & set(mri_data['eid'])
print(f"Number of shared EIDs: {len(shared_eids)}")

Number of shared EIDs: 41305


In [78]:
# Restrict the MRI table to rows whose id is also present in the proteomics data.
mri_data_shared = mri_data.loc[mri_data['eid'].isin(shared_eids)]

In [79]:
# Report (rows, columns) of the MRI table restricted to shared participants.
print("MRI shape (after intersecting with proteomics):", mri_data_shared.shape)

MRI shape (after intersecting with proteomics): (41305, 601)


In [80]:
# Take an explicit copy so the column added in the next cell is written to an
# independent frame rather than to a filtered selection of mri_data.
mri_data_shared = mri_data_shared.copy()

In [81]:
# A participant counts as having MRI data when field 26518 is present for at
# least one of the two instances (2.0 or 3.0).
mri_data_shared['HAS_MRI'] = mri_data_shared[['26518-2.0', '26518-3.0']].notna().any(axis=1)

In [82]:
# Number of flagged participants (True counts as 1) and number of shared participants.
with_mri_count = mri_data_shared['HAS_MRI'].sum()
total_subj = mri_data_shared.shape[0]

In [83]:
# Report how many of the shared participants carry the HAS_MRI flag.
print(f"Out of {total_subj} shared participants, {with_mri_count} have MRI data.")

Out of 41305 shared participants, 4698 have MRI data.


In [84]:
# preprocessing (MRI data)
# Continue only with participants flagged as having MRI measurements; copy so the
# following steps do not touch mri_data_shared.
has_mri_mask = mri_data_shared['HAS_MRI']
mri_data_preprocessed = mri_data_shared.loc[has_mri_mask].copy()
print(f"MRI data shape after filtering: {mri_data_preprocessed.shape}")

MRI data shape after filtering: (4698, 602)


In [85]:
#Drop Columns With >50% Missing Values
# A column is kept when its number of missing entries does not exceed half the rows.
threshold = 0.5 * len(mri_data_preprocessed)
missing_counts = mri_data_preprocessed.isna().sum()
mri_data_preprocessed = mri_data_preprocessed.loc[:, ~(missing_counts > threshold)]
print(f"Shape after removing columns with >50% missing data: {mri_data_preprocessed.shape}")

Shape after removing columns with >50% missing data: (4698, 301)


In [87]:
# Remove Outliers Using Z-Score (±3 SD)
# Every feature except the id and the HAS_MRI flag is screened.
mri_feature_cols = mri_data_preprocessed.columns.difference(['eid', 'HAS_MRI'])
mri_features = mri_data_preprocessed[mri_feature_cols]
# Compute z-scores with pandas so that missing values are skipped.
# scipy.stats.zscore propagates NaN by default: a single missing value turns the
# whole column's z-scores into NaN, `abs(z) > 3` is then never true, and no outlier
# is removed from any column that still contains gaps (columns with up to 50%
# missing values are kept by the previous step). ddof=0 matches scipy's default.
z_scores = (mri_features - mri_features.mean()) / mri_features.std(ddof=0)
# Values more than 3 SD from the column mean become NaN; they are imputed afterwards.
mri_data_preprocessed[mri_feature_cols] = mri_features.mask(z_scores.abs() > 3, np.nan)
print("Outliers removed using Z-score (set to NaN)")


Outliers removed using Z-score (set to NaN)


In [88]:
# Impute Missing Values Using Mean
# The whole frame goes through the imputer, including 'eid' and the boolean
# HAS_MRI flag (which therefore comes back as 1.0 in the result).
imputer = SimpleImputer(strategy="mean")
imputed_matrix = imputer.fit_transform(mri_data_preprocessed)
mri_data_imputed = pd.DataFrame(imputed_matrix, columns=mri_data_preprocessed.columns)
# Put the original ids back positionally so they keep their original dtype.
mri_data_imputed['eid'] = mri_data_preprocessed['eid'].to_numpy()
print(f"Missing values imputed using column-wise mean")


Missing values imputed using column-wise mean


In [89]:
# Inspect the imputed MRI table (one row per participant with MRI data).
mri_data_imputed

Unnamed: 0,eid,25012-2.0,25013-2.0,25014-2.0,25015-2.0,25016-2.0,25017-2.0,25018-2.0,25019-2.0,25020-2.0,...,27320-2.0,27321-2.0,27322-2.0,27323-2.0,27324-2.0,27325-2.0,27326-2.0,27327-2.0,27328-2.0,HAS_MRI
0,1000043,7232.0,3342.0,3595.0,5108.0,4771.0,1704.0,1971.0,3283.0,3961.0,...,11092.0,2790.0,14412.0,27460.0,11672.0,16217.0,11242.0,915.0,6353.0,1.0
1,1000773,7326.0,3239.0,3514.0,5147.0,5123.0,1907.0,1983.0,4021.0,3853.0,...,12804.0,3088.0,13278.0,27148.0,13325.0,18054.0,10632.0,966.0,7236.0,1.0
2,1001606,8641.0,3666.0,4154.0,4430.0,4952.0,1689.0,1411.0,3966.0,4689.0,...,11743.0,3100.0,12231.0,33313.0,12440.0,17936.0,9731.0,996.0,7434.0,1.0
3,1003296,7181.0,3189.0,3416.0,4320.0,4592.0,1779.0,1816.0,3860.0,3903.0,...,10714.0,2972.0,11246.0,25722.0,9534.0,15631.0,9995.0,888.0,6016.0,1.0
4,1003897,6717.0,3567.0,3517.0,3833.0,4272.0,1114.0,1834.0,3456.0,3581.0,...,11405.0,2509.0,10598.0,25418.0,11603.0,15049.0,10231.0,938.0,5893.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4693,4938629,6915.0,3167.0,3029.0,4054.0,3835.0,1486.0,1544.0,3517.0,3055.0,...,10536.0,2249.0,13262.0,31694.0,10253.0,14903.0,8685.0,983.0,6546.0,1.0
4694,4938712,7333.0,2736.0,2961.0,4634.0,4548.0,1610.0,1659.0,3912.0,3842.0,...,11459.0,2406.0,10918.0,26404.0,11563.0,16986.0,11000.0,907.0,6627.0,1.0
4695,4942598,6857.0,4023.0,4211.0,5550.0,5706.0,1609.0,1782.0,3586.0,3633.0,...,11124.0,2642.0,14022.0,30926.0,11406.0,15589.0,10012.0,943.0,5934.0,1.0
4696,4942789,6918.0,3239.0,3268.0,4323.0,4205.0,1578.0,1663.0,3645.0,3645.0,...,10603.0,2657.0,11304.0,28588.0,11818.0,16217.0,10441.0,926.0,6462.0,1.0


**Rename MRI Column Names**
- Converted original **numeric UK Biobank column names** into **descriptive labels**

In [90]:
rename_map = {
    "25011-2.0":"Volume of thalamus (LH)","25011-3.0":"Volume of thalamus (LH)","25012-2.0":"Volume of thalamus (RH)","25012-3.0":"Volume of thalamus (RH)","25013-2.0":"Volume of caudate (LH)","25013-3.0":"Volume of caudate (LH)","25014-2.0":"Volume of caudate (RH)","25014-3.0":"Volume of caudate (RH)","25015-2.0":"Volume of putamen (LH)",
    "25015-3.0":"Volume of putamen (LH)","25016-2.0":"Volume of putamen (RH)","25016-3.0":"Volume of putamen (RH)","25017-2.0":"Volume of pallidum (LH)","25017-3.0":"Volume of pallidum (LH)","25018-2.0":"Volume of pallidum (RH)","25018-3.0":"Volume of pallidum (RH)","25019-2.0":"Volume of hippocampus (LH)","25019-3.0":"Volume of hippocampus (LH)",
    "25020-2.0":"Volume of hippocampus (RH)","25020-3.0":"Volume of hippocampus (RH)","25021-2.0":"Volume of amygdala (LH)","25021-3.0":"Volume of amygdala (LH)","25022-2.0":"Volume of amygdala (RH)","25022-3.0":"Volume of amygdala (RH)","25023-2.0":"Volume of accumbens (LH)","25023-3.0":"Volume of accumbens (LH)","25024-2.0":"Volume of accumbens (RH)",
    "25024-3.0":"Volume of accumbens (RH)","25781-2.0":"Total volume of white matter hyperintensities (T1/T2_FLAIR)","25781-3.0":"Total volume of white matter hyperintensities (T1/T2_FLAIR)","26501-2.0":"Mean intensity of 3rd-Ventricle (whole brain)","26501-3.0":"Mean intensity of 3rd-Ventricle (whole brain)","26502-2.0":"Mean intensity of 4th-Ventricle (whole brain)",
    "26502-3.0":"Mean intensity of 4th-Ventricle (whole brain)","26503-2.0":"Mean intensity of 5th-Ventricle (whole brain)","26503-3.0":"Mean intensity of 5th-Ventricle (whole brain)","26504-2.0":"Mean intensity of Brain-Stem (whole brain)","26504-3.0":"Mean intensity of Brain-Stem (whole brain)","26505-2.0":"Mean intensity of CSF (whole brain)",
    "26505-3.0":"Mean intensity of CSF (whole brain)","26506-2.0":"Mean intensity of WM-hypointensities (whole brain)","26506-3.0":"Mean intensity of WM-hypointensities (whole brain)","26507-2.0":"Mean intensity of non-WM-hypointensities (whole brain)","26507-3.0":"Mean intensity of non-WM-hypointensities (whole brain)",
    "26508-2.0":"Mean intensity of Optic-Chiasm (whole brain)","26508-3.0":"Mean intensity of Optic-Chiasm (whole brain)","26509-2.0":"Mean intensity of CC-Posterior (whole brain)","26509-3.0":"Mean intensity of CC-Posterior (whole brain)","26510-2.0":"Mean intensity of CC-Mid-Posterior (whole brain)","26510-3.0":"Mean intensity of CC-Mid-Posterior (whole brain)",
    "26511-2.0":"Mean intensity of CC-Central (whole brain)","26511-3.0":"Mean intensity of CC-Central (whole brain)","26512-2.0":"Mean intensity of CC-Mid-Anterior (whole brain)","26512-3.0":"Mean intensity of CC-Mid-Anterior (whole brain)","26513-2.0":"Mean intensity of CC-Anterior (whole brain)","26513-3.0":"Mean intensity of CC-Anterior (whole brain)",
    "26514-2.0":"Volume of BrainSeg (whole brain)","26514-3.0":"Volume of BrainSeg (whole brain)","26515-2.0":"Volume of BrainSegNotVent (whole brain)","26515-3.0":"Volume of BrainSegNotVent (whole brain)","26516-2.0":"Volume of BrainSegNotVentSurf (whole brain)","26516-3.0":"Volume of BrainSegNotVentSurf (whole brain)","26517-2.0":"Volume of SubCortGray (whole brain)",
    "26517-3.0":"Volume of SubCortGray (whole brain)","26518-2.0":"Volume of TotalGray (whole brain)","26518-3.0":"Volume of TG","26519-2.0":"Volume of SupraTentorial (whole brain)","26519-3.0":"Volume of SupraTentorial (whole brain)","26520-2.0":"Volume of SupraTentorialNotVent (whole brain)","26520-3.0":"Volume of SupraTentorialNotVent (whole brain)",
    "26521-2.0":"Volume of EstimatedTotalIntraCranial (whole brain)","26521-3.0":"Volume of EstimatedTotalIntraCranial (whole brain)","26522-2.0":"Volume of VentricleChoroid (whole brain)","26522-3.0":"Volume of VentricleChoroid (whole brain)","26523-2.0":"Volume of 3rd-Ventricle (whole brain)","26523-3.0":"Volume of 3rd-Ventricle (whole brain)",
    "26524-2.0":"Volume of 4th-Ventricle (whole brain)","26524-3.0":"Volume of 4th-Ventricle (whole brain)","26525-2.0":"Volume of 5th-Ventricle (whole brain)","26525-3.0":"Volume of 5th-Ventricle (whole brain)","26526-2.0":"Volume of Brain-Stem (whole brain)","26526-3.0":"Volume of Brain-Stem (whole brain)","26527-2.0":"Volume of CSF (whole brain)",
    "26527-3.0":"Volume of CSF (whole brain)","26528-2.0":"Volume of WM-hypointensities (whole brain)","26528-3.0":"Volume of WM-hypointensities (whole brain)","26529-2.0":"Volume of non-WM-hypointensities (whole brain)","26529-3.0":"Volume of non-WM-hypointensities (whole brain)","26530-2.0":"Volume of Optic-Chiasm (whole brain)",
    "26530-3.0":"Volume of Optic-Chiasm (whole brain)","26531-2.0":"Volume of CC-Posterior (whole brain)","26531-3.0":"Volume of CC-Posterior (whole brain)","26532-2.0":"Volume of CC-Mid-Posterior (whole brain)","26532-3.0":"Volume of CC-Mid-Posterior (whole brain)","26533-2.0":"Volume of CC-Central (whole brain)","26533-3.0":"Volume of CC-Central (whole brain)",
    "26534-2.0":"Volume of CC-Mid-Anterior (whole brain)","26534-3.0":"Volume of CC-Mid-Anterior (whole brain)","26535-2.0":"Volume of CC-Anterior (whole brain)","26535-3.0":"Volume of CC-Anterior (whole brain)","26536-2.0":"Volume-ratio of BrainSegVol-to-eTIV (whole brain)","26536-3.0":"Volume-ratio of BrainSegVol-to-eTIV (whole brain)",
    "26537-2.0":"Volume-ratio of MaskVol-to-eTIV (whole brain)","26537-3.0":"Volume-ratio of MaskVol-to-eTIV (whole brain)","26538-2.0":"Mean intensity of Lateral-Ventricle (LH)","26538-3.0":"Mean intensity of Lateral-Ventricle (LH)","26539-2.0":"Mean intensity of Inf-Lat-Vent (LH)","26539-3.0":"Mean intensity of Inf-Lat-Vent (LH)",
    "26540-2.0":"Mean intensity of Cerebellum-White-Matter (LH)","26540-3.0":"Mean intensity of Cerebellum-White-Matter (LH)","26541-2.0":"Mean intensity of Cerebellum-Cortex (LH)","26541-3.0":"Mean intensity of Cerebellum-Cortex (LH)","26542-2.0":"Mean intensity of Thalamus-Proper (LH)","26542-3.0":"Mean intensity of Thalamus-Proper (LH)",
    "26543-2.0":"Mean intensity of Caudate (LH)","26543-3.0":"Mean intensity of Caudate (LH)","26544-2.0":"Mean intensity of Putamen (LH)","26544-3.0":"Mean intensity of Putamen (LH)","26545-2.0":"Mean intensity of Pallidum (LH)","26545-3.0":"Mean intensity of Pallidum (LH)","26546-2.0":"Mean intensity of Hippocampus (LH)","26546-3.0":"Mean intensity of Hippocampus (LH)",
    "26547-2.0":"Mean intensity of Amygdala (LH)","26547-3.0":"Mean intensity of Amygdala (LH)","26548-2.0":"Mean intensity of Accumbens-area (LH)","26548-3.0":"Mean intensity of Accumbens-area (LH)","26549-2.0":"Mean intensity of VentralDC (LH)","26549-3.0":"Mean intensity of VentralDC (LH)","26550-2.0":"Mean intensity of vessel (LH)",
    "26550-3.0":"Mean intensity of vessel (LH)","26551-2.0":"Mean intensity of choroid-plexus (LH)","26551-3.0":"Mean intensity of choroid-plexus (LH)","26552-2.0":"Volume of Cortex (LH)","26552-3.0":"Volume of Cortex (LH)","26553-2.0":"Volume of CerebralWhiteMatter (LH)","26553-3.0":"Volume of CerebralWhiteMatter (LH)","26554-2.0":"Volume of Lateral-Ventricle (LH)",
    "26554-3.0":"Volume of Lateral-Ventricle (LH)","26555-2.0":"Volume of Inf-Lat-Vent (LH)","26555-3.0":"Volume of Inf-Lat-Vent (LH)","26556-2.0":"Volume of Cerebellum-White-Matter (LH)","26556-3.0":"Volume of Cerebellum-White-Matter (LH)","26557-2.0":"Volume of Cerebellum-Cortex (LH)","26557-3.0":"Volume of Cerebellum-Cortex (LH)",
    "26558-2.0":"Volume of Thalamus-Proper (LH)","26558-3.0":"Volume of Thalamus-Proper (LH)","26559-2.0":"Volume of Caudate (LH)","26559-3.0":"Volume of Caudate (LH)","26560-2.0":"Volume of Putamen (LH)","26560-3.0":"Volume of Putamen (LH)","26561-2.0":"Volume of Pallidum (LH)","26561-3.0":"Volume of Pallidum (LH)","26562-2.0":"Volume of Hippocampus (LH)",
    "26562-3.0":"Volume of Hippocampus (LH)","26563-2.0":"Volume of Amygdala (LH)","26563-3.0":"Volume of Amygdala (LH)","26564-2.0":"Volume of Accumbens-area (LH)","26564-3.0":"Volume of Accumbens-area (LH)","26565-2.0":"Volume of VentralDC (LH)","26565-3.0":"Volume of VentralDC (LH)","26566-2.0":"Volume of vessel (LH)","26566-3.0":"Volume of vessel (LH)",
    "26567-2.0":"Volume of choroid-plexus (LH)","26567-3.0":"Volume of choroid-plexus (LH)","26568-2.0":"Number of HolesBeforeFixing (LH)","26568-3.0":"Number of HolesBeforeFixing (LH)","27143-2.0":"Area of caudalanteriorcingulate (LH)","27143-3.0":"Area of caudalanteriorcingulate (LH)","27144-2.0":"Area of caudalmiddlefrontal (LH)",
    "27144-3.0":"Area of caudalmiddlefrontal (LH)","27145-2.0":"Area of cuneus (LH)","27145-3.0":"Area of cuneus (LH)","27146-2.0":"Area of entorhinal (LH)","27146-3.0":"Area of entorhinal (LH)","27147-2.0":"Area of fusiform (LH)","27147-3.0":"Area of fusiform (LH)","27148-2.0":"Area of inferiorparietal (LH)","27148-3.0":"Area of inferiorparietal (LH)",
    "27149-2.0":"Area of inferiortemporal (LH)","27149-3.0":"Area of inferiortemporal (LH)","27150-2.0":"Area of isthmuscingulate (LH)","27150-3.0":"Area of isthmuscingulate (LH)","27151-2.0":"Area of lateraloccipital (LH)","27151-3.0":"Area of lateraloccipital (LH)","27152-2.0":"Area of lateralorbitofrontal (LH)","27152-3.0":"Area of lateralorbitofrontal (LH)",
    "27153-2.0":"Area of lingual (LH)","27153-3.0":"Area of lingual (LH)","27154-2.0":"Area of medialorbitofrontal (LH)","27154-3.0":"Area of medialorbitofrontal (LH)","27155-2.0":"Area of middletemporal (LH)","27155-3.0":"Area of middletemporal (LH)","27156-2.0":"Area of parahippocampal (LH)","27156-3.0":"Area of parahippocampal (LH)",
    "27157-2.0":"Area of paracentral (LH)","27157-3.0":"Area of paracentral (LH)","27158-2.0":"Area of parsopercularis (LH)","27158-3.0":"Area of parsopercularis (LH)","27159-2.0":"Area of parsorbitalis (LH)","27159-3.0":"Area of parsorbitalis (LH)","27160-2.0":"Area of parstriangularis (LH)","27160-3.0":"Area of parstriangularis (LH)",
    "27161-2.0":"Area of pericalcarine (LH)","27161-3.0":"Area of pericalcarine (LH)","27162-2.0":"Area of postcentral (LH)","27162-3.0":"Area of postcentral (LH)","27163-2.0":"Area of posteriorcingulate (LH)","27163-3.0":"Area of posteriorcingulate (LH)","27164-2.0":"Area of precentral (LH)","27164-3.0":"Area of precentral (LH)","27165-2.0":"Area of precuneus (LH)",
    "27165-3.0":"Area of precuneus (LH)","27166-2.0":"Area of rostralanteriorcingulate (LH)","27166-3.0":"Area of rostralanteriorcingulate (LH)","27167-2.0":"Area of rostralmiddlefrontal (LH)","27167-3.0":"Area of rostralmiddlefrontal (LH)","27168-2.0":"Area of superiorfrontal (LH)","27168-3.0":"Area of superiorfrontal (LH)","27169-2.0":"Area of superiorparietal (LH)",
    "27169-3.0":"Area of superiorparietal (LH)","27170-2.0":"Area of superiortemporal (LH)","27170-3.0":"Area of superiortemporal (LH)","27171-2.0":"Area of supramarginal (LH)","27171-3.0":"Area of supramarginal (LH)","27172-2.0":"Area of transversetemporal (LH)","27172-3.0":"Area of transversetemporal (LH)","27173-2.0":"Area of insula (LH)","27173-3.0":"Area of insula (LH)",
    "27174-2.0":"Mean thickness of caudalanteriorcingulate (LH)","27174-3.0":"Mean thickness of caudalanteriorcingulate (LH)","27175-2.0":"Mean thickness of caudalmiddlefrontal (LH)","27175-3.0":"Mean thickness of caudalmiddlefrontal (LH)","27176-2.0":"Mean thickness of cuneus (LH)","27176-3.0":"Mean thickness of cuneus (LH)","27177-2.0":"Mean thickness of entorhinal (LH)",
    "27177-3.0":"Mean thickness of entorhinal (LH)","27178-2.0":"Mean thickness of fusiform (LH)","27178-3.0":"Mean thickness of fusiform (LH)","27179-2.0":"Mean thickness of inferiorparietal (LH)","27179-3.0":"Mean thickness of inferiorparietal (LH)","27180-2.0":"Mean thickness of inferiortemporal (LH)","27180-3.0":"Mean thickness of inferiortemporal (LH)",
    "27181-2.0":"Mean thickness of isthmuscingulate (LH)","27181-3.0":"Mean thickness of isthmuscingulate (LH)","27182-2.0":"Mean thickness of lateraloccipital (LH)","27182-3.0":"Mean thickness of lateraloccipital (LH)","27183-2.0":"Mean thickness of lateralorbitofrontal (LH)","27183-3.0":"Mean thickness of lateralorbitofrontal (LH)",
    "27184-2.0":"Mean thickness of lingual (LH)","27184-3.0":"Mean thickness of lingual (LH)","27185-2.0":"Mean thickness of medialorbitofrontal (LH)","27185-3.0":"Mean thickness of medialorbitofrontal (LH)","27186-2.0":"Mean thickness of middletemporal (LH)","27186-3.0":"Mean thickness of middletemporal (LH)","27187-2.0":"Mean thickness of parahippocampal (LH)",
    "27187-3.0":"Mean thickness of parahippocampal (LH)","27188-2.0":"Mean thickness of paracentral (LH)","27188-3.0":"Mean thickness of paracentral (LH)","27189-2.0":"Mean thickness of parsopercularis (LH)","27189-3.0":"Mean thickness of parsopercularis (LH)","27190-2.0":"Mean thickness of parsorbitalis (LH)","27190-3.0":"Mean thickness of parsorbitalis (LH)",
    "27191-2.0":"Mean thickness of parstriangularis (LH)","27191-3.0":"Mean thickness of parstriangularis (LH)","27192-2.0":"Mean thickness of pericalcarine (LH)","27192-3.0":"Mean thickness of pericalcarine (LH)","27193-2.0":"Mean thickness of postcentral (LH)","27193-3.0":"Mean thickness of postcentral (LH)","27194-2.0":"Mean thickness of posteriorcingulate (LH)",
    "27194-3.0":"Mean thickness of posteriorcingulate (LH)","27195-2.0":"Mean thickness of precentral (LH)","27195-3.0":"Mean thickness of precentral (LH)","27196-2.0":"Mean thickness of precuneus (LH)","27196-3.0":"Mean thickness of precuneus (LH)","27197-2.0":"Mean thickness of rostralanteriorcingulate (LH)","27197-3.0":"Mean thickness of rostralanteriorcingulate (LH)",
    "27198-2.0":"Mean thickness of rostralmiddlefrontal (LH)","27198-3.0":"Mean thickness of rostralmiddlefrontal (LH)","27199-2.0":"Mean thickness of superiorfrontal (LH)","27199-3.0":"Mean thickness of superiorfrontal (LH)","27200-2.0":"Mean thickness of superiorparietal (LH)","27200-3.0":"Mean thickness of superiorparietal (LH)",
    "27201-2.0":"Mean thickness of superiortemporal (LH)","27201-3.0":"Mean thickness of superiortemporal (LH)","27202-2.0":"Mean thickness of supramarginal (LH)","27202-3.0":"Mean thickness of supramarginal (LH)","27203-2.0":"Mean thickness of transversetemporal (LH)","27203-3.0":"Mean thickness of transversetemporal (LH)","27204-2.0":"Mean thickness of insula (LH)",
    "27204-3.0":"Mean thickness of insula (LH)","27205-2.0":"Volume of caudalanteriorcingulate (LH)","27205-3.0":"Volume of caudalanteriorcingulate (LH)","27206-2.0":"Volume of caudalmiddlefrontal (LH)","27206-3.0":"Volume of caudalmiddlefrontal (LH)","27207-2.0":"Volume of cuneus (LH)","27207-3.0":"Volume of cuneus (LH)","27208-2.0":"Volume of entorhinal (LH)",
    "27208-3.0":"Volume of entorhinal (LH)","27209-2.0":"Volume of fusiform (LH)","27209-3.0":"Volume of fusiform (LH)","27210-2.0":"Volume of inferiorparietal (LH)","27210-3.0":"Volume of inferiorparietal (LH)","27211-2.0":"Volume of inferiortemporal (LH)","27211-3.0":"Volume of inferiortemporal (LH)","27212-2.0":"Volume of isthmuscingulate (LH)",
    "27212-3.0":"Volume of isthmuscingulate (LH)","27213-2.0":"Volume of lateraloccipital (LH)","27213-3.0":"Volume of lateraloccipital (LH)","27214-2.0":"Volume of lateralorbitofrontal (LH)","27214-3.0":"Volume of lateralorbitofrontal (LH)","27215-2.0":"Volume of lingual (LH)","27215-3.0":"Volume of lingual (LH)","27216-2.0":"Volume of medialorbitofrontal (LH)",
    "27216-3.0":"Volume of medialorbitofrontal (LH)","27217-2.0":"Volume of middletemporal (LH)","27217-3.0":"Volume of middletemporal (LH)","27218-2.0":"Volume of parahippocampal (LH)","27218-3.0":"Volume of parahippocampal (LH)","27219-2.0":"Volume of paracentral (LH)","27219-3.0":"Volume of paracentral (LH)","27220-2.0":"Volume of parsopercularis (LH)",
    "27220-3.0":"Volume of parsopercularis (LH)","27221-2.0":"Volume of parsorbitalis (LH)","27221-3.0":"Volume of parsorbitalis (LH)","27222-2.0":"Volume of parstriangularis (LH)","27222-3.0":"Volume of parstriangularis (LH)","27223-2.0":"Volume of pericalcarine (LH)","27223-3.0":"Volume of pericalcarine (LH)","27224-2.0":"Volume of postcentral (LH)",
    "27224-3.0":"Volume of postcentral (LH)","27225-2.0":"Volume of posteriorcingulate (LH)","27225-3.0":"Volume of posteriorcingulate (LH)","27226-2.0":"Volume of precentral (LH)","27226-3.0":"Volume of precentral (LH)","27227-2.0":"Volume of precuneus (LH)","27227-3.0":"Volume of precuneus (LH)","27228-2.0":"Volume of rostralanteriorcingulate (LH)",
    "27228-3.0":"Volume of rostralanteriorcingulate (LH)","27229-2.0":"Volume of rostralmiddlefrontal (LH)","27229-3.0":"Volume of rostralmiddlefrontal (LH)","27230-2.0":"Volume of superiorfrontal (LH)","27230-3.0":"Volume of superiorfrontal (LH)","27231-2.0":"Volume of superiorparietal (LH)","27231-3.0":"Volume of superiorparietal (LH)",
    "27232-2.0":"Volume of superiortemporal (LH)","27232-3.0":"Volume of superiortemporal (LH)","27233-2.0":"Volume of supramarginal (LH)","27233-3.0":"Volume of supramarginal (LH)","27234-2.0":"Volume of transversetemporal (LH)","27234-3.0":"Volume of transversetemporal (LH)","27235-2.0":"Volume of insula (LH)","27235-3.0":"Volume of insula (LH)",
    "27236-2.0":"Area of caudalanteriorcingulate (RH)","27236-3.0":"Area of caudalanteriorcingulate (RH)","27237-2.0":"Area of caudalmiddlefrontal (RH)","27237-3.0":"Area of caudalmiddlefrontal (RH)","27238-2.0":"Area of cuneus (RH)","27238-3.0":"Area of cuneus (RH)","27239-2.0":"Area of entorhinal (RH)","27239-3.0":"Area of entorhinal (RH)",
    "27240-2.0":"Area of fusiform (RH)","27240-3.0":"Area of fusiform (RH)","27241-2.0":"Area of inferiorparietal (RH)","27241-3.0":"Area of inferiorparietal (RH)","27242-2.0":"Area of inferiortemporal (RH)","27242-3.0":"Area of inferiortemporal (RH)","27243-2.0":"Area of isthmuscingulate (RH)","27243-3.0":"Area of isthmuscingulate (RH)",
    "27244-2.0":"Area of lateraloccipital (RH)","27244-3.0":"Area of lateraloccipital (RH)","27245-2.0":"Area of lateralorbitofrontal (RH)","27245-3.0":"Area of lateralorbitofrontal (RH)","27246-2.0":"Area of lingual (RH)","27246-3.0":"Area of lingual (RH)","27247-2.0":"Area of medialorbitofrontal (RH)","27247-3.0":"Area of medialorbitofrontal (RH)",
    "27248-2.0":"Area of middletemporal (RH)","27248-3.0":"Area of middletemporal (RH)","27249-2.0":"Area of parahippocampal (RH)","27249-3.0":"Area of parahippocampal (RH)","27250-2.0":"Area of paracentral (RH)","27250-3.0":"Area of paracentral (RH)","27251-2.0":"Area of parsopercularis (RH)","27251-3.0":"Area of parsopercularis (RH)",
    "27252-2.0":"Area of parsorbitalis (RH)","27252-3.0":"Area of parsorbitalis (RH)","27253-2.0":"Area of parstriangularis (RH)","27253-3.0":"Area of parstriangularis (RH)","27254-2.0":"Area of pericalcarine (RH)","27254-3.0":"Area of pericalcarine (RH)","27255-2.0":"Area of postcentral (RH)","27255-3.0":"Area of postcentral (RH)",
    "27256-2.0":"Area of posteriorcingulate (RH)","27256-3.0":"Area of posteriorcingulate (RH)","27257-2.0":"Area of precentral (RH)","27257-3.0":"Area of precentral (RH)","27258-2.0":"Area of precuneus (RH)","27258-3.0":"Area of precuneus (RH)","27259-2.0":"Area of rostralanteriorcingulate (RH)","27259-3.0":"Area of rostralanteriorcingulate (RH)",
    "27260-2.0":"Area of rostralmiddlefrontal (RH)","27260-3.0":"Area of rostralmiddlefrontal (RH)","27261-2.0":"Area of superiorfrontal (RH)","27261-3.0":"Area of superiorfrontal (RH)","27262-2.0":"Area of superiorparietal (RH)","27262-3.0":"Area of superiorparietal (RH)","27263-2.0":"Area of superiortemporal (RH)","27263-3.0":"Area of superiortemporal (RH)",
    "27264-2.0":"Area of supramarginal (RH)","27264-3.0":"Area of supramarginal (RH)","27265-2.0":"Area of transversetemporal (RH)","27265-3.0":"Area of transversetemporal (RH)","27266-2.0":"Area of insula (RH)","27266-3.0":"Area of insula (RH)","27267-2.0":"Mean thickness of caudalanteriorcingulate (RH)","27267-3.0":"Mean thickness of caudalanteriorcingulate (RH)",
    "27268-2.0":"Mean thickness of caudalmiddlefrontal (RH)","27268-3.0":"Mean thickness of caudalmiddlefrontal (RH)","27269-2.0":"Mean thickness of cuneus (RH)","27269-3.0":"Mean thickness of cuneus (RH)","27270-2.0":"Mean thickness of entorhinal (RH)","27270-3.0":"Mean thickness of entorhinal (RH)","27271-2.0":"Mean thickness of fusiform (RH)",
    "27271-3.0":"Mean thickness of fusiform (RH)","27272-2.0":"Mean thickness of inferiorparietal (RH)","27272-3.0":"Mean thickness of inferiorparietal (RH)","27273-2.0":"Mean thickness of inferiortemporal (RH)","27273-3.0":"Mean thickness of inferiortemporal (RH)","27274-2.0":"Mean thickness of isthmuscingulate (RH)","27274-3.0":"Mean thickness of isthmuscingulate (RH)",
    "27275-2.0":"Mean thickness of lateraloccipital (RH)","27275-3.0":"Mean thickness of lateraloccipital (RH)","27276-2.0":"Mean thickness of lateralorbitofrontal (RH)","27276-3.0":"Mean thickness of lateralorbitofrontal (RH)","27277-2.0":"Mean thickness of lingual (RH)","27277-3.0":"Mean thickness of lingual (RH)","27278-2.0":"Mean thickness of medialorbitofrontal (RH)",
    "27278-3.0":"Mean thickness of medialorbitofrontal (RH)","27279-2.0":"Mean thickness of middletemporal (RH)","27279-3.0":"Mean thickness of middletemporal (RH)","27280-2.0":"Mean thickness of parahippocampal (RH)","27280-3.0":"Mean thickness of parahippocampal (RH)","27281-2.0":"Mean thickness of paracentral (RH)","27281-3.0":"Mean thickness of paracentral (RH)",
    "27282-2.0":"Mean thickness of parsopercularis (RH)","27282-3.0":"Mean thickness of parsopercularis (RH)","27283-2.0":"Mean thickness of parsorbitalis (RH)","27283-3.0":"Mean thickness of parsorbitalis (RH)","27284-2.0":"Mean thickness of parstriangularis (RH)","27284-3.0":"Mean thickness of parstriangularis (RH)","27285-2.0":"Mean thickness of pericalcarine (RH)",
    "27285-3.0":"Mean thickness of pericalcarine (RH)","27286-2.0":"Mean thickness of postcentral (RH)","27286-3.0":"Mean thickness of postcentral (RH)","27287-2.0":"Mean thickness of posteriorcingulate (RH)","27287-3.0":"Mean thickness of posteriorcingulate (RH)","27288-2.0":"Mean thickness of precentral (RH)","27288-3.0":"Mean thickness of precentral (RH)",
    "27289-2.0":"Mean thickness of precuneus (RH)","27289-3.0":"Mean thickness of precuneus (RH)","27290-2.0":"Mean thickness of rostralanteriorcingulate (RH)","27290-3.0":"Mean thickness of rostralanteriorcingulate (RH)","27291-2.0":"Mean thickness of rostralmiddlefrontal (RH)","27291-3.0":"Mean thickness of rostralmiddlefrontal (RH)",
    "27292-2.0":"Mean thickness of superiorfrontal (RH)","27292-3.0":"Mean thickness of superiorfrontal (RH)","27293-2.0":"Mean thickness of superiorparietal (RH)","27293-3.0":"Mean thickness of superiorparietal (RH)","27294-2.0":"Mean thickness of superiortemporal (RH)","27294-3.0":"Mean thickness of superiortemporal (RH)",
    "27295-2.0":"Mean thickness of supramarginal (RH)","27295-3.0":"Mean thickness of supramarginal (RH)","27296-2.0":"Mean thickness of transversetemporal (RH)","27296-3.0":"Mean thickness of transversetemporal (RH)","27297-2.0":"Mean thickness of insula (RH)","27297-3.0":"Mean thickness of insula (RH)","27298-2.0":"Volume of caudalanteriorcingulate (RH)",
    "27298-3.0":"Volume of caudalanteriorcingulate (RH)","27299-2.0":"Volume of caudalmiddlefrontal (RH)","27299-3.0":"Volume of caudalmiddlefrontal (RH)","27300-2.0":"Volume of cuneus (RH)","27300-3.0":"Volume of cuneus (RH)","27301-2.0":"Volume of entorhinal (RH)","27301-3.0":"Volume of entorhinal (RH)","27302-2.0":"Volume of fusiform (RH)",
    "27302-3.0":"Volume of fusiform (RH)","27303-2.0":"Volume of inferiorparietal (RH)","27303-3.0":"Volume of inferiorparietal (RH)","27304-2.0":"Volume of inferiortemporal (RH)","27304-3.0":"Volume of inferiortemporal (RH)","27305-2.0":"Volume of isthmuscingulate (RH)","27305-3.0":"Volume of isthmuscingulate (RH)","27306-2.0":"Volume of lateraloccipital (RH)",
    "27306-3.0":"Volume of lateraloccipital (RH)","27307-2.0":"Volume of lateralorbitofrontal (RH)","27307-3.0":"Volume of lateralorbitofrontal (RH)","27308-2.0":"Volume of lingual (RH)","27308-3.0":"Volume of lingual (RH)","27309-2.0":"Volume of medialorbitofrontal (RH)","27309-3.0":"Volume of medialorbitofrontal (RH)","27310-2.0":"Volume of middletemporal (RH)",
    "27310-3.0":"Volume of middletemporal (RH)","27311-2.0":"Volume of parahippocampal (RH)","27311-3.0":"Volume of parahippocampal (RH)","27312-2.0":"Volume of paracentral (RH)","27312-3.0":"Volume of paracentral (RH)","27313-2.0":"Volume of parsopercularis (RH)","27313-3.0":"Volume of parsopercularis (RH)","27314-2.0":"Volume of parsorbitalis (RH)",
    "27314-3.0":"Volume of parsorbitalis (RH)","27315-2.0":"Volume of parstriangularis (RH)","27315-3.0":"Volume of parstriangularis (RH)","27316-2.0":"Volume of pericalcarine (RH)","27316-3.0":"Volume of pericalcarine (RH)","27317-2.0":"Volume of postcentral (RH)","27317-3.0":"Volume of postcentral (RH)","27318-2.0":"Volume of posteriorcingulate (RH)",
    "27318-3.0":"Volume of posteriorcingulate (RH)","27319-2.0":"Volume of precentral (RH)","27319-3.0":"Volume of precentral (RH)","27320-2.0":"Volume of precuneus (RH)","27320-3.0":"Volume of precuneus (RH)","27321-2.0":"Volume of rostralanteriorcingulate (RH)","27321-3.0":"Volume of rostralanteriorcingulate (RH)","27322-2.0":"Volume of rostralmiddlefrontal (RH)",
    "27322-3.0":"Volume of rostralmiddlefrontal (RH)","27323-2.0":"Volume of superiorfrontal (RH)","27323-3.0":"Volume of superiorfrontal (RH)","27324-2.0":"Volume of superiorparietal (RH)","27324-3.0":"Volume of superiorparietal (RH)","27325-2.0":"Volume of superiortemporal (RH)","27325-3.0":"Volume of superiortemporal (RH)","27326-2.0":"Volume of supramarginal (RH)",
    "27326-3.0":"Volume of supramarginal (RH)","27327-2.0":"Volume of transversetemporal (RH)","27327-3.0":"Volume of transversetemporal (RH)","27328-2.0":"Volume of insula (RH)","27328-3.0":"Volume of insula (RH)"}


In [91]:
# Replace raw field-ID column labels (e.g. "27328-2.0") with the readable names in
# rename_map. Reassigning the result (instead of inplace=True) avoids mutating the
# frame in place, so any other reference to the un-renamed frame is left untouched.
# NOTE(review): rename_map sends the "-2.0" and "-3.0" variants of a field to the
# same label; if both variants are present as columns this yields duplicate column
# names -- confirm only one instance per field survives earlier filtering.
mri_data_imputed = mri_data_imputed.rename(columns=rename_map)

In [92]:
# Display the MRI frame after renaming to check the new column labels.
mri_data_imputed

Unnamed: 0,eid,Volume of thalamus (RH),Volume of caudate (LH),Volume of caudate (RH),Volume of putamen (LH),Volume of putamen (RH),Volume of pallidum (LH),Volume of pallidum (RH),Volume of hippocampus (LH),Volume of hippocampus (RH),...,Volume of precuneus (RH),Volume of rostralanteriorcingulate (RH),Volume of rostralmiddlefrontal (RH),Volume of superiorfrontal (RH),Volume of superiorparietal (RH),Volume of superiortemporal (RH),Volume of supramarginal (RH),Volume of transversetemporal (RH),Volume of insula (RH),HAS_MRI
0,1000043,7232.0,3342.0,3595.0,5108.0,4771.0,1704.0,1971.0,3283.0,3961.0,...,11092.0,2790.0,14412.0,27460.0,11672.0,16217.0,11242.0,915.0,6353.0,1.0
1,1000773,7326.0,3239.0,3514.0,5147.0,5123.0,1907.0,1983.0,4021.0,3853.0,...,12804.0,3088.0,13278.0,27148.0,13325.0,18054.0,10632.0,966.0,7236.0,1.0
2,1001606,8641.0,3666.0,4154.0,4430.0,4952.0,1689.0,1411.0,3966.0,4689.0,...,11743.0,3100.0,12231.0,33313.0,12440.0,17936.0,9731.0,996.0,7434.0,1.0
3,1003296,7181.0,3189.0,3416.0,4320.0,4592.0,1779.0,1816.0,3860.0,3903.0,...,10714.0,2972.0,11246.0,25722.0,9534.0,15631.0,9995.0,888.0,6016.0,1.0
4,1003897,6717.0,3567.0,3517.0,3833.0,4272.0,1114.0,1834.0,3456.0,3581.0,...,11405.0,2509.0,10598.0,25418.0,11603.0,15049.0,10231.0,938.0,5893.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4693,4938629,6915.0,3167.0,3029.0,4054.0,3835.0,1486.0,1544.0,3517.0,3055.0,...,10536.0,2249.0,13262.0,31694.0,10253.0,14903.0,8685.0,983.0,6546.0,1.0
4694,4938712,7333.0,2736.0,2961.0,4634.0,4548.0,1610.0,1659.0,3912.0,3842.0,...,11459.0,2406.0,10918.0,26404.0,11563.0,16986.0,11000.0,907.0,6627.0,1.0
4695,4942598,6857.0,4023.0,4211.0,5550.0,5706.0,1609.0,1782.0,3586.0,3633.0,...,11124.0,2642.0,14022.0,30926.0,11406.0,15589.0,10012.0,943.0,5934.0,1.0
4696,4942789,6918.0,3239.0,3268.0,4323.0,4205.0,1578.0,1663.0,3645.0,3645.0,...,10603.0,2657.0,11304.0,28588.0,11818.0,16217.0,10441.0,926.0,6462.0,1.0


In [93]:
# Write the renamed MRI table to CSV; index=False leaves the row index out of the file.
mri_data_imputed.to_csv(
    path_or_buf="mri_preprocessed_for_4698.csv",
    index=False,
)
print("Final MRI data saved")

Final MRI data saved


**Merge Proteomics & MRI Data**
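- **Used a left join on `eid`** (proteomics table on the left) to keep all **52,700 participants**; MRI columns are `NaN` for participants without MRI data.
- **Added `Imaging` column** for all participants (derived from the `HAS_MRI` flag):
  - **1** → If MRI data exists.
  - **0** → If MRI data is missing.
- **Dropped the helper flag columns** (`HAS_MRI`, `HAS_MEI`) once `Imaging` was created.
- **Final dataset saved** as `merged_proteomics_mri_all.csv`.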

In [94]:
# Left join on "eid": every proteomics row is kept, and MRI columns are populated
# only where a matching eid exists in mri_data_imputed.
merged_data = pd.merge(
    left=proteomics_df,
    right=mri_data_imputed,
    how="left",
    on="eid",
)

In [97]:
# Derive a 0/1 "Imaging" flag from HAS_MRI: missing values become 0, then the
# column is cast to int. In the displayed output HAS_MRI is 1.0 for rows with MRI
# data and empty for rows without.
# NOTE(review): the output below also shows a HAS_MEI column that no cell visible
# here creates -- presumably left over from an earlier, since-removed cell; confirm
# the notebook still behaves as intended on a fresh kernel.
merged_data["Imaging"] = merged_data["HAS_MRI"].fillna(0).astype(int)
# Bare last expression so the notebook renders the frame.
merged_data

Unnamed: 0,eid,Sex,Ethnic_background,Age_at_recruitment,Diagnoses_main_ICD10,Diagnoses_main_ICD10_1,Diagnoses_main_ICD10_2,Diagnoses_main_ICD10_3,Diagnoses_main_ICD10_4,Diagnoses_main_ICD10_5,...,Volume of rostralmiddlefrontal (RH),Volume of superiorfrontal (RH),Volume of superiorparietal (RH),Volume of superiortemporal (RH),Volume of supramarginal (RH),Volume of transversetemporal (RH),Volume of insula (RH),HAS_MRI,HAS_MEI,Imaging
0,1000024,0,1001.0,67,F019,G309,I48,I620,I639,M169,...,,,,,,,,,0,0
1,1000043,1,1001.0,65,,,,,,,...,14412.0,27460.0,11672.0,16217.0,11242.0,915.0,6353.0,1.0,1,1
2,1000156,0,1001.0,62,E871,H258,H269,R074,,,...,,,,,,,,,0,0
3,1000217,1,1003.0,63,C060,I269,R509,R69,,,...,,,,,,,,,0,0
4,1000309,1,4002.0,60,,,,,,,...,,,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52695,6023140,0,1001.0,55,H264,H269,H521,I259,I319,M201,...,,,,,,,,,0,0
52696,6023206,1,2004.0,64,C447,D509,E831,G562,I839,K219,...,,,,,,,,,0,0
52697,6023457,1,1001.0,48,D125,I841,I848,K621,K640,Q433,...,,,,,,,,,0,0
52698,6023548,1,1001.0,62,C155,C159,C160,C675,C679,K918,...,,,,,,,,,0,0


In [98]:
# Remove the helper flag columns now that "Imaging" exists. With errors='ignore',
# a listed label that is not present is skipped instead of raising.
merged_data = merged_data.drop(
    labels=['HAS_MRI', 'HAS_MEI'],
    axis="columns",
    errors='ignore',
)

In [99]:
# Display the merged frame to confirm the helper flag columns are gone and
# "Imaging" is the last column.
merged_data

Unnamed: 0,eid,Sex,Ethnic_background,Age_at_recruitment,Diagnoses_main_ICD10,Diagnoses_main_ICD10_1,Diagnoses_main_ICD10_2,Diagnoses_main_ICD10_3,Diagnoses_main_ICD10_4,Diagnoses_main_ICD10_5,...,Volume of precuneus (RH),Volume of rostralanteriorcingulate (RH),Volume of rostralmiddlefrontal (RH),Volume of superiorfrontal (RH),Volume of superiorparietal (RH),Volume of superiortemporal (RH),Volume of supramarginal (RH),Volume of transversetemporal (RH),Volume of insula (RH),Imaging
0,1000024,0,1001.0,67,F019,G309,I48,I620,I639,M169,...,,,,,,,,,,0
1,1000043,1,1001.0,65,,,,,,,...,11092.0,2790.0,14412.0,27460.0,11672.0,16217.0,11242.0,915.0,6353.0,1
2,1000156,0,1001.0,62,E871,H258,H269,R074,,,...,,,,,,,,,,0
3,1000217,1,1003.0,63,C060,I269,R509,R69,,,...,,,,,,,,,,0
4,1000309,1,4002.0,60,,,,,,,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52695,6023140,0,1001.0,55,H264,H269,H521,I259,I319,M201,...,,,,,,,,,,0
52696,6023206,1,2004.0,64,C447,D509,E831,G562,I839,K219,...,,,,,,,,,,0
52697,6023457,1,1001.0,48,D125,I841,I848,K621,K640,Q433,...,,,,,,,,,,0
52698,6023548,1,1001.0,62,C155,C159,C160,C675,C679,K918,...,,,,,,,,,,0


In [101]:
# Save the merged table to CSV without the row index, then report its
# (rows, columns) shape.
merged_data.to_csv(
    path_or_buf="merged_proteomics_mri_all.csv",
    index=False,
)
print(f"Final merged dataset saved {merged_data.shape}")

Final merged dataset saved (52700, 3776)
