In [2]:
import pandas as pd

prot_df = pd.read_csv('data/train_proteins.csv')
pivoted = prot_df.pivot(index="visit_id", columns="UniProt", values="NPX")

In [4]:
pivoted.columns

Index(['O00391', 'O00533', 'O00584', 'O14498', 'O14773', 'O14791', 'O15240',
       'O15394', 'O43505', 'O60888',
       ...
       'Q9HDC9', 'Q9NQ79', 'Q9NYU2', 'Q9UBR2', 'Q9UBX5', 'Q9UHG2', 'Q9UKV8',
       'Q9UNU6', 'Q9Y646', 'Q9Y6R7'],
      dtype='object', name='UniProt', length=227)

In [15]:
import numpy as np



# Normalize each column separately
for col in pivoted.columns:

    col_data = pivoted[col].dropna() #ignore nans

    mean = col_data.mean()
    std = col_data.std(ddof=1)
    
    pivoted[col] = (pivoted[col] - mean) / std


In [34]:
import pandas as pd

# Split the visit_id column on "_"
new_cols = pivoted['visit_id'].str.split("_", expand=True)
new_cols.columns = ['patient_id', 'visit_month']

# Add the new columns to the DataFrame
prot_df = pivoted.assign(patient_id=new_cols['patient_id'], visit_month=new_cols['visit_month'])

# Print the updated DataFrame
print(prot_df.head())


UniProt  visit_id    O00391    O00533    O00584    O14498    O14773    O14791   
0         10053_0 -0.900600 -0.461720       NaN       NaN -1.470450 -0.444028  \
1        10053_12 -0.417843 -0.320609       NaN       NaN       NaN       NaN   
2        10053_18  0.566004 -0.016030 -1.810210 -0.329170       NaN -0.553583   
3        10138_12  0.340410 -0.070349 -1.619825 -0.013312  0.672284  2.636155   
4        10138_24  0.128482  0.046548 -2.055741 -1.197716  1.594137 -0.297542   

UniProt    O15240    O15394    O43505  ...    Q9NYU2    Q9UBR2    Q9UBX5   
0       -0.593125 -1.667750 -0.354558  ... -0.731008       NaN -0.987070  \
1        0.975488 -1.668428 -0.392920  ...       NaN       NaN -0.551562   
2        0.004869 -1.612750 -0.344776  ... -0.032686       NaN  0.189582   
3        0.414596  0.176453  0.105514  ...  1.809923 -0.127163  0.110313   
4        0.343887  0.073175  0.568006  ...  2.720630  0.401529 -0.790307   

UniProt    Q9UHG2    Q9UKV8    Q9UNU6    Q9Y646    Q9Y6R

In [46]:
prot_df.to_csv('data/protein_data_normalized.csv', index=False)

In [71]:
prot_df.head()

UniProt,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7,patient_id,visit_month
0,10053_0,-0.9006,-0.46172,,,-1.47045,-0.444028,-0.593125,-1.66775,-0.354558,...,-0.731008,,-0.98707,-0.509656,-0.155142,-0.66567,,-0.177409,10053,0
1,10053_12,-0.417843,-0.320609,,,,,0.975488,-1.668428,-0.39292,...,,,-0.551562,-0.571055,-0.164081,,-1.782304,0.44586,10053,12
2,10053_18,0.566004,-0.01603,-1.81021,-0.32917,,-0.553583,0.004869,-1.61275,-0.344776,...,-0.032686,,0.189582,0.150555,-0.383094,-1.363066,,0.763531,10053,18
3,10138_12,0.34041,-0.070349,-1.619825,-0.013312,0.672284,2.636155,0.414596,0.176453,0.105514,...,1.809923,-0.127163,0.110313,-0.006194,1.41226,0.966993,0.262118,-1.238675,10138,12
4,10138_24,0.128482,0.046548,-2.055741,-1.197716,1.594137,-0.297542,0.343887,0.073175,0.568006,...,2.72063,0.401529,-0.790307,-0.522044,-0.402934,-1.084349,-0.806772,-1.376184,10138,24


In [37]:


patient_dict = {}
for patient_id in prot_df.patient_id.unique():
    patient_dict[patient_id] = prot_df.groupby('patient_id').get_group(patient_id)

all_months = prot_df['visit_month'].unique()
all_patients = patient_dict.keys()

In [137]:
import pandas as pd
all_months_patient_dict = {}
# assuming patient_dict is a dictionary of patient DataFrames
for patient_id, patient_df in patient_dict.items():
    # create a new dataframe for each patient, with all_months as the index and the columns from the patient_df
    df = pd.DataFrame(index=all_months, columns=patient_df.columns)
    
    # iterate over the rows of the patient_df
    for _, row in patient_df.iterrows():
        # get the month number from the 'visit_month' column
        month_num = row['visit_month']  
        
        # check if the month is in the patient's proteomics dataframe
        if month_num in patient_df['visit_month'].values:
            # get the proteomics data for that month and overwrite the corresponding row in the new df
            prot_data = patient_df.loc[patient_df['visit_month'] == month_num].iloc[0]
            df.loc[month_num] = prot_data
        
    # Convert the index to integers
    df.index = df.index.astype(int)

    # Sort the DataFrame by index
    df = df.sort_index()

    # drop 'visit_id' and 'patient_id' columns
    df = df.drop(['visit_id', 'patient_id', 'visit_month'], axis=1)

    # convert the remaining columns to float
    df = df.astype(float)

    # assign the new dataframe to the patient_df variable
    all_months_patient_dict[patient_id] = df


In [139]:
for patient_id in all_months_patient_dict.keys():
    all_months_patient_dict[patient_id] = (all_months_patient_dict[patient_id]
                                .interpolate(method='linear', axis=0, limit_direction='both')
                                .fillna(method='ffill')
                                .fillna(method='bfill')
    )

In [140]:
all_months_patient_dict['942']

UniProt,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,O60888,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
0,-0.150005,-0.47361,-0.553428,0.471806,-1.606781,-0.385779,0.472508,-0.343764,-0.377169,-1.035866,...,-0.205975,-0.430813,-0.749259,0.486928,-1.122503,0.754062,0.478367,0.652694,-0.159065,-0.528789
3,-0.150005,-0.47361,-0.553428,0.471806,-1.606781,-0.385779,0.472508,-0.343764,-0.377169,-1.035866,...,-0.205975,-0.430813,-0.749259,0.486928,-1.122503,0.754062,0.478367,0.652694,-0.159065,-0.528789
6,-0.150005,-0.47361,-0.553428,0.471806,-1.606781,-0.385779,0.472508,-0.343764,-0.377169,-1.035866,...,-0.205975,-0.430813,-0.749259,0.486928,-1.122503,0.754062,0.478367,0.652694,-0.159065,-0.528789
12,-1.733737,-0.637608,-0.760188,-1.485313,0.127695,-1.122628,-0.540278,-0.609492,0.207333,-0.773126,...,-1.757402,-0.10518,-0.738786,-0.397118,-1.120484,0.041569,0.481254,-0.085353,-0.350174,-0.806277
18,-1.387946,-0.654864,-0.551536,-0.97429,-0.00556,-1.225442,-0.348352,-0.647973,0.042446,-0.784634,...,-1.254037,-0.205436,-0.699456,-0.624012,-1.02926,0.152382,0.37567,-0.086536,-0.231358,-0.696717
24,-1.042154,-0.672121,-0.342884,-0.463266,-0.138815,-1.328256,-0.156427,-0.686454,-0.12244,-0.796141,...,-0.750672,-0.305691,-0.660126,-0.850905,-0.938037,0.263195,0.270085,-0.087719,-0.112542,-0.587157
30,-0.696363,-0.814798,-0.367433,-0.346156,0.128486,-1.144534,-0.30548,-0.779197,-0.198534,-0.878333,...,-0.707056,-0.314372,-0.304866,-0.279434,-1.1198,0.183448,0.526719,0.011639,-0.400148,-0.531398
36,-0.350571,-0.957474,-0.391982,-0.229046,0.395787,-0.960812,-0.454533,-0.87194,-0.274627,-0.960526,...,-0.663439,-0.323053,0.050394,0.292038,-1.301563,0.103702,0.783352,0.110997,-0.687754,-0.47564
48,-0.00478,-1.100151,-0.416531,-0.111936,0.663088,-0.77709,-0.603586,-0.964683,-0.350721,-1.042718,...,-0.619822,-0.331734,0.405654,0.86351,-1.483325,0.023955,1.039986,0.210354,-0.97536,-0.419882
54,-0.00478,-1.100151,-0.416531,-0.111936,0.663088,-0.77709,-0.603586,-0.964683,-0.350721,-1.042718,...,-0.619822,-0.331734,0.405654,0.86351,-1.483325,0.023955,1.039986,0.210354,-0.97536,-0.419882


In [141]:
import pandas as pd
import numpy as np

# Create an empty DataFrame to store the preprocessed data
preprocessed_data = pd.DataFrame()

# Iterate through the all_months_patient_dict dictionary
for patient_id, data in all_months_patient_dict.items():
    # Set the index of the DataFrame to 'visit_month' for easier access to data based on the month
    data = data.reset_index().rename(columns={'index': 'visit_month'})

    # Iterate through the rows of the patient's data
    for visit_month, row in data.iterrows():
        # Create a new row for the preprocessed_data DataFrame with the current proteomics data
        new_row = {
            'patient_id': patient_id,
            'visit_month': visit_month,
        }
        new_row.update(row.to_dict())  # Add proteomics data to the new_row

        # Iterate through the list of offsets [6, 12, 24] to find the proteomics data at +6, +12, and +24 months
        for offset in [6, 12, 24]:
            # Calculate the target month by adding the offset to the current visit_month
            target_month = visit_month + offset

            # Check if the target_month exists in the patient's data
            if target_month in data['visit_month'].values:
                # If it exists, get the proteomics data at the target_month
                target_row = data.loc[data['visit_month'] == target_month].iloc[0]

                # Add the proteomics data at the target_month to the new_row with suffix '_{offset}m'
                for col in target_row.index:
                    if col != 'visit_month':
                        new_row[f'{col}_{offset}m'] = target_row[col]
            else:
                # If the target_month does not exist, set the corresponding proteomics data to NaN
                for col in data.columns:
                    if col != 'visit_month':
                        new_row[f'{col}_{offset}m'] = np.nan

        # Append the new_row to the preprocessed_data DataFrame
        new_row_df = pd.DataFrame(new_row, index=[0])
        preprocessed_data = pd.concat([preprocessed_data, new_row_df], ignore_index=True)


In [142]:
preprocessed_data

Unnamed: 0,patient_id,visit_month,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,...,Q9HDC9_24m,Q9NQ79_24m,Q9NYU2_24m,Q9UBR2_24m,Q9UBX5_24m,Q9UHG2_24m,Q9UKV8_24m,Q9UNU6_24m,Q9Y646_24m,Q9Y6R7_24m
0,10053,0.0,-0.900600,-0.461720,-1.810210,-0.329170,-1.470450,-0.444028,-0.593125,-1.667750,...,0.273013,0.170261,-0.032686,,0.189582,0.150555,-0.383094,-1.363066,-1.782304,0.763531
1,10053,3.0,-0.739681,-0.414683,-1.810210,-0.329170,-1.470450,-0.471417,-0.070254,-1.667976,...,,,,,,,,,,
2,10053,6.0,-0.578762,-0.367646,-1.810210,-0.329170,-1.470450,-0.498806,0.452617,-1.668202,...,,,,,,,,,,
3,10053,12.0,-0.417843,-0.320609,-1.810210,-0.329170,-1.470450,-0.526195,0.975488,-1.668428,...,,,,,,,,,,
4,10053,18.0,0.566004,-0.016030,-1.810210,-0.329170,-1.470450,-0.553583,0.004869,-1.612750,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3715,942,60.0,-0.004780,-1.100151,-0.416531,-0.111936,0.663088,-0.777090,-0.603586,-0.964683,...,,,,,,,,,,
3716,942,72.0,-0.004780,-1.100151,-0.416531,-0.111936,0.663088,-0.777090,-0.603586,-0.964683,...,,,,,,,,,,
3717,942,84.0,-0.004780,-1.100151,-0.416531,-0.111936,0.663088,-0.777090,-0.603586,-0.964683,...,-0.663439,-0.323053,0.050394,0.292038,-1.301563,0.103702,0.783352,0.110997,-0.687754,-0.475640
3718,942,96.0,-0.004780,-1.100151,-0.416531,-0.111936,0.663088,-0.777090,-0.603586,-0.964683,...,,,,,,,,,,


In [146]:
preprocessed_data.to_csv('data/preprocessed_data.csv', index=False)
