# Load Libraries

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Data science
import math
import scipy.stats as stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statsmodels.stats.multitest import multipletests as mt

# Plots
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt

# Working with dates
from datetime import datetime
from datetime import date
import dateutil

# Looping  progress
from tqdm.notebook import tqdm

# Reg expressions
import re

# Pretty table printing
import tabulate

# Misc libraries
from IPython.display import display, HTML
import os

# Seaborn defaults for every plot: figure size, font scale, then axes style.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.set(font_scale=1.5)
sns.set_style("white")

# Lift the pandas display limits so whole dataframes are shown untruncated.
for option_name, option_value in (
    ("display.max_rows", 10000),
    ("display.max_columns", 10000),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)

# Report the pandas version in use so we can confirm it is a working one.
print(f"Pandas version: {pd.__version__}")

In [None]:
# Root of the data tree. It is also made the working directory, so every
# relative path used below resolves against it.
UKB_DAT_HOME = "/data/pathogen_ncd"

os.chdir(UKB_DAT_HOME)

# Antibody Data Processing

In [None]:
# Note that antibody titer data may be referred to as viral
# or antigen as well as antibody, but I've tried to update
# references to use the more correct terminology of antibody.

In [None]:
# Read in raw antibody data, swap the UKB column names out
# for friendlier names
#    Apostrophe from Kaposi's was removed
# Change the pat id to an int and set it as index
# antibody_titers_raw.tsv is a matrix with a header row listing antibody IDs,
# and an initial column representing UKB patient IDs, and then each cell
# represents the Ab titer value as identified by that column header for 
# that row's patient.
# Dimensions: [502,422 x 91]
# Example:
# Pat ID   HSV-1 IgG    HSV-2 IgG ...
# 0001
# 0002        54           67
# 0003 

# Load the raw titer matrix (tab-separated).
dat = pd.read_csv('other/antibody_titers_raw.tsv', sep='\t')

# Friendly antigen names in UKB field order: entry i belongs to field
# 23000 + i. (The apostrophe in "Kaposi's" is deliberately left out.)
antigen_names = [
    '1gG antigen for Herpes Simplex virus-1',
    '2mgG unique antigen for Herpes Simplex virus-2',
    'gE / gI antigen for Varicella Zoster Virus',
    'VCA p18 antigen for Epstein-Barr Virus',
    'EBNA-1 antigen for Epstein-Barr Virus',
    'ZEBRA antigen for Epstein-Barr Virus',
    'EA-D antigen for Epstein-Barr Virus',
    'pp150 Nter antigen for Human Cytomegalovirus',
    'pp 52 antigen for Human Cytomegalovirus',
    'pp 28 antigen for Human Cytomegalovirus',
    'IE1A antigen for Human Herpesvirus-6',
    'IE1B antigen for Human Herpesvirus-6',
    'p101 k antigen for Human Herpesvirus-6',
    'U14 antigen for Human Herpesvirus-7',
    'LANA antigen for Kaposis Sarcoma-Associated Herpesvirus',
    'K8.1 antigen for Kaposis Sarcoma-Associated Herpesvirus',
    'HBc antigen for Hepatitis B Virus',
    'HBe antigen for Hepatitis B Virus',
    'Core antigen for Hepatitis C Virus',
    'NS3 antigen for Hepatitis C Virus',
    'p22 antigen for Toxoplasma gondii',
    'sag1 antigen for Toxoplasma gondii',
    'HTLV-1 gag antigen for Human T-Lymphotropic Virus 1',
    'HTLV-1 env antigen for Human T-Lymphotropic Virus 1',
    'HIV-1 gag antigen for Human Immunodeficiency Virus',
    'HIV-1 env antigen for Human Immunodeficiency Virus',
    'BK VP1 antigen for Human Polyomavirus BKV',
    'JC VP1 antigen for Human Polyomavirus JCV',
    'MC VP1 antigen for Merkel Cell Polyomavirus',
    'L1 antigen for Human Papillomavirus type-16',
    'E6 antigen for Human Papillomavirus type-16',
    'E7 antigen for Human Papillomavirus type-16',
    'L1 antigen for Human Papillomavirus type-18',
    'momp D antigen for Chlamydia trachomatis',
    'momp A antigen for Chlamydia trachomatis',
    'tarp-D F1 antigen for Chlamydia trachomatis',
    'tarp-D F2 antigen for Chlamydia trachomatis',
    'PorB antigen for Chlamydia trachomatis',
    'pGP3 antigen for Chlamydia trachomatis',
    'CagA antigen for Helicobacter pylori',
    'VacA antigen for Helicobacter pylori',
    'OMP antigen for Helicobacter pylori',
    'GroEL antigen for Helicobacter pylori',
    'Catalase antigen for Helicobacter pylori',
    'UreA antigen for Helicobacter pylori',
]

# Every field appears twice in the raw file: "<field>-0.0" is renamed to
# "<name>_init" and "<field>-1.0" to "<name>_sec".
ukb_to_friendly = {
    f"{23000 + offset}-{instance}.0": f"{name}{suffix}"
    for offset, name in enumerate(antigen_names)
    for instance, suffix in ((0, '_init'), (1, '_sec'))
}
dat = dat.rename(columns=ukb_to_friendly)

# Cast the participant ID to int and use it as the index. Plain column
# assignment is used instead of ``dat.loc[:, 'eid'] = ...``: on pandas >= 2.0
# the .loc form writes into the existing column in place, so the column can
# keep its old dtype and the cast is silently lost.
dat['eid'] = dat['eid'].astype(int)
dat = dat.set_index('eid')

## Look at the data we have

In [None]:
# Missing-value report for the raw table: first how many columns share each
# NaN count, then the NaN count of every individual column.
na_per_col = dat.isnull().sum()
print("Nan in each columns", na_per_col.value_counts(), sep='\n')
print("=============================================================")
print("Nan in each columns", na_per_col, sep='\n')

In [None]:
# Keep only participants that have antibody data. The initial HSV-1 IgG
# measurement serves as the indicator: a non-NaN value means the person
# was assayed.
has_ab_data = dat['1gG antigen for Herpes Simplex virus-1_init'].notnull()
ant_dat = dat.loc[has_ab_data]

In [None]:
# Expected output from cell:
# Before Removing 'date': 90
# After Removing 'date': 90
# Before Removing '_sec': 90
# After Removing '_sec': 45
# Before Removing 'Antigen assay': 45
# After Removing 'Antigen assay': 45

# Drop the column families we don't need for now. For each regex the column
# count is printed before and after the matching columns are removed.
for pattern in ('date', '_sec', 'Antigen assay'):
    print(f"Before Removing '{pattern}': {ant_dat.shape[1]}")
    unwanted = list(ant_dat.filter(regex=pattern))
    ant_dat = ant_dat[ant_dat.columns.drop(unwanted)]
    print(f"After Removing '{pattern}': {ant_dat.shape[1]}")

In [None]:
# Missing-value report for the filtered antibody table.
# All should be 0 now except H. pylori CagA, which was only measured
# for half of the participants (see Mentzer et al. paper)
na_per_col = ant_dat.isnull().sum()
print("Nan in each columns", na_per_col.value_counts(), sep='\n')
print("=============================================================")
print("Nan in each columns", na_per_col, sep='\n')

## Now write out cleaned results!

In [None]:
# Outputting a matrix of the same format as the input
# but limited to only rows for patients that have Ab results
# and the columns for each initial Ab measurement.
# Dimensions: [9,429 x 46]
# Example:
# Pat ID   HSV-1 IgG    HSV-2 IgG ...
# 0002        54           67
# The participant ID is the index here, so it is written out with the data.
clean_ab_path = './procd/clean_antigen_data.csv'
ant_dat.to_csv(clean_ab_path, index=True)

In [None]:
# Get list of participant IDs for people with antibody data,
# otherwise we won't read their data in.
ant_peeps = ant_dat.index.tolist()

## Remove withdrawn participants

In [None]:
# Latest withdrawal list from UKB. The file has no header row, so the
# column name is supplied here.
pats_to_rm_fn = f'{UKB_DAT_HOME}/other/participants_to_remove_as_of_05_03_23.csv'
pats_to_rm = pd.read_csv(pats_to_rm_fn, header=None, names=['eid'])
pats_to_rm_ls = pats_to_rm['eid'].to_list()

# Expected: 161
print(len(pats_to_rm_ls))

In [None]:
# Read the cleaned antibody file back in; 'eid' comes back as an ordinary
# column and the frame gets a fresh default index.
ant_dat = pd.read_csv('./procd/clean_antigen_data.csv')

# Most of the data was pre-cleaned, so we only have 1 person in data
# here that we need to remove
# Dimensions: [9430 x 46]
print(ant_dat.shape)

# Drop every participant that appears on the withdrawal list
ant_dat = ant_dat.loc[~ant_dat['eid'].isin(pats_to_rm_ls), :]

# Dimensions: [9429 x 46]
print(ant_dat.shape)

# Write the updated file out. index=False because 'eid' is already a regular
# column: writing the default integer index as well would add an extra
# unnamed column that comes back as 'Unnamed: 0' on every later read.
ant_dat.to_csv('./procd/clean_antigen_data.csv', index=False)

# Covariates

In [None]:
# Inspired by the Mentzer et al. paper [https://www.nature.com/articles/s41467-022-29307-3]
# I will be looking for association between antibodies and the following possible covariates:
#     sex [Field 31]
#     age by decade [Field 21003] -> convert to decade
#     ethnicity [Field 21000] ( There is a genetic ethnic grouping but it's just 409K entries for caucasian)
#         Divided into categories of ‘White’, ‘Asian’, ‘Black’, or ‘Other’ (if not one of the other three)
#     townsend deprivation index (TDI) quintiles [Field 189]
#     household size [field 709]
#          Lives alone, 2, 3, 4, 5, or more
#     tobacco smoking status [field 20116]
#          Current, previous, or never
#     alcohol drinking status [field 20117]
#          Current, previous, or never
#     lifetime number of sexual partners  [Field 2149] (LSP in cats of 0, 1, 2-4, 5-10, > 10)
#         LSP = 0 needs to be pulled from [Field 2139] == -2
#     Self-report of ever same-sex intercourse (sameSI) [Field 2159]
#     age in days 

## Load back in Ab data people list so we can filter

In [None]:
# Reload the cleaned antibody table and collect the participant IDs in it.
ant_dat = pd.read_csv('./procd/clean_antigen_data.csv', sep=',')
ant_peeps = ant_dat['eid'].to_list()

# 9,429
len(ant_peeps)

## Read in file

In [None]:
# raw_covars.txt is a matrix with a header row listing UKB column IDs that
# will map to a covariate, and an initial column representing UKB patient IDs, 
# and then each cell just lists the recorded value for that person.
# Dimensions: [502,522 x 13]
# Example:
# Pat ID   f.01           f.02 ...
# 0001      2              25
# 0002      7              29
# 0003      4              21

# UKB column ID -> readable covariate name
covar_names = {
    'f.eid': 'eid',
    '31-0.0': 'sex',
    '34-0.0': 'birth_year',
    '52-0.0': 'birth_month',
    '189-0.0': 'tdi_raw',
    '709-0.0': 'num_in_house',
    '2139-0.0': 'age_at_sex',
    '2149-0.0': 'num_sex_part',
    '2159-0.0': 'same_sex',
    '20116-0.0': 'tobac',
    '20117-0.0': 'alc',
    '21000-0.0': 'ethnic',
    '53-0.0': 'date_of_first',
    '21001-0.0': 'bmi',
    '40000-0.0': 'death_date_0',
    '40000-1.0': 'death_date_1',
}

# Read the tab-separated covariate file, translate the column names to
# English, and index the frame by participant ID.
cov = (
    pd.read_csv('./procd/new_raw_covars.txt', sep='\t')
    .rename(columns=covar_names)
    .set_index('eid')
)

## Limit to only our people with antibody data

In [None]:
# Row count before filtering: 502,422
print(cov.shape[0])

# Keep only the rows of participants with antibody data (label lookup on
# the eid index).
cov = cov.loc[ant_peeps]

# Row count after filtering: 9,429
print(cov.shape[0])

In [None]:
# Nan in each columns
# sex                 0
# birth_year          0
# birth_month         0
# date_of_first       0
# tdi_raw             8
# num_in_house       24
# age_at_sex        821
# num_sex_part      909
# same_sex          909
# tobac               6
# alc                 6
# ethnic              6
# bmi                32
# death_date_0     8760
# death_date_1     9428
# Missing-value report for the covariates: the distribution of NaN counts
# across columns, followed by the per-column counts.
na_per_col = cov.isnull().sum()
print("Nan in each columns", na_per_col.value_counts(), sep='\n')
print("=============================================================")
print("Nan in each columns", na_per_col, sep='\n')

## Convert Sex to int

In [None]:
cov = cov.astype({'sex': int})

In [None]:
# 0    5276
# 1    4153
cov.loc[:, 'sex'].value_counts(dropna=False)

## Calculate current age

In [None]:
# Diagnoses:
#            - First occurrences released Sept 2019 (data from summer 2019)
#
# Titers:
#            - Blood samples for 9,429 taken between 2006-2010,
#                2012-2013 for 537 but I think these might be the follow-up assessment.
#
#            - Processing of assays on 07/12 or 07/21 of 2016
#
# Covs:
#            - Our data release 5 was pulled Oct 13 2021


In [None]:
import datetime as dt
from datetime import datetime

# The reference date is fixed (rather than dt.date.today()) so the result
# does not change from run to run. Living participants are aged up to the
# first-occurrences release date; deceased participants up to their date of
# death.
today_str = "2019-09-01"
today_dt = datetime.strptime(today_str, '%Y-%m-%d')
cov['today'] = pd.to_datetime(today_str)

# Month needs to be 0-padded to build a parseable date string. Plain column
# assignment is used instead of ``cov.loc[:, col] = ...``: on pandas >= 2.0
# the .loc form writes into the existing column in place, so the new dtype
# is not reliably adopted (a float year would stay "1950.0").
cov['birth_month'] = cov['birth_month'].astype(int).astype(str).str.zfill(2)
cov['birth_year'] = cov['birth_year'].astype(int)

# The birthday is built from year and month only, so everyone is treated as
# born on the first day of their birth month.
cov['bday_str'] = cov['birth_year'].astype(str) + '-' + cov['birth_month'].astype(str) + '-01'
cov['bday'] = pd.to_datetime(cov['bday_str'], format='%Y-%m-%d')

# Date of death: take instance 0 and fall back to instance 1. This replaces
# a list(set(...))[0] lookup, which chose an arbitrary date whenever the two
# instances disagreed, making the age non-reproducible.
death_str = cov['death_date_0'].fillna(cov['death_date_1'])
death_dt = pd.to_datetime(death_str, format='%Y-%m-%d')

# A participant counts as dead exactly when a death date is recorded.
is_dead = death_dt.notna()

# Age runs to the death date when there is one, otherwise to the reference
# date; whole days are converted to years with 365.25 days per year.
end_date = death_dt.fillna(pd.Timestamp(today_dt))
cov['age'] = (end_date - cov['bday']).dt.days / 365.25
cov['is_dead'] = is_dead

# 67.5
# 8.1
# 43.8
# 82.6
print(round(cov['age'].mean(), 1))
print(round(cov['age'].std(), 1))
print(round(cov['age'].min(), 1))
print(round(cov['age'].max(), 1))

### Scale age (divide by 10)

In [None]:
cov['s_age'] = cov['age'].div(10)

## Clean up race/ethnicity data

In [None]:
# Clean up ethnicity data

# Missing ethnicity gets the placeholder code -3, which the final mapping
# below sends to "Other".
cov['ethnic'] = cov['ethnic'].fillna(value=-3).astype(int)

# Lift the single-digit base codes (1-6) to their thousands form so the
# final mapping only has to deal with one representation.
cov['ethnic'] = cov['ethnic'].replace({base: base * 1000 for base in range(1, 7)})

# Final categories:
# 0: White
# 1: Asian
# 2: Black
# 3: Other
# Each final category lists the codes that collapse into it.
final_groups = {
    0: [1000, 1001, 1002, 1003],
    3: [2000, 2001, 2002, 2003, 2004, -3, -1, 6000],
    1: [3000, 3001, 3002, 3003, 3004, 5000],
    2: [4000, 4001, 4002, 4003],
}
cov['ethnic'] = cov['ethnic'].replace(
    {raw: final for final, raws in final_groups.items() for raw in raws}
)

# 0    8883
# 1     231
# 2     140
# 3     175
cov['ethnic'].value_counts(dropna=False).sort_index()

## Number of people in household

In [None]:
# Household size, clustered into groups:
# 1 : 1 person in house
# 2 : 2 people in house
# 3 : 3 people in house
# 4 : 4 people in house
# 5 : 5 or more people in house
# 6 : unknown

# NaN becomes the placeholder -1 so it is caught by the "negative" rule.
household = cov['num_in_house'].fillna(value=-1).astype(int)

# Sizes 1-4 already equal their group code. Cap everything from 5 upwards
# at 5, then send all negative codes to 6 (unknown).
household = household.mask(household >= 5, 5)
household = household.mask(household < 0, 6)
cov['num_in_house'] = household

# 1    1712
# 2    4331
# 3    1524
# 4    1282
# 5     514
# 6      66
cov['num_in_house'].value_counts(dropna=False).sort_index()

## Tobacco Use

In [None]:
# Never    0
# Previous 1
# Current  2
# Unknown  3

# NaN and the negative code -3 are both folded into Unknown (3).
cov['tobac'] = cov['tobac'].fillna(3).astype(int).replace({-3: 3})

# 0    5210
# 1    3218
# 2     949
# 3      52
cov['tobac'].value_counts(dropna=False).sort_index()

## Alcohol Use

In [None]:
# Never    0
# Previous 1
# Current  2
# Unknown  3

# NaN and the negative code -3 are both folded into Unknown (3).
cov['alc'] = cov['alc'].fillna(3).astype(int).replace({-3: 3})

# 0     417
# 1     332
# 2    8661
# 3      19
cov['alc'].value_counts(dropna=False).sort_index()

## Number of sex partners

In [None]:
# Examining number of sex partners
# Extract the people that haven't had sex from the age_at_sex column
# if age_at_sex == -2, then num_sex_part should = 0.
# Those who did not know or preferred not to answer (negative codes, and
# missing values) get their own code (5), so they are not confused with
# 0 partners.

# Mapping method:
#                       0: 0 partners 
#                       1: 1 partner
#                       2: 2-3 partners
#                       3: 4-5 partners
#                       4: > 5 partners
#                       5: unknown

# Missing answers get a negative placeholder (-3) so they land in the
# unknown group below.
partners = cov['num_sex_part'].fillna(value=-3)

# Participants whose age_at_sex is coded -2 are recorded as 0 partners.
no_sex = cov[cov['age_at_sex'] == -2.0].index.to_list()
partners.loc[no_sex] = 0

# Bin the raw counts. The first matching rule wins, and 0 and 1 keep their
# value through the default:
#   2-3 partners -> 2, 4-5 partners -> 3, more than 5 -> 4, negative -> 5
binned = np.select(
    [
        partners.isin([2, 3]),
        partners.isin([4, 5]),
        partners > 5,
        partners < 0,
    ],
    [2, 3, 4, 5],
    default=partners,
)
cov['num_sex_part'] = binned.astype(int)

# 0      88
# 1    2265
# 2    1838
# 3    1255
# 4    2343
# 5    1640
cov['num_sex_part'].value_counts(dropna=False).sort_index()

## Same Sex Intercourse 

In [None]:
# Finally, same sex intercourse.
# Everything that is not a plain answer is folded into the unknown code (2):
# NaN, prefer not to answer (-3), and the code 3.
cov['same_sex'] = (
    cov['same_sex']
    .fillna(value=2)
    .replace({-3: 2, 3: 2})
    .astype(int)
)

# 0    8204
# 1     283
# 2     942
cov['same_sex'].value_counts(dropna=False).sort_index()

## Limit only to important columns before imputing

In [None]:
# From here on 'age' holds the scaled value (age / 10).
cov['age'] = cov['s_age']

# Columns carried forward into imputation (plus is_dead, set aside later).
pre_impute_cols = [
    'sex', 'bmi', 'age', 'ethnic',
    'tdi_raw', 'num_in_house', 'tobac',
    'alc', 'num_sex_part', 'same_sex',
    'is_dead',
]
cov = cov[pre_impute_cols]

## Impute Missing BMI and TDI Using MICE

In [None]:
# Missing-value report before imputation.

# Nan in each columns
# bmi             32
# tdi_raw          8
#  Rest are 0's
na_per_col = cov.isnull().sum()
print("Nan in each columns", na_per_col.value_counts(), sep='\n')
print("=============================================================")
print("Nan in each columns", na_per_col, sep='\n')

In [None]:
# Make sure we are only using the other covariates to impute, not the
# extra columns we are keeping around for later
# Set is_dead aside, keyed by eid as a regular column, so it takes no part
# in the imputation.
addl_covs = cov[['is_dead']].reset_index(drop=False)

# Covariates that go into the imputer; the eid index is kept.
impute_cols = ['sex', 'bmi', 'age', 'ethnic', 'tdi_raw',
               'num_in_house', 'tobac', 'alc', 'num_sex_part', 'same_sex']
cov = cov[impute_cols]

In [None]:
# Import Iterative Imputer
from sklearn.impute import IterativeImputer

# Imputer with scikit-learn's default settings
MICE = IterativeImputer()

# Fit on the covariates and fill in the missing values
ret = MICE.fit_transform(cov)

# The imputer returns a bare array, so wrap it in a dataframe that carries
# the original column names and eid index.
ret_df = pd.DataFrame(ret, columns=cov.columns, index=cov.index)

cov = ret_df.copy(deep=True)

In [None]:
# Imputation is done: re-attach the columns that were set aside (matched on
# eid) and restore eid as the index.
cov = (
    cov.merge(addl_covs, how='left', left_index=True, right_on='eid')
    .set_index('eid')
)

In [None]:
# Missing-value report after imputation.
# All columns now have 0 NAs
na_per_col = cov.isnull().sum()
print("Nan in each columns", na_per_col.value_counts(), sep='\n')
print("=============================================================")
print("Nan in each columns", na_per_col, sep='\n')

## Townsend Deprivation Index (TDI)

### Now bin TDI

In [None]:
# Binning scheme: lower 1/6th, middle 2/3, and upper 1/6th.
# First label everyone with the index (0-5) of their sixth of the raw TDI.
cov['tdi_bin'] = pd.qcut(cov['tdi_raw'], 6, labels=False)

# Then collapse the six bins into three groups:
# bin 0 -> 0, bins 1-4 -> 1, bin 5 -> 2
sixth_to_group = {b: (2 if b == 5 else min(b, 1)) for b in range(6)}
cov['tdi_quant'] = cov['tdi_bin'].replace(sixth_to_group).astype(int)

# 0    1572
# 1    6285
# 2    1572
cov['tdi_quant'].value_counts(dropna=False).sort_index()

## BMI

In [None]:
# Mean, standard deviation, minimum and maximum BMI, to one decimal place.
# 27.3
# 4.8
# 16.2
# 61.3
bmi = cov['bmi']
for summary in (bmi.mean(), bmi.std(), bmi.min(), bmi.max()):
    print(round(summary, 1))

### Scale BMI (divide by 10)

In [None]:
cov['s_bmi'] = cov['bmi'].div(10)

## Clean up

In [None]:
# From here on 'bmi' holds the scaled value (bmi / 10).
cov['bmi'] = cov['s_bmi']

# Final covariate set; the binned TDI takes the place of the raw value.
final_cols = [
    'sex', 'bmi', 'age', 'ethnic', 'tdi_quant',
    'num_in_house', 'tobac', 'alc', 'num_sex_part', 'same_sex',
    'is_dead',
]
cov = cov[final_cols]

In [None]:
# Some of the column types got reset along the way, so cast the coded
# (categorical) covariates back to int.
#
# DataFrame.astype with a per-column mapping is used instead of
# ``cov.loc[:, cols] = cov.loc[:, cols].astype(int)``: on pandas >= 2.0 the
# .loc assignment writes the values into the existing columns in place, so
# float columns stay float and the cast has no effect.
int_cols = ['sex', 'ethnic', 'tdi_quant', 'num_in_house',
            'tobac', 'alc', 'num_sex_part', 'same_sex']
cov = cov.astype({col: int for col in int_cols})

## Summary statistics 

In [None]:
print(cov['sex'].value_counts().sort_index())

# Continuous covariates. Both 'bmi' and 'age' are stored divided by 10 at
# this point, so both are multiplied back to report them in their original
# units (previously only BMI was rescaled and age was printed in decades).
for label, column in (("BMI", 'bmi'), ("AGE", 'age')):
    values = cov[column] * 10
    print(label)
    print(values.mean())
    print(values.std())
    print(values.median())
    print(values.min())
    print(values.max())

# Coded covariates: number of participants per code, in code order.
for label, column in (
    ("ETHNIC", 'ethnic'),
    ("TDI", 'tdi_quant'),
    ("Num in House", 'num_in_house'),
    ("Tobac", 'tobac'),
    ("Alc", 'alc'),
    ("Num SEX Partners", 'num_sex_part'),
    ("Same-sex Int", 'same_sex'),
):
    print(label)
    print(cov[column].value_counts().sort_index())

## Save file

In [None]:
# Outputting a matrix with rows for each participant that has Ab data
# and their values for each of the covariates we will examine later
# Dimensions: [9,429 x 11]
# Example:
# Pat ID   sex    age ...
# 0001       1     6.76 ...
# 0002       0     8.06 ...
# ...
# The participant ID is the index here, so it is written out with the data.
cov_out_path = './procd/new_cov_dat.csv'
cov.to_csv(cov_out_path, index=True)

## Remove withdrawn participants

In [None]:
# Latest withdrawal list from UKB. The file has no header row, so the
# column name is supplied here.
pats_to_rm_fn = f'{UKB_DAT_HOME}/other/participants_to_remove_as_of_05_03_23.csv'
pats_to_rm = pd.read_csv(pats_to_rm_fn, header=None, names=['eid'])
pats_to_rm_ls = pats_to_rm['eid'].to_list()

# Expected: 161
print(len(pats_to_rm_ls))

In [None]:
# Read the covariate file back in; 'eid' comes back as an ordinary column
# and the frame gets a fresh default index.
cov = pd.read_csv('./procd/new_cov_dat.csv')

# Most of the data was pre-cleaned, so we only have 1 person in data
# here that we need to remove
# Dimensions: [9430 x 11]
print(cov.shape)

# Drop every participant that appears on the withdrawal list
cov = cov.loc[~cov['eid'].isin(pats_to_rm_ls), :]

# Dimensions: [9429 x 11]
print(cov.shape)

# Write the updated file out. index=False because 'eid' is already a regular
# column: writing the default integer index as well would add an extra
# unnamed column that comes back as 'Unnamed: 0' on every later read.
cov.to_csv('./procd/new_cov_dat.csv', index=False)

# Cancer Data

## Load back in Ab data people list so we can filter

In [None]:
# Reload the cleaned antibody table and collect the participant IDs in it.
ant_dat = pd.read_csv('./procd/clean_antigen_data.csv', sep=',')
ant_peeps = ant_dat['eid'].to_list()

# 9,429
len(ant_peeps)

## Read in cancer data file and keep only people with antibody data

In [None]:
# cancer_diags.txt is a matrix with a header row listing UKB column IDs,
# and an initial column representing UKB patient IDs, and then each cell
# lists the ICD10 code for the cancer that person has been diagnosed with
# up to a max of 17 different cancer diagnoses. 
# We filter this only keeping people that have antibody data, so our output
# is 9,429 x 18.
# Dimensions: [502,522 x 17 (with eid as index)]
# Example:
# Pat ID   f.01           f.02 ...
# 0001
# 0002      C67           
# 0003 

# Tab-separated cancer diagnosis matrix
cancer_fn = f'{UKB_DAT_HOME}/pheno/third_release/partly_procd_files/disease_slices/cancer_diags.txt'

# Count the lines first so tqdm has a total. The context manager closes the
# handle again (it was previously left open).
with open(cancer_fn, 'r') as f:
    num_lines = sum(1 for _ in f)

# Set of wanted IDs: membership is tested once per input line, and a set
# lookup is constant-time where the list had to be scanned every time.
ab_eids = set(ant_peeps)

# Matrix where we will dump rows from the file we want to keep.
X = []

# Stream the file and keep only the rows of people with antibody data
with open(cancer_fn) as f:
    for line in tqdm(f, total=num_lines, desc = "Cancer"):
        line_spl = line.rstrip('\n').split('\t')

        # Skip header line
        if line_spl[0] == 'f.eid':
            continue

        if int(line_spl[0]) in ab_eids:
            X.append(line_spl)

# Create the df: the patient ID followed by cancer_1 ... cancer_17
can_dat = pd.DataFrame(
    X, columns=['eid'] + [f'cancer_{i}' for i in range(1, 18)]
)

# Convert pat ID to int. Plain column assignment is used instead of
# ``can_dat.loc[:, 'eid'] = ...``: on pandas >= 2.0 the .loc form writes into
# the existing object column in place, so the dtype would stay object.
can_dat['eid'] = can_dat['eid'].astype(int)

# Use the patient ids for row names
can_dat = can_dat.set_index('eid')

In [None]:
# Only have rows for people that have Ab data and a column for 
# each cancer diagnosis a person might have, up to a max of 17.
# The actual field will represent the ICD10 code for the cancer
# that person was diagnosed with.
# Example
# eid     cancer_1     cancer_2
# 0001      NA          NA
# 0002      C67         NA 
# 0003       NA         NA

# The participant ID is the index here, so it is written out with the data.
can_out_path = 'procd/can_dat.csv'
can_dat.to_csv(can_out_path, index=True)

## Generating cancer code dictionary

In [None]:
# This file is simply a list of all cancer ICD10 codes listed in the UKB showcase
# page under health related outcomes - cancer register.
# 
# Dimensions: [665 x 1]
# Example:
# C00
# C00.0
# C00.1
# C00.2

# Header-less sheet; its single named column is called "code".
can_types = pd.read_excel(
    'dicts/cancer_titles.xlsx',
    header=None,
    names=["code"],
    engine='openpyxl',
)

In [None]:
# This code creates a 3-char ICD10 list with descriptions, e.g. C01 (no C01.0)

# https://pypi.org/project/icd10-cm/
import icd10

import re

# Matches bare three-character categories only (C00, C15, ...). Anything
# with a decimal part fails the match, which is how the decimals get rolled
# up into their parent category.
code_cat = re.compile(r'^[A-Z]\d\d$')

# [code, description] for every three-character category that the icd10
# module knows about.
roll_code = [
    [code, icd10.find(code).description]
    for code in can_types['code']
    if code_cat.match(code) and icd10.exists(code)
]

# Add in the ones that aren't found by ICD10 module (ICD-0-3 specific)
roll_code.extend([
    ["C42", "hematopoietic and reticuloendothelial systems"],
    ["C97", "Malignant neoplasms of independent (primary) multiple site"],
])

# Sort list by code
roll_code.sort()

## Processing Cancer Data

### Function to Fix ICD10 codes

In [None]:
## Function to fix ICD10 codes
# ICD codes for cancer aren't stored in decimal form, so we need to reverse this.
# C504 needs to become C50.4

def fix_icd(code_list):
    """Re-insert the decimal point into undotted ICD10 code(s).

    An ICD10 code is always "capital letter, digit, digit", optionally
    followed by a sub-code. When a sub-code is present it is separated
    from the 3-char category with a '.'.

    Parameters
    ----------
    code_list : str or iterable of str
        A single undotted code (e.g. "C504") or several of them.

    Returns
    -------
    list of str
        One entry per input code, in input order: "C50.4" for "C504",
        and the code itself when it has no sub-code ("C61").
        A single string input yields a one-element list.

    Raises
    ------
    ValueError
        If a code does not start with a capital letter and two digits.
    """
    category = re.compile(r'^([A-Z]\d\d)')

    def _dotted(raw):
        # Split one code into its 3-char category and whatever follows it
        found = category.match(raw)
        if found is None:
            raise ValueError(f"Not an ICD10 code: {raw!r}")

        sub_code = raw[found.end():]

        # We have a decimal component! [C443]
        if sub_code:
            return f"{found.group(1)}.{sub_code}"

        # No decimal [C61]
        return found.group(1)

    # A bare string is one code; it must not be iterated per character
    if isinstance(code_list, str):
        return [_dotted(code_list)]

    return [_dotted(raw) for raw in code_list]


### Actual processing

In [None]:
# This cell loops through our can_dat data, which is cancer diagnoses for all of our
# UKB people that are also in the Ab data.
# For each person it gets a list of all their cancer diags and sets the flag of the
# matching rolled-up (3-char) cancer category to True.

# Form our column names in form "Cancer descrip. [ICD10 code]".
# Position 0 is reserved for the patient ID column.
roll_code_cols = ['eid'] + [f"{x[1]} [{x[0]}]" for x in roll_code]

# Map each 3-char code to its column position (offset by one because of 'eid').
# An exact lookup replaces scanning every column label for a substring, which
# was slower and could in principle hit description text rather than the
# bracketed code. setdefault keeps the first column should a code appear twice.
code_to_col = {}
for col_pos, code_pair in enumerate(roll_code, start = 1):
    code_to_col.setdefault(code_pair[0], col_pos)

# Y will be array of roll_dat
Y = []

# Looping through each person, taking all non NA codes and mapping to roll_dat
for x, curr_row in tqdm(can_dat.iterrows(), total = can_dat.shape[0], desc = "Proc Cancer"):

    # NA could be either np.nan or string 'NA', so convert any strings first
    curr_row = curr_row.replace({'NA' : np.nan})
    curr_cans = curr_row[~curr_row.isna()].to_list()

    # Start with every diagnosis False; pos 0 is overwritten with the pat ID
    curr_Y = [False] * len(roll_code_cols)
    curr_Y[0] = x

    # Strip stray quotes, then fix the ICD10 code list (C443 -> C44.3).
    # With no cancer diags this is a no-op and the row stays all False.
    curr_cans = fix_icd([i.replace('"', '') for i in curr_cans])

    # Loop through each diagnosis and update the current False to a True
    for y in curr_cans:

        # Roll up to the category: C44.3 -> C44, while C61 stays C61
        parent = y[:3]

        if parent not in code_to_col:
            raise KeyError(f"Cancer code {y!r} (eid {x}) has no column in roll_code")

        curr_Y[code_to_col[parent]] = True

    Y.append(curr_Y)

roll_dat = pd.DataFrame(Y, columns = roll_code_cols)

# convert pat ID to ints. Plain column assignment is used because
# .loc[:, col] = ... writes into the existing column and may keep its dtype.
roll_dat['eid'] = roll_dat['eid'].astype(int)

# Use the patient ids for row names
roll_dat = roll_dat.set_index('eid')

In [None]:
# This file is a data matrix where each column
# is a different cancer diagnosis and each row is a patient ID.
# The cell is True if the patient in the current row has
# been diagnosed with the cancer of the current column, otherwise
# the cell is False.
# Example
# eid     C00     C01
# 0001    False   False
# 0002    True    False
# 0003    False   False
# Written relative to the work dir (not the filesystem root) so that it
# lands where the later read of './procd/rolled_code.csv' looks for it.
roll_dat.to_csv('./procd/rolled_code.csv')

## Remove withdrawn participants

In [None]:
# Load the latest withdrawal list from UKB. Column names are supplied
# explicitly, so the first line of the file is read as data.
pats_to_rm_fn = f'{UKB_DAT_HOME}/other/participants_to_remove_as_of_05_03_23.csv'
pats_to_rm = pd.read_csv(pats_to_rm_fn, names = ['eid'])

# Plain Python list of the IDs to remove
pats_to_rm_ls = pats_to_rm.loc[:, 'eid'].to_list()

# 161
print(len(pats_to_rm_ls))

In [None]:
# Reload the rolled-up cancer flags. No index_col is given, so 'eid'
# comes back as an ordinary column.
roll_dat = pd.read_csv('./procd/rolled_code.csv')

# Dimensions before removal: [9430 x 128]
print(roll_dat.shape)

# Drop everyone on the withdrawal list. Most of the data was pre-cleaned,
# so we only have 1 person here that we need to remove.
is_withdrawn = roll_dat['eid'].isin(pats_to_rm_ls)
roll_dat = roll_dat.loc[~is_withdrawn, :]

# Dimensions after removal: [9429 x 128]
print(roll_dat.shape)

# Overwrite the file with the filtered data.
# NOTE(review): with the default index=True the row index is written as an
# extra unnamed first column - confirm downstream readers expect that.
roll_dat.to_csv('./procd/rolled_code.csv')

# Non-cancer Diseases

## Load back in Ab data people list so we can filter

In [None]:
# People with cleaned antibody data; their IDs are used as the filter below
ant_dat = pd.read_csv('./procd/clean_antigen_data.csv', sep = ',')
ant_peeps = ant_dat.loc[:, 'eid'].to_list()

# 9,429
len(ant_peeps)

In [None]:
# Column headers for the first occurrence data.
# all_dis_first_occur_dict.xlsx converts the UKB column IDs in our first
# occurrence data file to an actual disease diagnosis name and ICD10 code.
dis_head = pd.read_excel(
    "dicts/all_dis_first_occur_dict.xlsx",
    engine = 'openpyxl',
)

# Keep the 'Disease' names of every row as a plain list.
# A filter dropping the Date rows is left disabled:
#dis_head = dis_head[~dis_head['Data Type'].str.contains('Date')]
dis_head = dis_head['Disease'].to_list()


In [None]:
# first_occur_all_dis.txt is a matrix with a header row listing UKB column IDs,
# and an initial column representing UKB patient IDs, and the remaining
# columns will map back to a non-cancer disease diagnosis. The cell will either
# be NA if the patient does not have that diagnosis or have an integer code
# which UKB includes to indicate the source of the data.
# Dimensions: [502,522 x 2,255]
# Example:
# Pat ID   f.01           f.02 ...
# 0001
# 0002      30
# 0003

non_cancer_fn = f'{UKB_DAT_HOME}/pheno/third_release/partly_procd_files/disease_slices/first_occur_all_dis.txt'

# Get number of lines in input file so we have a total for tqdm.
# The with-block makes sure this first handle is closed again.
print("Counting lines...")
with open(non_cancer_fn, 'r') as f:
    num_lines = sum(1 for _ in f)

print("Finished counting lines!")

# Set of Ab-data patient IDs: the membership test below runs once per
# input line, and a set lookup avoids scanning the whole list each time.
ant_peep_ids = set(ant_peeps)

# Keep only the rows (as lists of raw string fields) of people with Ab data
X = []
with open(non_cancer_fn) as f:
    for line in tqdm(f, total=num_lines, desc = "First Occur."):
        line_spl = line.rstrip('\n').split("\t")

        # Skip header line
        if line_spl[0] == 'f.eid':
            continue

        curr_peep = int(line_spl[0])
        if curr_peep in ant_peep_ids:
            X.append(line_spl)
            
# Replacement column names: the patient ID column becomes 'eid', and the
# UKB column IDs that were missing a disease name (looked up manually)
# get their "description[ICD10 code]" label.
manual_col_names = {
    'Unique patient ID' : 'eid',
    '130191-0.0' : 'other viral infections characterised by skin and mucous membrane lesions, not elsewhere classified[B08]',
    '130193-0.0' : 'unspecified viral infection characterised by skin and mucous membrane lesions[B09]',
    '130205-0.0' : 'human immunodeficiency virus [hiv] disease resulting in infectious and parasitic diseases[B20]',
    '130209-0.0' : 'human immunodeficiency virus [hiv] disease resulting in other specified diseases[B22]',
    '130337-0.0' : 'streptococcus and staphylococcus as the cause of diseases classified to other chapters[B95]',
    '130339-0.0' : 'other bacterial agents as the cause of diseases classified to other chapters[B96]',
    '130343-0.0' : 'other specified infectious agents as the cause of diseases classified to other chapters[B98]',
    '130673-0.0' : 'certain diseases involving lymphoreticular tissue and reticulohistiocytic system[D76]',
    '130675-0.0' : 'other disorders of blood and blood-forming organs in diseases classified elsewhere[D77]',
    '130845-0.0' : 'organic amnesic syndrome, not induced by alcohol and other psychoactive substances[F04]',
    '130849-0.0' : 'other mental disorders due to brain damage and dysfunction and to physical disease[F06]',
    '130851-0.0' : 'personality and behavioural disorders due to brain disease, damage and dysfunction[F07]',
    '130865-0.0' : 'mental and behavioural disorders due to use of other stimulants, including caffeine[F15]',
    '130873-0.0' : 'mental and behavioural disorders due to multiple drug use and use of other psychoactive substances[F19]',
    '130925-0.0' : 'mental and behavioural disorders associated with the puerperium, not elsewhere classified[F53]',
    '130927-0.0' : 'psychological and behavioural factors associated with disorders or diseases classified elsewhere[F54]',
    '130931-0.0' : 'unspecified behavioural syndromes associated with physiological disturbances and physical factors[F59]',
    '130945-0.0' : 'psychological and behavioural disorders associated with sexual development and orientation[F66]',
    '130985-0.0' : 'disorders of social functioning with onset specific to childhood and adolescence[F94]',
    '130989-0.0' : 'other behavioural and emotional disorders with onset usually occurring in childhood and adolescence[F98]',
    '131003-0.0' : 'encephalitis, myelitis and encephalomyelitis in diseases classified elsewhere[G05]',
    '131007-0.0' : 'intracranial and intraspinal abscess and granuloma in diseases classified elsewhere[G07]',
    '131019-0.0' : 'systemic atrophies primarily affecting central nervous system in diseases classified elsewhere[G13]',
    '131041-0.0' : 'other degenerative disorders of nervous system in diseases classified elsewhere[G32]',
    '131201-0.0' : 'disorders of optic 2nd nerve and visual pathways in diseases classified elsewhere[H48]',
    '131269-0.0' : 'postprocedural disorders of ear and mastoid process, not elsewhere classified[H95]',
    '131371-0.0' : 'occlusion and stenosis of precerebral arteries, not resulting in cerebral infarction[I65]',
    '131373-0.0' : 'occlusion and stenosis of cerebral arteries, not resulting in cerebral infarction[I66]',
    '131395-0.0' : 'disorders of arteries, arterioles and capillaries in diseases classified elsewhere[I79]',
    '131517-0.0' : 'respiratory conditions due to inhalation of chemicals, gases, fumes and vapours[J68]',
    '131687-0.0' : 'disorders of gallbladder, biliary tract and pancreas in diseases classified elsewhere[K87]',
    '131839-0.0' : 'other disorders of skin and subcutaneous tissue in diseases classified elsewhere[L99]',
    '131843-0.0' : 'direct infections of joint in infectious and parasitic diseases classified elsewhere[M01]',
    '132151-0.0' : 'pain and other conditions associated with female genital organs and menstrual cycle[N94]',
    '132181-0.0' : 'pre-existing hypertension complicating pregnancy, childbirth and the puerperium[O10]',
    '132187-0.0' : 'gestational [pregnancy-induced] hypertension without significant proteinuria[O13]',
    '132259-0.0' : 'labour and delivery complicated by intrapartum haemorrhage, not elsewhere classified[O67]',
    '132307-0.0' : 'death from any obstetric cause occurring more than 42 days but less than one year after delivery[O96]',
    '132311-0.0' : 'maternal infectious and parasitic diseases classifiable elsewhere but complicating pregnancy, childbirth and the puerperium[O98]',
    '132313-0.0' : 'other maternal diseases classifiable elsewhere but complicating pregnancy, childbirth and the puerperium[O99]',
    '132315-0.0' : 'foetus and newborn affected by maternal conditions that may be unrelated to present pregnancy[P00]',
    '132319-0.0' : 'foetus and newborn affected by complications of placenta, cord and membranes[P02]',
    '132323-0.0' : 'foetus and newborn affected by noxious influences transmitted via placenta or breast milk[P04]',
    '132327-0.0' : 'disorders related to short gestation and low birth weight, not elsewhere classified[P07]',
    '132353-0.0' : 'interstitial emphysema and related conditions originating in the perinatal period[P25]',
    '132397-0.0' : 'transitory disorders of carbohydrate metabolism specific to foetus and newborn[P70]',
    '132535-0.0' : 'congenital obstructive defects of renal pelvis and congenital malformations of ureter[Q62]',
    '132569-0.0' : 'congenital malformations of musculoskeletal system, not elsewhere classified[Q79]',
    '132583-0.0' : 'congenital malformation syndromes due to known exogenous causes, not elsewhere classified[Q86]',
    '132585-0.0' : 'other specified congenital malformation syndromes affecting multiple systems[Q87]',
    '132593-0.0' : 'other trisomies and partial trisomies of the autosomes, not elsewhere classified[Q92]',
    '132601-0.0' : 'other sex chromosome abnormalities, female phenotype, not elsewhere classified[Q97]',
    '132603-0.0' : 'other sex chromosome abnormalities, male phenotype, not elsewhere classified[Q98]',
}

# Create the df from the kept rows, labelled with the dictionary headers,
# then swap in the replacement names above.
dis_dat = pd.DataFrame(X, columns = dis_head)
dis_dat = dis_dat.rename(columns = manual_col_names)
# Convert the patient ID column to ints. Plain column assignment is used
# because .loc[:, col] = ... writes into the existing (string) column and
# may keep its dtype.
dis_dat['eid'] = dis_dat['eid'].astype(int)

# Use the patient ids for row names
dis_dat = dis_dat.set_index('eid')

# Remove Date columns (any column whose name matches 'Date'), not needed for now.
dis_dat = dis_dat.drop(columns = dis_dat.filter(regex='Date').columns)

# Convert coding to disease status flags: "NA" means no diagnosis, and
# every listed source code means the diagnosis is present.
dis_dat = dis_dat.replace({
                    "NA" : False,
                    "40" : True,
                    "41" : True,
                    "20" : True,
                    "21" : True,
                    "30" : True,
                    "31" : True,
                    "50" : True,
                    "51" : True
                })

In [None]:
# Save the non-cancer disease flags. The file is a data matrix with one
# column per non-cancer diagnosis and one row per patient ID. A cell is
# True if the patient in that row has been diagnosed with the disease
# of that column, otherwise it is False.
# Dimensions: [9,429 x 1127]
# Example
# eid     A00     A01
# 0001    False   False
# 0002    True    False
# 0003    False   False
dis_dat.to_csv(path_or_buf = './procd/dis_dat.csv', index = True)

## Remove withdrawn participants

In [None]:
# Load the latest withdrawal list from UKB. Column names are supplied
# explicitly, so the first line of the file is read as data.
pats_to_rm_fn = f'{UKB_DAT_HOME}/other/participants_to_remove_as_of_05_03_23.csv'
pats_to_rm = pd.read_csv(pats_to_rm_fn, names = ['eid'])

# Plain Python list of the IDs to remove
pats_to_rm_ls = pats_to_rm.loc[:, 'eid'].to_list()

# 161
print(len(pats_to_rm_ls))

In [None]:
# Reload the non-cancer disease flags. No index_col is given, so 'eid'
# comes back as an ordinary column.
dis_dat = pd.read_csv('./procd/dis_dat.csv')

# Shape before removal.
# NOTE(review): the shapes previously noted here ([9430 x 128], then
# [9429 x 128]) look copied from the cancer data; dis_dat.csv is described
# as [9,429 x 1127] where it is written - confirm the real numbers.
print(dis_dat.shape)

# Drop everyone on the withdrawal list
is_withdrawn = dis_dat['eid'].isin(pats_to_rm_ls)
dis_dat = dis_dat.loc[~is_withdrawn, :]

# Shape after removal
print(dis_dat.shape)

# Overwrite the file with the filtered data.
# NOTE(review): with the default index=True the row index is written as an
# extra unnamed first column - confirm downstream readers expect that.
dis_dat.to_csv('./procd/dis_dat.csv')