In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression

Load sick and healthy patients

In [2]:
with open("data/eid_sick") as f:
    eid_sick = list(map(int, f.read().split('\n')))

with open("data/eid_healthy") as f:
    eid_healthy = list(map(int, f.read().split('\n')))

len(eid_sick), len(eid_healthy)

(26314, 476056)

Load column descriptions

In [3]:
column_desc = {}

with open('data/my_nmr_fields') as f:
    nmr_fields = [field.split('\t') for field in f.read().split('\n')]    
    column_desc['NMR'] = {f'{fid}-0.0': finfo for fid, finfo in nmr_fields}


print(f'{len(column_desc)} categories, total of {sum([len(fields) for fields in column_desc.values()])} columns')
print('===================================')
for category, cols in column_desc.items():
    print(f'{category}: {len(cols)} columns')
    
nmr_columns = list(column_desc['NMR'].keys())

1 categories, total of 249 columns
NMR: 249 columns


Read my data

In [4]:
df_nmr_sick = pd.read_csv('data/sick_nmr_data.csv', 
                      usecols=['eid'] + nmr_columns, 
                      index_col=['eid'])
df_nmr_healthy = pd.read_csv('data/my_nmr_data.csv', 
                         usecols=['eid'] + nmr_columns, 
                         index_col=['eid'],
                         nrows=2 * len(df_nmr_sick))
df_nmr_healthy = df_nmr_healthy.loc[list(set(eid_healthy) & set(df_nmr_healthy.index))[:len(df_nmr_sick)]]

len(df_nmr_sick), len(df_nmr_healthy)

(26314, 26314)

In [5]:
df_nmr_sick['diagnosis'] = 1
df_nmr_healthy['diagnosis'] = 0
df_raw = pd.concat([df_nmr_sick, df_nmr_healthy])

df_raw[df_raw == -10] = 0
df_raw[df_raw < 0] = np.nan

In [6]:
eid_train, eid_test = train_test_split(df_raw.index.to_list(), test_size=0.1, random_state=42)
eid_train, eid_val = train_test_split(eid_train, test_size=0.1, random_state=42)

len(eid_train), len(eid_val), len(eid_test)

(42628, 4737, 5263)

In [7]:
nulls = df_raw[nmr_columns].isna().sum(axis=0)
print(nulls)

23474-0.0    24302
23475-0.0    23766
23476-0.0    23753
23477-0.0    23753
23460-0.0    23761
             ...  
23588-0.0    24012
23608-0.0    23754
23469-0.0    23779
23403-0.0    23754
23467-0.0    23771
Length: 249, dtype: int64


In [9]:
df_raw.dropna(inplace=True)
df_raw.isna().sum().sum()

0

In [10]:
df_train, df_test = train_test_split(df_raw, test_size=0.1, random_state=42) 
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=42) 

In [11]:
X_train = df_train.drop(columns=['diagnosis'])
X_val = df_val.drop(columns=['diagnosis'])
X_test = df_test.drop(columns=['diagnosis'])

scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

y_train = df_train['diagnosis']
y_val = df_val['diagnosis']
y_test = df_test['diagnosis']

(
    (y_train.sum(), len(y_train)),
    (y_val.sum(), len(y_val)),
    (y_test.sum(), len(y_test))
)

((10850, 21314), (1179, 2369), (1346, 2632))

In [13]:
lr = LogisticRegression(C=0.1, max_iter=2000) .fit(X_train, y_train)

0.6935415787252005

In [16]:
(
    lr.score(X_val, y_val),
    lr.score(X_test, y_test),
    f1_score(y_true=y_val, y_pred=lr.predict(X_val)), 
    f1_score(y_true=y_test, y_pred=lr.predict(X_test)),
    roc_auc_score(y_true=y_val, y_score=lr.predict_proba(X_val)[:, 1]) ,
    roc_auc_score(y_true=y_test, y_score=lr.predict_proba(X_test)[:, 1]) 
)

(0.6935415787252005,
 0.6869300911854104,
 0.6894781864841745,
 0.6963890935887989,
 0.7648398799723451,
 0.7524356482775991)