# Symptom prevalence and covid status tables

## Import libraries 

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Set working dir_ 

In [2]:
os.chdir('S:\LLC_0028\data')

In [3]:
if 'symptom_analysis' not in os.listdir():
    os.mkdir('symptom_analysis')
    
#os.chdir('symptom_analysis')

## Import data 

In [4]:
dta = pd.read_csv('./harmonised_all/llc_0028_full_harmonised_data_all.csv', index_col=0)

In [5]:
dta = dta[dta.columns[1:29].to_list() + ['covid_status']] #only need symptoms, study and status

In [6]:
#dta.head()

Unnamed: 0,study,fever,cough,throat,chest_tight,breath,nose,aches,fatigue,diarrhoea,...,dizzy,chest_pain,chills,sleep,numb,heavy,swelling,concentrating,memory,covid_status
0,bcs70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,bcs70,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,bcs70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,bcs70,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,bcs70,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
symptoms = dta.columns[1:-1]

## Prevalence table 

In [8]:
os.chdir('symptom_analysis')

In [9]:
prevalence = pd.DataFrame(index = symptoms)

In [10]:
#dta.describe()

Unnamed: 0,fever,cough,throat,chest_tight,breath,nose,aches,fatigue,diarrhoea,smell_taste,...,dizzy,chest_pain,chills,sleep,numb,heavy,swelling,concentrating,memory,covid_status
count,53224.0,53224.0,53224.0,53224.0,53224.0,53224.0,53224.0,53224.0,53224.0,53224.0,...,53224.0,53224.0,53224.0,53224.0,53224.0,53224.0,53224.0,53224.0,53224.0,52981.0
mean,0.01178,0.070983,0.056704,0.038103,0.048174,0.174902,0.095577,0.126635,0.044003,0.01597,...,0.016647,0.005824,0.006181,0.052025,0.018262,0.010089,0.002179,0.103919,0.070551,0.274438
std,0.107897,0.256799,0.231278,0.191447,0.214135,0.379887,0.294013,0.332566,0.205103,0.125361,...,0.127945,0.076096,0.078379,0.22208,0.1339,0.099939,0.046634,0.305159,0.256076,0.647522
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [11]:
prevalence['Pooled'] = [ round(v,2) for v in dta.describe().T['mean'].values[:-1]*100]

for study,group in dta.groupby('study'):
    
    df = group.describe().T
    #add a catch so that if prevalence * sample size < 10, fill with 
    #00.0*. When v==0, it is because a study didnt ask about that symptom
    #therefore this is left as 0.00 (not starred)
    prevalence[study] = [ round(v,2) if v*df['count'].values[0] >10 or v==0 \
                         else '0.00*' for v in df['mean'].values[:-1]*100]

In [12]:
prevalence

Unnamed: 0,Pooled,alspac,bcs70,bib,mcs,ncds,nextstep,nhsd46,track19,twins
fever,1.18,0.05,1.27,7.04,1.68,0.89,1.89,1.23,0.97,0.93
cough,7.1,0.02,12.74,3.2,11.67,16.35,10.34,19.39,0.83,4.15
throat,5.67,0.14,8.79,18.08,8.64,7.32,9.07,4.12,2.47,7.31
chest_tight,3.81,0.17,6.41,7.84,6.5,4.95,6.38,2.97,1.57,3.57
breath,4.82,0.17,9.25,7.68,7.9,10.88,6.6,10.71,0.0,4.75
nose,17.49,0.65,25.05,26.4,23.29,26.78,22.18,30.97,10.8,16.96
aches,9.56,0.19,18.45,20.8,14.4,17.4,14.03,16.57,2.91,6.42
fatigue,12.66,0.67,23.19,30.24,21.54,17.65,26.28,14.47,0.9,18.59
diarrhoea,4.4,0.19,6.08,7.52,5.45,5.65,6.76,5.86,1.86,9.36
smell_taste,1.6,0.07,2.28,0.0,1.97,2.28,1.89,2.17,1.21,1.84


In [13]:
prevalence.to_csv('llc_0028_symptom_prevalence_data_v1.csv')

## Covid status table

In [14]:
#create arrays for multi-index dataframe

arr_2 = ['Pooled','Pooled','Pooled']

for study in dta.study.unique():
    
    for i in range(3):
        
        arr_2.append(study)

In [15]:
        
arr_1 = ['No covid', 'Covid < 12 weeks ago', 'Covid > 12 weeks ago']  

for group in set(arr_2):
    
    for v in arr_1[:3]:
        
        arr_1.append(v)

In [16]:
index = pd.MultiIndex.from_tuples(list(zip(arr_2, arr_1)), names = ['Study','Covid Status'])
status = pd.DataFrame(index=index)


In [17]:
N, perc, N_asymp, perc_asymp = [],[],[],[]

for group in arr_2[::3]:
    
    if group=='Pooled':
        
        df = dta
        
    else:
        
        df = dta.loc[dta.study==group]
        
    for covid_status in [0,1,2]:
    
        df2 = df.loc[df.covid_status==covid_status]
        
        N.append(df2.shape[0])
        perc.append(round(df2.shape[0]*100/df.shape[0]))
        
        asymp = df2[symptoms].loc[(df2[symptoms]==0).all(axis=1)]
        n = asymp.shape[0]
        
        # add catches for discolsure control
        if n<10:
            N_asymp.append('<10')
            perc_asymp.append('-')
        elif df2.shape[0]-n<10:
            N_asymp.append('-*')
            perc_asymp.append('-')
        else:
            N_asymp.append(asymp.shape[0])
            perc_asymp.append(round(asymp.shape[0]*100/df2.shape[0]))
        
status['N'] = N
status['%'] = perc
status['N asymptomatic'] = N_asymp
status['% asymptomatic'] = perc_asymp

In [18]:
status

Unnamed: 0_level_0,Unnamed: 1_level_0,N,%,N asymptomatic,% asymptomatic
Study,Covid Status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pooled,No covid,44273,83,24881,56
Pooled,Covid < 12 weeks ago,2876,5,1429,50
Pooled,Covid > 12 weeks ago,5832,11,2758,47
bcs70,No covid,4249,82,1730,41
bcs70,Covid < 12 weeks ago,304,6,76,25
bcs70,Covid > 12 weeks ago,608,12,151,25
bib,No covid,555,89,211,38
bib,Covid < 12 weeks ago,14,2,<10,-
bib,Covid > 12 weeks ago,56,9,18,32
mcs,No covid,7206,77,2803,39


In [19]:
status.to_csv('llc_0028_covidstatus_data_v1.csv')