In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import correlation_confidence_interval

# Correlation between virus load and diagnoses

## Import and format data

In [2]:
diagnoses = pd.read_csv("../data/diagnoses.csv")

# Drop the 'geography' column
diagnoses.drop(columns=['geography'], inplace=True)

# Filter out the columns based on the condition
columns_to_keep = [col for col in diagnoses.columns if '0-200' in col and 'ratio' not in col]

# Keep only the desired columns (starting from the second column)
diagnoses = diagnoses[['date'] + columns_to_keep]

# Group by 'date' and sum up the values in each column - sums up numbers for all states
diagnoses = diagnoses.groupby('date').sum().reset_index()

In [3]:
virus = pd.read_excel("../data/combined_df.xlsx")
virus = virus[["date", "viral load"]].dropna()

In [4]:
# Align dates of virus to dates of diagnoses for inner join

# Convert 'date' column to datetime format
virus['date'] = pd.to_datetime(virus['date'])

# Subtract two days from each date
virus['date'] = virus['date'] - pd.Timedelta(days=2)

# Convert the dates back to the string format YYYY-MM-DD
virus['date'] = virus['date'].dt.strftime('%Y-%m-%d')

# Now the 'date' column in 'virus' DataFrame will have dates that are two days earlier in the format YYYY-MM-DD

In [5]:
# Joining the two DataFrames on the 'date' column
merged_df = pd.merge(virus, diagnoses, on='date', how='inner')

# Create a new column 'infection numbers' by summing up all columns except 'date' and 'viral load'
merged_df['summed up infection numbers'] = merged_df.drop(columns=['date', 'viral load']).sum(axis=1)

## Correlation

In [10]:
correlations = merged_df.drop(columns=['date']).corr(method='spearman') 

In [11]:
correlations

Unnamed: 0,viral load,distinct_patient_count J00-J06 Acute upper respiratory infections 0-200,distinct_patient_count J1-J2 RSV 0-200,distinct_patient_count J10-J18 Influenza and pneumonia 0-200,distinct_patient_count J20-J22 Other acute respiratory infections 0-200,distinct_patient_count U0 COVID-19 0-200,summed up infection numbers
viral load,1.0,0.904209,0.648583,0.799966,0.854862,0.893964,0.913771
distinct_patient_count J00-J06 Acute upper respiratory infections 0-200,0.904209,1.0,0.653941,0.838555,0.955434,0.889268,0.980705
distinct_patient_count J1-J2 RSV 0-200,0.648583,0.653941,1.0,0.723429,0.720857,0.744777,0.708254
distinct_patient_count J10-J18 Influenza and pneumonia 0-200,0.799966,0.838555,0.723429,1.0,0.906856,0.936993,0.907624
distinct_patient_count J20-J22 Other acute respiratory infections 0-200,0.854862,0.955434,0.720857,0.906856,1.0,0.914881,0.974302
distinct_patient_count U0 COVID-19 0-200,0.893964,0.889268,0.744777,0.936993,0.914881,1.0,0.955263
summed up infection numbers,0.913771,0.980705,0.708254,0.907624,0.974302,0.955263,1.0


In [29]:
# can read confidence intervall and value from for loop below, can be extended to store values properly
for value in correlations.values.flatten():
    print(value, correlation_confidence_interval(value, 52))

1.0 (nan, nan)
0.9042089985486212 (0.8381230124906167, 0.9441354657297901)
0.6485825406975403 (0.45647810869590627, 0.7829091980360832)
0.799965849910356 (0.6742649412422782, 0.8806176957293164)
0.8548621190130624 (0.7590398302489538, 0.9144325150647521)
0.8939639716554256 (0.8214726214597196, 0.9380216621007996)
0.913771023648937 (0.8537752847114164, 0.949816994693525)
0.9042089985486212 (0.8381230124906167, 0.9441354657297901)
1.0 (nan, nan)
0.6539409556074671 (0.46381303328136375, 0.7864846728246266)
0.8385554512080594 (0.7335139739260153, 0.904474069722681)
0.9554341330145991 (0.923264109711671, 0.9742977363241603)
0.8892683343293776 (0.8138822789434716, 0.9352103080511355)
0.9807051993511483 (0.9664643314544513, 0.9889327400685829)
0.6485825406975403 (0.45647810869590627, 0.7829091980360832)
0.6539409556074671 (0.46381303328136375, 0.7864846728246266)
1.0 (nan, nan)
0.7234288801593979 (0.5613531250948985, 0.8320605083822674)
0.720856841002633 (0.5576606812237952, 0.830399297834861

  z = 0.5 * math.log((1 + r) / (1 - r))
