In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from utils import correlation_confidence_interval

# hospitalization

In [None]:
# Load the hospitalization table, keep only the rows whose 'geography' is 'DE',
# and expose the 'total' column as 'hosp', indexed by the raw 'date' values.
cov_hosp = (
    pd.read_csv("../data/COVID_hosp.csv")
    .loc[lambda frame: frame['geography'] == 'DE', ['date', 'total']]
    .rename(columns={'total': 'hosp'})
    .set_index('date')
    # Rescale the counts by 84357 / 100 = 843.57.
    # NOTE(review): presumably 84357 is the population in thousands, which
    # would make the result a rate per 100,000 inhabitants -- confirm.
    .assign(hosp=lambda frame: frame['hosp'] / (84357 / 100))
)

# viral load

In [None]:
def _extract_amelag_series(curve, source_column, target_column):
    """Return one column of the AMELAG curve as a frame indexed by date string.

    Parameters
    ----------
    curve : pandas.DataFrame
        Frame holding a datetime-like 'datum' column and ``source_column``.
    source_column : str
        Name of the value column to extract.
    target_column : str
        Name the value column gets in the returned frame.

    Returns
    -------
    pandas.DataFrame
        Single-column frame named ``target_column`` whose index, named
        'date', holds the calendar date as a string. Rows where either
        'datum' or the value is missing are dropped.
    """
    series = curve[['datum', source_column]].dropna()
    series = series.rename(columns={'datum': 'date', source_column: target_column})
    # .dt.date already discards the time-of-day, so str() yields a bare date.
    # The former `.replace(" 00:00:00", "")` was removed: Series.replace
    # matches whole cell values (not substrings) and therefore never fired.
    series['date'] = series['date'].dt.date.astype('str')
    return series.set_index('date')


# Read the Excel file once and derive both series from the same frame.
amelag_curve = pd.read_excel("../data/amelag_aggregierte_kurve.xlsx")

# Measured viral load.
cov_vir = _extract_amelag_series(amelag_curve, 'viruslast', 'viral load')

# Values of the 'loess_vorhersage' column (NaNs are dropped independently of
# the measured series, so the two frames may cover different dates).
cov_vir_loess = _extract_amelag_series(
    amelag_curve, 'loess_vorhersage', 'viral load (loess)'
)

# combined dataframe

In [None]:
# Combine the three frames column-wise with an outer join on their 'date' index.
# concat does not sort the joined index by default, so dates that appear in
# only one source could end up appended out of chronological order. The lag
# analysis further down shifts rows by position, so the rows must be ordered
# by date; sort explicitly.
# NOTE(review): sorting is chronological as long as every index value is an
# ISO 'YYYY-MM-DD' string (true for the viral-load frames; confirm for the
# hospitalization CSV).
combined_df = pd.concat([cov_hosp, cov_vir, cov_vir_loess], axis=1).sort_index()

In [None]:
# Optional filter (currently disabled): keep only rows with a LOESS value.
# combined_df = combined_df[combined_df['viral load (loess)'].notna()]

# Give the hospitalization column its descriptive name for tables and plots.
combined_df = combined_df.rename(
    {'hosp': 'hospitalization incidence'},
    axis='columns',
)

# correlation

In [None]:
# Replace the date index with a plain 0..n-1 index (the dates are discarded).
combined_df = combined_df.reset_index(drop=True)

# Pairwise Spearman rank correlation between all columns, shown as cell output.
spearman_matrix = combined_df.corr(method='spearman')
spearman_matrix

In [None]:
# Columns whose lagged relationship is examined
column1 = 'viral load (loess)'
column2 = 'hospitalization incidence'

# Lags to scan, from -7 to 14 inclusive. A lag is a number of rows; it equals
# days only if combined_df has one row per consecutive day in date order.
shifts = range(-7, 15)

# Track the strongest (absolute) correlation and the lag at which it occurs
max_abs_corr = 0
best_shift = 0

corrs = []
CIs = []
for shift in shifts:
    # shift(k) moves column1 down by k rows, so row t pairs column2[t] with
    # column1[t - k]: a positive lag means column1 leads column2.
    paired = combined_df[[column1, column2]].copy()
    paired[column1] = paired[column1].shift(shift)
    # Keep only rows where both values are present -- these are exactly the
    # observations the correlation is computed from.
    paired = paired.dropna()
    lagged_column1 = paired[column1]
    lagged_column2 = paired[column2]

    corr = lagged_column1.corr(lagged_column2, method='spearman')
    corrs.append(corr)
    # Pass the number of complete pairs as the sample size. The previous
    # len() of the shifted column alone also counted rows with no
    # hospitalization value, overstating n and narrowing the interval.
    # NOTE(review): assumes the helper's second argument is the sample size.
    CIs.append(correlation_confidence_interval(corr, len(paired)))

    # A NaN correlation never compares greater, so it cannot become the best
    if abs(corr) > max_abs_corr:
        max_abs_corr = abs(corr)
        best_shift = shift


# Display the maximum absolute correlation and corresponding shift value
print("Maximum absolute correlation:", max_abs_corr)
print("Corresponding shift value:", best_shift)


# Asymmetric error bars: distance from each estimate to its interval bounds
yerr = [[corr - lower for corr, (lower, upper) in zip(corrs, CIs)],  # Lower errors
        [upper - corr for corr, (lower, upper) in zip(corrs, CIs)]]  # Upper errors

# Plot correlation against lag, reusing the same lag range as the scan above
fig, ax = plt.subplots(figsize=(10, 6))
ax.errorbar(shifts, corrs, yerr=yerr, fmt='o', linestyle='-', markersize=4)

# Title and axis labels
# NOTE(review): the "95%" in the title relies on the default level of
# correlation_confidence_interval -- confirm.
ax.set_title('Spearman Correlation with 95% Confidence Interval', fontsize=16)
ax.set_xlabel('Lag Value (days)', fontsize=16)
ax.set_ylabel('Spearman Correlation', fontsize=16)

# Lags are whole numbers, so force integer x-ticks
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

ax.grid(True)

# Save before show(); the file name is kept unchanged so existing references
# to the output keep working.
fig.savefig("../output/lagged_corellation.png")
plt.show()

