# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.utils import resample


# Seed

In [None]:
def seed_everything(seed: int) -> None:
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and switches cuDNN
    into its deterministic configuration.

    Args:
        seed: Value handed to every generator.
    """
    import os
    import random

    import numpy as np
    import torch

    random.seed(seed)
    # Exported for child interpreters; the hash seed of the already running
    # process cannot be changed after start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU rather than only the current device; silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning (benchmark=True) may select different convolution kernels
    # from run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

# Modeling

In [None]:

# Evaluation metrics for each dataset variant (full data and the 30/60/90-day
# truncations). Specificity and NPV are not part of this figure:
#   Specificity: [0.381, 0.063, 0.188, 0.404]
#   NPV:         [0.393, 0.423, 0.346, 0.412]
data = {
    "Days or Full": [
        "Full data",
        "30-day-truncated-data",
        "60-day-truncated-data",
        "90-day-truncated-data",
    ],
    "Accuracy": [0.617, 0.676, 0.630, 0.628],
    "Precision": [0.717, 0.688, 0.689, 0.726],
    "Recall": [0.727, 0.960, 0.835, 0.732],
    "F1 Score": [0.722, 0.802, 0.755, 0.729],
    "ROC AUC": [0.611, 0.599, 0.582, 0.605],
}

# Ordinal x-axis position of every dataset variant; "Full data" comes first.
day_mapping = {
    "Full data": 1,
    "30-day-truncated-data": 2,
    "60-day-truncated-data": 3,
    "90-day-truncated-data": 4,
}

# Build the frame, swap the text labels for their ordinal positions and keep
# the rows ordered by that position.
df = (
    pd.DataFrame(data)
    .assign(**{'Days or Full': lambda frame: frame['Days or Full'].map(day_mapping)})
    .sort_values(by='Days or Full')
    .reset_index(drop=True)
)

# scikit-learn expects a 2-D feature matrix: one column, one row per variant.
X = df[['Days or Full']].to_numpy()

# Metrics that get a fitted line and a curve in the figure.
plotted_metrics = ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC')

# One ordinary least-squares fit per metric, plus its in-sample predictions.
models = {}
predictions = {}
for metric in plotted_metrics:
    y = df[metric].to_numpy()
    model = LinearRegression().fit(X, y)
    models[metric], predictions[metric] = model, model.predict(X)

# Observed values (solid, with markers) and fitted trend (dashed) share a colour.
colors = ['purple', 'blue', 'cyan', 'green', 'red', 'orange', 'magenta']
fig, ax = plt.subplots(figsize=(14, 8))
for metric, colour in zip(plotted_metrics, colors):
    ax.plot(
        df['Days or Full'], df[metric],
        marker='o', color=colour, label=f'{metric} Data Points',
    )
    ax.plot(
        df['Days or Full'], predictions[metric],
        linestyle='dashed', color=colour, label=f'{metric} Linear Fit',
    )

ax.set_xlabel('Days or Full')
ax.set_ylabel('Metric Values')
ax.set_title('Trends of Different Metrics over Days or Full Data with Linear Regression Fits')

# Show readable variant names instead of the ordinal positions.
ax.set_xticks([1, 2, 3, 4])
ax.set_xticklabels(['Full data', '30 days', '60 days', '90 days'])

# Legend sits outside the axes, to the right, so it does not cover the curves.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax.grid(True)
plt.show()


In [None]:


# Bootstrap the slope of each metric's linear trend across the dataset
# variants and report a 95% percentile confidence interval per metric.
data = {
    "Days or Full": [
        "Full data",
        "30-day-truncated-data",
        "60-day-truncated-data",
        "90-day-truncated-data"
    ],
    "Accuracy": [0.617, 0.676, 0.630, 0.628],
    "Precision": [0.717, 0.688, 0.689, 0.726],
    "Recall": [0.727, 0.960, 0.835, 0.732],
    "F1 Score": [0.722, 0.802, 0.755, 0.729],
    "ROC AUC": [0.611, 0.599, 0.582, 0.605],
    "Specificity": [0.381, 0.063, 0.188, 0.404],
    "NPV": [0.393, 0.423, 0.346, 0.412]
}

# Creating a DataFrame
df = pd.DataFrame(data)

# Mapping textual representation of days to numerical values with Full data first
day_mapping = {
    "Full data": 1,
    "30-day-truncated-data": 2,
    "60-day-truncated-data": 3,
    "90-day-truncated-data": 4
}
df['Days or Full'] = df['Days or Full'].map(day_mapping)

# Sorting the DataFrame to ensure Full data is first
df = df.sort_values(by='Days or Full').reset_index(drop=True)

# Number of bootstrap slopes collected per metric
n_iterations = 1000

# Bootstrapped slopes per metric: metric name -> array of n_iterations slopes
bootstrap_results = {}

# Metrics to analyze
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Specificity', 'NPV']

# The predictor is identical for every metric, so build it once.
X = df['Days or Full'].values.reshape(-1, 1)

# A slope needs at least two distinct x positions; without them the redraw
# loop below could never terminate.
if np.unique(X).size < 2:
    raise ValueError("Need at least two distinct x positions to bootstrap a slope")

# Dedicated generator so that re-running this cell on its own reproduces the
# same intervals instead of depending on the global NumPy state.
rng = np.random.RandomState(42)

# Bootstrapping process for each metric
for metric in metrics:
    y = df[metric].values
    bootstrap_coefs = []

    while len(bootstrap_coefs) < n_iterations:
        # Resample (x, y) pairs with replacement
        X_resampled, y_resampled = resample(X, y, random_state=rng)

        # With only four rows, roughly 1 in 64 draws repeats a single x
        # position four times. The slope is undefined there and the fit would
        # record a spurious 0, so such draws are redrawn instead of counted.
        if np.unique(X_resampled).size < 2:
            continue

        # Fit the model on the resampled data
        model = LinearRegression()
        model.fit(X_resampled, y_resampled)

        # Store the coefficient (slope)
        bootstrap_coefs.append(model.coef_[0])

    # Convert to a numpy array and store in the dictionary
    bootstrap_results[metric] = np.array(bootstrap_coefs)

    # 95% percentile confidence interval for the slope
    ci_lower = np.percentile(bootstrap_results[metric], 2.5)
    ci_upper = np.percentile(bootstrap_results[metric], 97.5)

    # Plotting the distribution of bootstrapped slopes
    plt.figure(figsize=(8, 4))
    plt.hist(bootstrap_results[metric], bins=30, color='blue', edgecolor='black')
    plt.axvline(x=ci_lower, color='red', linestyle='--', label=f'95% CI Lower: {ci_lower:.4f}')
    plt.axvline(x=ci_upper, color='green', linestyle='--', label=f'95% CI Upper: {ci_upper:.4f}')
    plt.title(f'Bootstrap Distribution of Slope Coefficients for {metric}')
    plt.xlabel('Slope Coefficient')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

    print(f"95% Confidence Interval for the slope of {metric}: [{ci_lower:.4f}, {ci_upper:.4f}]")
