In [4]:
!pip install Gpy
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
file_path = "/content/drive/MyDrive/updated_dataset_with_vaccine_change.csv"  # Replace with your dataset file path
data = pd.read_csv(file_path)

# Drop rows with NA/empty values
data = data.dropna()

# Filter for a specific county (e.g., "Alameda")
county_name = "Butte"
county_data = data[data['county'] == county_name]

# Add sinusoidal features for seasonality
county_data['sin_days'] = np.sin(2 * np.pi * county_data['days_since_zero'] / 365.25)
county_data['cos_days'] = np.cos(2 * np.pi * county_data['days_since_zero'] / 365.25)

# Define features and target
features = [
    'daily_change_per_100k', 'population', 'latitude', 'longitude',
    'closest_2_county_population', 'population_density', 'mobility_index',
    'total_facility_bed', 'change_in_doses', 'sin_days', 'cos_days'
]
target = 'daily_change_per_100k'

X = county_data[features]
y = county_data[target]

# Initialize the 5-fold rolling time-series cross-validation
tscv = TimeSeriesSplit(n_splits=5)
mae_scores = []
rmse_scores = []
mape_scores = []
r2_scores = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Define Gaussian Process kernel
    kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(noise_level=1)
    gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, normalize_y=True)

    # Fit the model
    gp.fit(X_train, y_train)

    # Make predictions
    y_pred = gp.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

# Handle zeros in y_test for MAPE calculation
non_zero_indices = y_test != 0
if non_zero_indices.any():  # Ensure there are non-zero values
    mape = np.mean(np.abs((y_test[non_zero_indices] - y_pred[non_zero_indices]) / y_test[non_zero_indices])) * 100
else:
    mape = np.nan  # Set MAPE to NaN if all true values are zero

r2 = r2_score(y_test, y_pred)

mae_scores.append(mae)
rmse_scores.append(rmse)
if not np.isnan(mape):  # Only include valid MAPE values
    mape_scores.append(mape)
r2_scores.append(r2)

# Calculate overall metrics
overall_mae = np.mean(mae_scores)
overall_rmse = np.mean(rmse_scores)
overall_mape = np.mean(mape_scores)
overall_r2 = np.mean(r2_scores)

print(f"Overall MAE: {overall_mae}")
print(f"Overall RMSE: {overall_rmse}")
print(f"Overall MAPE: {overall_mape}%")
print(f"Overall R-squared: {overall_r2}")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html


Overall MAE: 0.048282821477309876
Overall RMSE: 0.0033585805397780694
Overall MAPE: 0.5765073694197728%
Overall R-squared: 0.9999992020513464




In [12]:
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

# Load the dataset
data = pd.read_csv("/content/drive/MyDrive/updated_dataset_with_vaccine_change.csv")

# Add a new column for the prediction date
data['prediction_date'] = pd.to_datetime(data['date']) + pd.to_timedelta(30, unit='D')
data['sin_days'] = np.sin(2 * np.pi * data['days_since_zero'] / 365.25)
data['cos_days'] = np.cos(2 * np.pi * data['days_since_zero'] / 365.25)
# Prepare the features and target variable
features = [
    'daily_change_per_100k', 'population', 'latitude', 'longitude',
    'closest_2_county_population', 'population_density', 'mobility_index',
    'total_facility_bed', 'change_in_doses', 'sin_days', 'cos_days'
]
target = 'daily_change_per_100k'

# Store predictions
predictions = []

# Process each county
for county in data['county'].unique():
    county_data = data[data['county'] == county]
    county_data = county_data.sort_values(by='date')

    # Drop rows with missing target or feature values
    county_data = county_data.dropna(subset=[target] + features)

    # Check if there's enough data for training and testing
    if len(county_data) < 2:
        print(f"Skipping county {county} due to insufficient data.")
        continue

    # Split the data into training (first half) and testing (full dataset)
    split_index = len(county_data) // 2
    train_data = county_data.iloc[:split_index]
    test_data = county_data

    # Check if training data has enough samples
    if len(train_data) < 1:
        print(f"Skipping county {county} due to insufficient training data.")
        continue

    # Separate features and target
    X_train = train_data[features]
    y_train = train_data[target]
    X_test = test_data[features]

    # Fit Gaussian Process model
    kernel = C(1.0, (1e-4, 1e4)) * RBF(1.0, (1e-4, 1e4))
    gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)

    try:
        gpr.fit(X_train, y_train)

        # Predict on test data
        y_pred = gpr.predict(X_test)

        # Store predictions
        test_data['predicted_daily_change_per_100k'] = y_pred
        predictions.append(test_data)
    except Exception as e:
        print(f"Error processing county {county}: {e}")

# Combine all predictions
if predictions:
    predicted_data = pd.concat(predictions)
    # Save the updated dataset
    predicted_data.to_csv("/content/drive/MyDrive/updated_dataset_with_predictions.csv", index=False)
    print("Predictions saved to updated_dataset_with_predictions.csv")
else:
    print("No predictions were made due to insufficient data in all counties.")


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html


Skipping county Alpine due to insufficient data.


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st

Predictions saved to updated_dataset_with_predictions.csv
