In [None]:
import logging
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Load final dataset with features in chunks
def load_data_in_chunks(file_path, chunksize=10000):
    """Lazily yield successive DataFrame chunks read from a CSV file.

    This is a generator: the log line is emitted and the file is opened only
    when the first chunk is requested.

    Args:
        file_path: Path of the CSV file to read.
        chunksize: Maximum number of rows in each yielded chunk.

    Yields:
        pandas.DataFrame: Consecutive slices of the file, each holding at
        most ``chunksize`` rows.
    """
    logging.info(f"Loading data in chunks from {file_path}")
    reader = pd.read_csv(file_path, chunksize=chunksize)
    try:
        yield from reader
    finally:
        # Release the underlying file handle even when the consumer stops
        # early or raises while processing a chunk.
        reader.close()

# Train model on each chunk
def train_model(chunk, target_column='NewCases_7day_avg',
                model_path='models/random_forest_model.pkl'):
    """Fit a random forest regressor on one chunk and save it to disk.

    The chunk is split 80/20 into train and test partitions with a fixed
    seed; a 100-tree ``RandomForestRegressor`` is fitted on the training
    part and written to ``model_path`` with ``joblib``. Calling this
    repeatedly with the same ``model_path`` overwrites the previously saved
    model.

    Args:
        chunk: DataFrame holding the target column plus the feature columns.
            Every non-target column is used as a feature.
        target_column: Name of the column to predict.
        model_path: Destination file for the serialized model. Missing
            parent directories are created.

    Returns:
        tuple: ``(model, X_test, y_test)`` - the fitted estimator and the
        held-out features and targets for evaluation.

    Raises:
        KeyError: If ``target_column`` is not a column of ``chunk``.
    """
    logging.info("Training model")
    # Fail fast with a clear message before any splitting or fitting.
    if target_column not in chunk.columns:
        raise KeyError(f"Target column {target_column!r} not found in chunk")
    X = chunk.drop(columns=[target_column])
    y = chunk[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    # joblib.dump does not create missing directories; without this a fresh
    # checkout fails with FileNotFoundError only after the fit has finished.
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_path)
    return model, X_test, y_test

# Evaluate model on each chunk
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data and print the metrics.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature rows to generate predictions for.
        y_test: True target values corresponding to ``X_test``.

    Returns:
        None. The mean squared error and R^2 score are printed to stdout.
    """
    logging.info("Evaluating model")
    predicted = model.predict(X_test)
    mse, r2 = mean_squared_error(y_test, predicted), r2_score(y_test, predicted)
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")

# Modeling and evaluation in batches
def model_and_evaluate_in_batches(input_file, chunksize=10000, min_rows=2):
    """Train and evaluate a separate model on every chunk of a CSV file.

    Each chunk is handled independently: a new model is trained on it via
    ``train_model`` and scored via ``evaluate_model``. No state is carried
    over from one chunk to the next.

    Args:
        input_file: Path of the CSV file to process.
        chunksize: Maximum number of rows per chunk.
        min_rows: Chunks with fewer rows than this are skipped with a
            warning instead of being trained on.

    Returns:
        None.
    """
    chunk_iter = load_data_in_chunks(input_file, chunksize)
    for chunk in tqdm(chunk_iter, desc="Modeling and evaluating"):
        if len(chunk) < min_rows:
            # The last chunk of a file can hold a single row, for which
            # train_test_split raises ValueError (empty train set) and
            # would abort the whole run at the very end.
            logging.warning(
                "Skipping chunk with %d row(s); at least %d required",
                len(chunk),
                min_rows,
            )
            continue
        model, X_test, y_test = train_model(chunk)
        evaluate_model(model, X_test, y_test)

# Run the chunked train/evaluate pipeline over the final dataset with
# features, logging when the run starts and when it finishes.
input_file = "datasets/final_dataset_with_features.csv"
logging.info("Starting modeling and evaluation on final dataset with features.")
model_and_evaluate_in_batches(input_file=input_file)
logging.info("Finished modeling and evaluation on final dataset with features.")
