In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import mlflow
import time
from mlflow.models import infer_signature
# Point the MLflow client at the tracking server this notebook logs to
# (a server listening on localhost, port 5001).
mlflow.set_tracking_uri('http://localhost:5001')
# Select the experiment that the runs started in the cells below are filed under.
mlflow.set_experiment("Covid 19 Deaths Prediction")

class LinearRegressionModel:
    """Ordinary least-squares baseline for predicting ``Death_Pct``.

    Intended call order: ``load_data`` -> ``train`` -> ``valid`` /
    ``test_1mo`` / ``test_3mo``. Parameters, metrics and the fitted model are
    logged to MLflow, so the methods should be called inside an MLflow run.
    """

    def __init__(self):
        self.model = None
        self.X_train = []
        self.y_train = []
        self.X_valid = []
        self.y_valid = []
        self.X_test_1 = []
        self.X_test_2 = []
        self.y_test_1 = []
        self.y_test_2 = []
        # Full rows of the two forecast windows; filled in by load_data().
        self.test_1_df = None
        self.test_2_df = None

    @staticmethod
    def _score(y_true, predictions):
        """Return ``(rmse, r2)`` for the given targets and predictions."""
        # Take the square root of the MSE rather than passing ``squared=False``:
        # that keyword was deprecated in scikit-learn 1.4 and later removed.
        rmse = mean_squared_error(y_true, predictions) ** 0.5
        return rmse, r2_score(y_true, predictions)

    def load_data(self, file, test_start='2022-04-01',
                  test_1_end='2022-04-30', test_2_end='2022-06-30'):
        """Read the CSV and build train/validation splits plus two test windows.

        Args:
            file: Path or buffer accepted by ``pandas.read_csv``.
            test_start: First date (inclusive) of both forecast windows.
            test_1_end: Last date (inclusive) of the one-month window.
            test_2_end: Last date (inclusive) of the three-month window.

        Raises:
            ValueError: If no rows are dated before ``test_start``.
        """
        df = pd.read_csv(file)
        columns_excluded = ['Date', 'FIPS', 'MMWR_week', 'Recip_County', 'Recip_State', 'Death_Pct']

        # Dates are compared as strings, which is chronological only for
        # ISO-style 'YYYY-MM-DD' values (the format the defaults use).
        dates = df['Date']
        # .copy() gives each window its own frame so test_*() can add the
        # prediction column without a SettingWithCopyWarning.
        # One-month prediction
        self.test_1_df = df[(dates >= test_start) & (dates <= test_1_end)].copy()
        # Three-month prediction
        self.test_2_df = df[(dates >= test_start) & (dates <= test_2_end)].copy()

        # Fit and validate only on rows that precede the forecast windows;
        # splitting the whole file would put the test rows into the training
        # set and make the 1-/3-month metrics meaningless.
        history = df[dates < test_start]
        if history.empty:
            raise ValueError(f"no rows dated before {test_start} to train on")
        X = history.drop(columns_excluded, axis=1)
        y = history['Death_Pct']

        self.X_train, self.X_valid, self.y_train, self.y_valid = \
            train_test_split(X, y, test_size=0.2, random_state=42)

        self.X_test_1 = self.test_1_df.drop(columns_excluded, axis=1)
        self.X_test_2 = self.test_2_df.drop(columns_excluded, axis=1)
        self.y_test_1 = self.test_1_df['Death_Pct']
        self.y_test_2 = self.test_2_df['Death_Pct']

    def train(self):
        """Fit the regression, print its coefficients and log everything to MLflow."""
        model = LinearRegression()
        model.fit(self.X_train, self.y_train)
        self.model = model

        # Optional: Print coefficients and intercept
        print("Coefficients:", model.coef_)
        print("Intercept:", model.intercept_)

        # Log model parameters and validation metrics to MLflow
        mlflow.log_params(model.get_params())
        rmse, r_squared = self._score(self.y_valid, model.predict(self.X_valid))
        mlflow.log_metrics({"RMSE": rmse, "R2": r_squared})
        mlflow.sklearn.log_model(
            sk_model=model,
            signature=infer_signature(self.X_train, model.predict(self.X_train)),
            artifact_path="model",
            # A handful of rows is enough to document the input schema; the
            # full training frame would be stored with the model otherwise.
            input_example=self.X_train.head(5),
            registered_model_name="Linear Regression",
        )

    def valid(self):
        """Score the fitted model on the validation split; return ``(rmse, r2)``."""
        rmse, r_squared = self._score(self.y_valid, self.model.predict(self.X_valid))

        # Log results in MLflow
        mlflow.log_metric("Validation RMSE", rmse)
        mlflow.log_metric("Validation R2", r_squared)

        return rmse, r_squared

    def test_1mo(self):
        """Predict the one-month window; return its frame with ``Predicted_Pct`` added."""
        predictions = self.model.predict(self.X_test_1)
        self.test_1_df['Predicted_Pct'] = predictions

        rmse, r_squared = self._score(self.y_test_1, predictions)
        print("1-Month Prediction RMSE: ", rmse)
        print("1-Month Prediction R2: ", r_squared)

        # Log results in MLflow
        mlflow.log_metric("1-Month Prediction RMSE", rmse)
        mlflow.log_metric("1-Month Prediction R2", r_squared)

        return self.test_1_df

    def test_3mo(self):
        """Predict the three-month window; return its frame with ``Predicted_Pct`` added."""
        predictions = self.model.predict(self.X_test_2)
        self.test_2_df['Predicted_Pct'] = predictions

        rmse, r_squared = self._score(self.y_test_2, predictions)
        print("3-Month Prediction RMSE: ", rmse)
        print("3-Month Prediction R2: ", r_squared)

        # Log results in MLflow
        mlflow.log_metric("3-Month Prediction RMSE", rmse)
        mlflow.log_metric("3-Month Prediction R2", r_squared)

        return self.test_2_df


In [None]:
import os
with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "Linear Regression")

    # The CSV is referenced by bare file name, so it has to sit in the
    # notebook's working directory.
    model = LinearRegressionModel()
    model.load_data("merged_data_new.csv")
    model.train()

    print("\nValidation:")
    rmse, r2 = model.valid()
    print(f"RMSE: {rmse}\nR2 = {r2}\n")

    # Identifier columns plus actual and predicted values, as written to disk.
    report_columns = ['Date', 'FIPS', 'Recip_County', 'Recip_State', 'Death_Pct', 'Predicted_Pct']

    print("1-Month Prediction:")
    df_1mo = model.test_1mo()[report_columns]

    print("3-Month Prediction:")
    # Wall-clock time of the three-month prediction step (scoring included).
    start_time = time.time()
    df_3mo = model.test_3mo()
    end = time.time() - start_time
    mlflow.log_metric("3-Month Prediction Time", end)
    df_3mo = df_3mo[report_columns]

    df_1mo.to_csv('linear_prediction_1mo.csv', index=False)
    df_3mo.to_csv('linear_prediction_3mo.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

class RandomForestModel:
    """Shallow random forest (depth 5) for predicting ``Death_Pct``.

    Intended call order: ``load_data`` -> ``train`` -> ``valid`` /
    ``test_1mo`` / ``test_3mo``. Parameters, metrics and the fitted model are
    logged to MLflow, so the methods should be called inside an MLflow run.
    """

    def __init__(self):
        self.model = None
        self.X_train = []
        self.y_train = []
        self.X_valid = []
        self.y_valid = []
        self.X_test_1 = []
        self.X_test_2 = []
        self.y_test_1 = []
        self.y_test_2 = []
        # Full rows of the two forecast windows; filled in by load_data().
        self.test_1_df = None
        self.test_2_df = None

    @staticmethod
    def _score(y_true, predictions):
        """Return ``(rmse, r2)`` for the given targets and predictions."""
        # Take the square root of the MSE rather than passing ``squared=False``:
        # that keyword was deprecated in scikit-learn 1.4 and later removed.
        rmse = mean_squared_error(y_true, predictions) ** 0.5
        return rmse, r2_score(y_true, predictions)

    def load_data(self, file, test_start='2022-04-01',
                  test_1_end='2022-04-30', test_2_end='2022-06-30'):
        """Read the CSV and build train/validation splits plus two test windows.

        Args:
            file: Path or buffer accepted by ``pandas.read_csv``.
            test_start: First date (inclusive) of both forecast windows.
            test_1_end: Last date (inclusive) of the one-month window.
            test_2_end: Last date (inclusive) of the three-month window.

        Raises:
            ValueError: If no rows are dated before ``test_start``.
        """
        df = pd.read_csv(file)
        columns_excluded = ['Date', 'FIPS', 'MMWR_week', 'Recip_County', 'Recip_State', 'Death_Pct']

        # Dates are compared as strings, which is chronological only for
        # ISO-style 'YYYY-MM-DD' values (the format the defaults use).
        dates = df['Date']
        # .copy() gives each window its own frame so test_*() can add the
        # prediction column without a SettingWithCopyWarning.
        # One-month prediction
        self.test_1_df = df[(dates >= test_start) & (dates <= test_1_end)].copy()
        # Three-month prediction
        self.test_2_df = df[(dates >= test_start) & (dates <= test_2_end)].copy()

        # Fit and validate only on rows that precede the forecast windows;
        # splitting the whole file would put the test rows into the training
        # set and make the 1-/3-month metrics meaningless.
        history = df[dates < test_start]
        if history.empty:
            raise ValueError(f"no rows dated before {test_start} to train on")
        X = history.drop(columns_excluded, axis=1)
        y = history['Death_Pct']

        self.X_train, self.X_valid, self.y_train, self.y_valid = \
            train_test_split(X, y, test_size=0.2, random_state=42)

        self.X_test_1 = self.test_1_df.drop(columns_excluded, axis=1)
        self.X_test_2 = self.test_2_df.drop(columns_excluded, axis=1)
        self.y_test_1 = self.test_1_df['Death_Pct']
        self.y_test_2 = self.test_2_df['Death_Pct']

    def train(self):
        """Fit the forest, log it to MLflow and print the five most important features."""
        model = RandomForestRegressor(random_state=42, max_depth=5, min_samples_leaf=5)
        model.fit(self.X_train, self.y_train)
        self.model = model
        mlflow.log_params(model.get_params())

        rmse, r_squared = self._score(self.y_valid, model.predict(self.X_valid))
        mlflow.log_metrics({"RMSE": rmse, "R2": r_squared})
        mlflow.sklearn.log_model(
            sk_model=model,
            signature=infer_signature(self.X_train, model.predict(self.X_train)),
            artifact_path="model",
            # A handful of rows is enough to document the input schema; the
            # full training frame would be stored with the model otherwise.
            input_example=self.X_train.head(5),
            registered_model_name="RandomForestModel-Shallow",
        )

        # Rank features by impurity-based importance, highest first.
        feature_importances = sorted(
            zip(self.X_train.columns, model.feature_importances_),
            key=lambda pair: pair[1],
            reverse=True,
        )
        print(f"{'Top 5 Features':<35} {'Significance':<15}\n")
        # Exactly five rows, matching the header above.
        for name, importance in feature_importances[:5]:
            print(f"{name:<35} {round(importance, 3):<15}")

    def valid(self):
        """Score the fitted model on the validation split; return ``(rmse, r2)``."""
        rmse, r_squared = self._score(self.y_valid, self.model.predict(self.X_valid))

        # Log results in MLflow
        mlflow.log_metric("Validation RMSE", rmse)
        mlflow.log_metric("Validation R2", r_squared)

        return rmse, r_squared

    def test_1mo(self):
        """Predict the one-month window; return its frame with ``Predicted_Pct`` added."""
        predictions = self.model.predict(self.X_test_1)
        self.test_1_df['Predicted_Pct'] = predictions

        rmse, r_squared = self._score(self.y_test_1, predictions)
        print("1-Month Prediction RMSE: ", rmse)
        print("1-Month Prediction R2: ", r_squared)

        # Log results in MLflow
        mlflow.log_metric("1-Month Prediction RMSE", rmse)
        mlflow.log_metric("1-Month Prediction R2", r_squared)

        return self.test_1_df

    def test_3mo(self):
        """Predict the three-month window; return its frame with ``Predicted_Pct`` added."""
        predictions = self.model.predict(self.X_test_2)
        self.test_2_df['Predicted_Pct'] = predictions

        rmse, r_squared = self._score(self.y_test_2, predictions)
        print("3-Month Prediction RMSE: ", rmse)
        print("3-Month Prediction R2: ", r_squared)

        # Log results in MLflow
        mlflow.log_metric("3-Month Prediction RMSE", rmse)
        mlflow.log_metric("3-Month Prediction R2", r_squared)

        return self.test_2_df
    
with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "RandomForestModel")
    model = RandomForestModel()
    model.load_data('merged_data_new.csv')
    model.train()

    print("\nValidation:")
    rmse, r2 = model.valid()
    print(f"RMSE: {rmse}\nR2 = {r2}\n")

    # Identifier columns plus actual and predicted values, as written to disk.
    report_columns = ['Date', 'FIPS', 'Recip_County', 'Recip_State', 'Death_Pct', 'Predicted_Pct']

    # This section scores the one-month test window, so label it as such
    # (same headings as the linear-regression cell).
    print("1-Month Prediction:")
    df_1mo = model.test_1mo()[report_columns]

    print("3-Month Prediction:")
    # Wall-clock time of the three-month prediction step (scoring included).
    start_time = time.time()
    df_3mo = model.test_3mo()
    end = time.time() - start_time
    mlflow.log_metric("3-Month Prediction Time", end)
    df_3mo = df_3mo[report_columns]

    df_1mo.to_csv('Random-Forest-Shallow-prediction_1mo.csv', index=False)
    df_3mo.to_csv('Random-Forest-Shallow-prediction_3mo.csv', index=False)




In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

class DecisionTreeModel:
    """Single decision-tree regressor for predicting ``Death_Pct``.

    Intended call order: ``load_data`` -> ``train`` -> ``valid`` /
    ``test_1mo`` / ``test_3mo``. Parameters, metrics and the fitted model are
    logged to MLflow, so the methods should be called inside an MLflow run.
    """

    def __init__(self):
        self.model = None
        self.X_train = []
        self.y_train = []
        self.X_valid = []
        self.y_valid = []
        self.X_test_1 = []
        self.X_test_2 = []
        self.y_test_1 = []
        self.y_test_2 = []
        # Full rows of the two forecast windows; filled in by load_data().
        self.test_1_df = None
        self.test_2_df = None

    @staticmethod
    def _score(y_true, predictions):
        """Return ``(rmse, r2)`` for the given targets and predictions."""
        # Take the square root of the MSE rather than passing ``squared=False``:
        # that keyword was deprecated in scikit-learn 1.4 and later removed.
        rmse = mean_squared_error(y_true, predictions) ** 0.5
        return rmse, r2_score(y_true, predictions)

    def load_data(self, file, test_start='2022-04-01',
                  test_1_end='2022-04-30', test_2_end='2022-06-30'):
        """Read the CSV and build train/validation splits plus two test windows.

        Args:
            file: Path or buffer accepted by ``pandas.read_csv``.
            test_start: First date (inclusive) of both forecast windows.
            test_1_end: Last date (inclusive) of the one-month window.
            test_2_end: Last date (inclusive) of the three-month window.

        Raises:
            ValueError: If no rows are dated before ``test_start``.
        """
        df = pd.read_csv(file)
        columns_excluded = ['Date', 'FIPS', 'MMWR_week', 'Recip_County', 'Recip_State', 'Death_Pct']

        # Dates are compared as strings, which is chronological only for
        # ISO-style 'YYYY-MM-DD' values (the format the defaults use).
        dates = df['Date']
        # .copy() gives each window its own frame so test_*() can add the
        # prediction column without a SettingWithCopyWarning.
        # One-month prediction
        self.test_1_df = df[(dates >= test_start) & (dates <= test_1_end)].copy()
        # Three-month prediction
        self.test_2_df = df[(dates >= test_start) & (dates <= test_2_end)].copy()

        # Fit and validate only on rows that precede the forecast windows;
        # splitting the whole file would put the test rows into the training
        # set and make the 1-/3-month metrics meaningless.
        history = df[dates < test_start]
        if history.empty:
            raise ValueError(f"no rows dated before {test_start} to train on")
        X = history.drop(columns_excluded, axis=1)
        y = history['Death_Pct']

        self.X_train, self.X_valid, self.y_train, self.y_valid = \
            train_test_split(X, y, test_size=0.2, random_state=42)

        self.X_test_1 = self.test_1_df.drop(columns_excluded, axis=1)
        self.X_test_2 = self.test_2_df.drop(columns_excluded, axis=1)
        self.y_test_1 = self.test_1_df['Death_Pct']
        self.y_test_2 = self.test_2_df['Death_Pct']

    def train(self):
        """Fit the tree, log it to MLflow and print the five most important features."""
        model = DecisionTreeRegressor(random_state=42)
        model.fit(self.X_train, self.y_train)
        self.model = model
        mlflow.log_params(model.get_params())

        rmse, r_squared = self._score(self.y_valid, model.predict(self.X_valid))
        mlflow.log_metrics({"RMSE": rmse, "R2": r_squared})
        mlflow.sklearn.log_model(
            sk_model=model,
            signature=infer_signature(self.X_train, model.predict(self.X_train)),
            artifact_path="model",
            # A handful of rows is enough to document the input schema; the
            # full training frame would be stored with the model otherwise.
            input_example=self.X_train.head(5),
            registered_model_name="DecisionTreeModel",
        )

        # Rank features by impurity-based importance, highest first.
        feature_importances = sorted(
            zip(self.X_train.columns, model.feature_importances_),
            key=lambda pair: pair[1],
            reverse=True,
        )
        print(f"{'Top 5 Features':<35} {'Significance':<15}\n")
        # Exactly five rows, matching the header above.
        for name, importance in feature_importances[:5]:
            print(f"{name:<35} {round(importance, 3):<15}")

    def valid(self):
        """Score the fitted model on the validation split; return ``(rmse, r2)``."""
        rmse, r_squared = self._score(self.y_valid, self.model.predict(self.X_valid))

        # Log results in MLflow
        mlflow.log_metric("Validation RMSE", rmse)
        mlflow.log_metric("Validation R2", r_squared)

        return rmse, r_squared

    def test_1mo(self):
        """Predict the one-month window; return its frame with ``Predicted_Pct`` added."""
        predictions = self.model.predict(self.X_test_1)
        self.test_1_df['Predicted_Pct'] = predictions

        rmse, r_squared = self._score(self.y_test_1, predictions)
        print("1-Month Prediction RMSE: ", rmse)
        print("1-Month Prediction R2: ", r_squared)

        # Log results in MLflow
        mlflow.log_metric("1-Month Prediction RMSE", rmse)
        mlflow.log_metric("1-Month Prediction R2", r_squared)

        return self.test_1_df

    def test_3mo(self):
        """Predict the three-month window; return its frame with ``Predicted_Pct`` added."""
        predictions = self.model.predict(self.X_test_2)
        self.test_2_df['Predicted_Pct'] = predictions

        rmse, r_squared = self._score(self.y_test_2, predictions)
        print("3-Month Prediction RMSE: ", rmse)
        print("3-Month Prediction R2: ", r_squared)

        # Log results in MLflow
        mlflow.log_metric("3-Month Prediction RMSE", rmse)
        mlflow.log_metric("3-Month Prediction R2", r_squared)

        return self.test_2_df
with mlflow.start_run():

    model = DecisionTreeModel()
    mlflow.set_tag("mlflow.runName", "DecisionTreeModel")

    model.load_data('merged_data_new.csv')
    model.train()

    print("\nValidation:")
    rmse, r2 = model.valid()
    print(f"RMSE: {rmse}\nR2 = {r2}\n")

    # Identifier columns plus actual and predicted values, as written to disk.
    report_columns = ['Date', 'FIPS', 'Recip_County', 'Recip_State', 'Death_Pct', 'Predicted_Pct']

    # This section scores the one-month test window, so label it as such
    # (same headings as the linear-regression cell).
    print("1-Month Prediction:")
    df_1mo = model.test_1mo()[report_columns]

    print("3-Month Prediction:")
    # Wall-clock time of the three-month prediction step (scoring included).
    start_time = time.time()
    df_3mo = model.test_3mo()
    end = time.time() - start_time
    mlflow.log_metric("3-Month Prediction Time", end)
    df_3mo = df_3mo[report_columns]

    df_1mo.to_csv('Decision-Tree-prediction_1mo.csv', index=False)
    df_3mo.to_csv('Decision-Tree-prediction_3mo.csv', index=False)




In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

class ElasticNetModel:
    """Elastic-net regression (default hyper-parameters) for predicting ``Death_Pct``.

    Intended call order: ``load_data`` -> ``train`` -> ``valid`` /
    ``test_1mo`` / ``test_3mo``. Parameters, metrics and the fitted model are
    logged to MLflow, so the methods should be called inside an MLflow run.
    """

    def __init__(self):
        self.model = None
        self.X_train = []
        self.y_train = []
        self.X_valid = []
        self.y_valid = []
        self.X_test_1 = []
        self.X_test_2 = []
        self.y_test_1 = []
        self.y_test_2 = []
        # Full rows of the two forecast windows; filled in by load_data().
        self.test_1_df = None
        self.test_2_df = None

    @staticmethod
    def _score(y_true, predictions):
        """Return ``(rmse, r2)`` for the given targets and predictions."""
        # Take the square root of the MSE rather than passing ``squared=False``:
        # that keyword was deprecated in scikit-learn 1.4 and later removed.
        rmse = mean_squared_error(y_true, predictions) ** 0.5
        return rmse, r2_score(y_true, predictions)

    def load_data(self, file, test_start='2022-04-01',
                  test_1_end='2022-04-30', test_2_end='2022-06-30'):
        """Read the CSV and build train/validation splits plus two test windows.

        Args:
            file: Path or buffer accepted by ``pandas.read_csv``.
            test_start: First date (inclusive) of both forecast windows.
            test_1_end: Last date (inclusive) of the one-month window.
            test_2_end: Last date (inclusive) of the three-month window.

        Raises:
            ValueError: If no rows are dated before ``test_start``.
        """
        df = pd.read_csv(file)
        columns_excluded = ['Date', 'FIPS', 'MMWR_week', 'Recip_County', 'Recip_State', 'Death_Pct']

        # Dates are compared as strings, which is chronological only for
        # ISO-style 'YYYY-MM-DD' values (the format the defaults use).
        dates = df['Date']
        # .copy() gives each window its own frame so test_*() can add the
        # prediction column without a SettingWithCopyWarning.
        # One-month prediction
        self.test_1_df = df[(dates >= test_start) & (dates <= test_1_end)].copy()
        # Three-month prediction
        self.test_2_df = df[(dates >= test_start) & (dates <= test_2_end)].copy()

        # Fit and validate only on rows that precede the forecast windows;
        # splitting the whole file would put the test rows into the training
        # set and make the 1-/3-month metrics meaningless.
        history = df[dates < test_start]
        if history.empty:
            raise ValueError(f"no rows dated before {test_start} to train on")
        X = history.drop(columns_excluded, axis=1)
        y = history['Death_Pct']

        self.X_train, self.X_valid, self.y_train, self.y_valid = \
            train_test_split(X, y, test_size=0.2, random_state=42)

        self.X_test_1 = self.test_1_df.drop(columns_excluded, axis=1)
        self.X_test_2 = self.test_2_df.drop(columns_excluded, axis=1)
        self.y_test_1 = self.test_1_df['Death_Pct']
        self.y_test_2 = self.test_2_df['Death_Pct']

    def train(self):
        """Fit the elastic net and log parameters, validation metrics and model to MLflow."""
        model = ElasticNet()
        model.fit(self.X_train, self.y_train)
        self.model = model
        mlflow.log_params(model.get_params())

        rmse, r_squared = self._score(self.y_valid, model.predict(self.X_valid))
        mlflow.log_metrics({"RMSE": rmse, "R2": r_squared})
        mlflow.sklearn.log_model(
            sk_model=model,
            signature=infer_signature(self.X_train, model.predict(self.X_train)),
            artifact_path="model",
            # A handful of rows is enough to document the input schema; the
            # full training frame would be stored with the model otherwise.
            input_example=self.X_train.head(5),
            registered_model_name="ElasticNet",
        )

    def valid(self):
        """Score the fitted model on the validation split; return ``(rmse, r2)``."""
        rmse, r_squared = self._score(self.y_valid, self.model.predict(self.X_valid))

        # Log results in MLflow
        mlflow.log_metric("Validation RMSE", rmse)
        mlflow.log_metric("Validation R2", r_squared)

        return rmse, r_squared

    def test_1mo(self):
        """Predict the one-month window; return its frame with ``Predicted_Pct`` added."""
        predictions = self.model.predict(self.X_test_1)
        self.test_1_df['Predicted_Pct'] = predictions

        rmse, r_squared = self._score(self.y_test_1, predictions)
        print("1-Month Prediction RMSE: ", rmse)
        print("1-Month Prediction R2: ", r_squared)

        # Log results in MLflow
        mlflow.log_metric("1-Month Prediction RMSE", rmse)
        mlflow.log_metric("1-Month Prediction R2", r_squared)

        return self.test_1_df

    def test_3mo(self):
        """Predict the three-month window; return its frame with ``Predicted_Pct`` added."""
        predictions = self.model.predict(self.X_test_2)
        self.test_2_df['Predicted_Pct'] = predictions

        rmse, r_squared = self._score(self.y_test_2, predictions)
        print("3-Month Prediction RMSE: ", rmse)
        print("3-Month Prediction R2: ", r_squared)

        # Log results in MLflow
        mlflow.log_metric("3-Month Prediction RMSE", rmse)
        mlflow.log_metric("3-Month Prediction R2", r_squared)

        return self.test_2_df
with mlflow.start_run():

    model = ElasticNetModel()
    mlflow.set_tag("mlflow.runName", "ElasticNet")

    model.load_data('merged_data_new.csv')
    model.train()

    print("\nValidation:")
    rmse, r2 = model.valid()
    print(f"RMSE: {rmse}\nR2 = {r2}\n")

    # Identifier columns plus actual and predicted values, as written to disk.
    report_columns = ['Date', 'FIPS', 'Recip_County', 'Recip_State', 'Death_Pct', 'Predicted_Pct']

    # This section scores the one-month test window, so label it as such
    # (same headings as the linear-regression cell).
    print("1-Month Prediction:")
    df_1mo = model.test_1mo()[report_columns]

    print("3-Month Prediction:")
    # Wall-clock time of the three-month prediction step (scoring included).
    start_time = time.time()
    df_3mo = model.test_3mo()
    end = time.time() - start_time
    mlflow.log_metric("3-Month Prediction Time", end)
    df_3mo = df_3mo[report_columns]

    df_1mo.to_csv('Elastic-Net-prediction_1mo.csv', index=False)
    df_3mo.to_csv('Elastic-Net-prediction_3mo.csv', index=False)




In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
import mlflow.keras
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import pickle
class LSTM_Model:
    """Two-layer LSTM that predicts ``Death_Pct`` from a sliding window of past rows.

    ``load_data`` builds per-county windows of ``look_back`` consecutive rows,
    ``train`` fits and logs the network, ``valid`` / ``test`` score it.
    """

    def __init__(self):
        self.model = None
        self.data = []
        self.X_train = []
        self.y_train = []
        self.X_valid = []
        self.y_valid = []
        self.X_test = []
        self.y_test = []
        self.look_back = 0
        self.epochs = 0
        # One [FIPS, Date, actual] entry per test window, in X_test order.
        self.y_test_label = []

    def load_data(self, file):
        """Read the CSV, min-max scale the numeric columns and build the window sets.

        Only counties whose latest row (on or before 2022-06-29) is dated
        exactly 2022-06-29 are kept.

        Args:
            file: Path or buffer accepted by ``pandas.read_csv``.

        Raises:
            ValueError: If no county has enough rows to produce windows.
        """
        def split_data(data, target_column, train_pct=0.8, validation_pct=0.1, look_back=3):
            """Turn ``{county: frame}`` into stacked train/valid/test window arrays.

            Per county the last three rows are test targets and the remaining
            rows are divided ``train_pct`` / rest into train and validation
            targets. ``validation_pct`` is accepted but not used.
            """
            n_test = 3  # rows per county held out as test targets

            def windows(frame, start, stop):
                """Collect (look_back rows, next-row target) pairs for offsets start..stop-1."""
                sequences = []
                labels = []
                for i in range(start, stop):
                    sequences.append(frame.iloc[i:(i + look_back)].values)
                    labels.append(frame.iloc[i + look_back][target_column])
                return sequences, labels

            X_train, y_train = [], []
            X_valid, y_valid = [], []
            X_test, y_test = [], []
            # Iterate the argument itself (the loop variable no longer shadows it).
            for county_frame in data.values():
                cdata = county_frame.drop(columns=["FIPS", "Date"])
                train_size = int((len(cdata) - n_test) * train_pct)
                if train_size <= look_back:
                    continue
                validation_size = len(cdata) - n_test - train_size
                if validation_size == 0 or train_size + validation_size >= len(cdata):
                    continue

                # Window start offsets at which each partition ends.
                train_end = train_size - look_back
                valid_end = train_size + validation_size - look_back
                test_end = len(cdata) - look_back

                train_sequences, train_labels = windows(cdata, 0, train_end)
                valid_sequences, valid_labels = windows(cdata, train_end, valid_end)
                test_sequences, test_labels = windows(cdata, valid_end, test_end)

                # Remember which county/date every test target belongs to.
                for offset, label in zip(range(valid_end, test_end), test_labels):
                    target_row = county_frame.iloc[offset + look_back]
                    self.y_test_label.append([target_row["FIPS"], target_row["Date"], label])

                X_train.append(np.array(train_sequences))
                y_train.append(np.array(train_labels))
                X_valid.append(np.array(valid_sequences))
                y_valid.append(np.array(valid_labels))
                X_test.append(np.array(test_sequences))
                y_test.append(np.array(test_labels))

            if not X_train:
                raise ValueError("no county has enough rows to build training windows")

            return (np.concatenate(X_train), np.concatenate(y_train),
                    np.concatenate(X_valid), np.concatenate(y_valid),
                    np.concatenate(X_test), np.concatenate(y_test))

        df = pd.read_csv(file)
        df = df[df["Date"] <= "2022-06-29"]
        latest_dates = df.groupby('FIPS')['Date'].max()
        counties_to_keep = latest_dates[latest_dates == '2022-06-29'].index
        # Sort into a new frame instead of in place on a filtered slice.
        df = df[df['FIPS'].isin(counties_to_keep)].sort_values(['FIPS', 'Date'])

        # NOTE(review): 'Deaths_Pct' differs from the target name 'Death_Pct'
        # used below, so the target is min-max scaled together with the
        # features and all scores are in scaled units - confirm this is intended.
        columns_excluded = ['Date', 'FIPS', 'MMWR_week', 'Recip_County', 'Recip_State', 'Deaths_Pct']
        scaler = MinMaxScaler()
        df_scaled = df.copy()
        scaled_columns = df.columns.difference(columns_excluded)
        df_scaled[scaled_columns] = scaler.fit_transform(df[scaled_columns])

        county_data = {county: grp for county, grp in df_scaled.drop(columns=['MMWR_week', 'Recip_County', 'Recip_State']).groupby("FIPS")}
        self.data = county_data

        self.look_back = 3
        target_column = 'Death_Pct'
        # Start from a clean slate so calling load_data() again does not
        # append a second set of test labels.
        self.y_test_label = []
        self.X_train, self.y_train, self.X_valid, self.y_valid, self.X_test, self.y_test = split_data(self.data, target_column, look_back=self.look_back)

    def train(self, optimizer='adam', loss='mean_squared_error', batch_size=32, epochs=100):
        """Build, fit and log the LSTM network.

        Args:
            optimizer: Keras optimizer passed to ``compile``.
            loss: Keras loss passed to ``compile``.
            batch_size: Mini-batch size for ``fit``.
            epochs: Number of training epochs.
        """
        self.epochs = epochs
        model = Sequential()
        # Input is (look_back time steps, number of columns per row).
        model.add(LSTM(100, return_sequences=True, input_shape=(self.look_back, self.X_train.shape[2])))
        model.add(LSTM(100))
        model.add(Dense(25))
        model.add(Dense(1))
        model.compile(optimizer=optimizer, loss=loss)
        self.model = model

        self.model.fit(self.X_train, self.y_train, batch_size=batch_size, epochs=epochs, validation_split=0)
        # Local snapshot in the working directory.
        # NOTE(review): pickling Keras models may not work on every
        # TensorFlow version; the MLflow artifact below is logged separately.
        with open("model.pkl", "wb") as f:
            pickle.dump(self.model, f)
        mlflow.log_params({"optimizer": optimizer, "loss": loss, "batch_size": batch_size, "epochs": epochs})
        mlflow.keras.log_model(
            model=model,
            artifact_path="model",
            registered_model_name="LSTM",
        )

    def valid(self):
        """Score the network on the validation windows; return ``(rmse, r2)``."""
        predictions = self.model.predict(self.X_valid)
        # Square root of the MSE instead of ``squared=False``, which was
        # deprecated in scikit-learn 1.4 and later removed.
        rmse = mean_squared_error(self.y_valid, predictions) ** 0.5
        r_squared = r2_score(self.y_valid, predictions)
        return rmse, r_squared

    def test(self):
        """Score the network on the test windows and write the predictions to disk.

        Writes ``prediction.csv`` (FIPS, Date, y, y_predict) and
        ``y_test_epoch_<epochs>.npy``; returns ``(rmse, r2)``.
        """
        predictions = self.model.predict(self.X_test)
        # Build the output rows without modifying self.y_test_label, so the
        # method can be called more than once.
        rows = [list(label) + list(prediction)
                for label, prediction in zip(self.y_test_label, predictions)]
        df = pd.DataFrame(rows, columns=["FIPS", "Date", "y", "y_predict"])
        df.to_csv("prediction.csv")

        np.save(f"y_test_epoch_{self.epochs}.npy", predictions)
        rmse = mean_squared_error(self.y_test, predictions) ** 0.5
        r_squared = r2_score(self.y_test, predictions)
        return rmse, r_squared

    

with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "LSTM")

    model = LSTM_Model()
    model.load_data("./merged_data_new.csv")
    model.train(epochs=100)

    # Time only the test step (prediction, file output and scoring).
    start = time.time()
    rmse, r2 = model.test()
    end = time.time() - start

    # Record the timing and both test scores, in this order.
    for metric_name, metric_value in (
        ("3-Month Prediction Time", end),
        ("RMSE", rmse),
        ("R2", r2),
    ):
        mlflow.log_metric(metric_name, metric_value)

    print(f"rmse={rmse}\n, r2={r2}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

class RandomForestModel:
    """Random forest for ``Death_Pct`` with hyper-parameters chosen by randomized search.

    Intended call order: ``load_data`` -> ``train`` -> ``valid`` /
    ``test_1mo`` / ``test_3mo``. Parameters, metrics and the fitted model are
    logged to MLflow, so the methods should be called inside an MLflow run.
    """

    def __init__(self):
        self.model = None
        self.X_train = []
        self.y_train = []
        self.X_valid = []
        self.y_valid = []
        self.X_test_1 = []
        self.X_test_2 = []
        self.y_test_1 = []
        self.y_test_2 = []
        # Full rows of the two forecast windows; filled in by load_data().
        self.test_1_df = None
        self.test_2_df = None

    @staticmethod
    def _score(y_true, predictions):
        """Return ``(rmse, r2)`` for the given targets and predictions."""
        # Take the square root of the MSE rather than passing ``squared=False``:
        # that keyword was deprecated in scikit-learn 1.4 and later removed.
        rmse = mean_squared_error(y_true, predictions) ** 0.5
        return rmse, r2_score(y_true, predictions)

    def load_data(self, file, test_start='2022-04-01',
                  test_1_end='2022-04-30', test_2_end='2022-06-30'):
        """Read the CSV and build train/validation splits plus two test windows.

        Args:
            file: Path or buffer accepted by ``pandas.read_csv``.
            test_start: First date (inclusive) of both forecast windows.
            test_1_end: Last date (inclusive) of the one-month window.
            test_2_end: Last date (inclusive) of the three-month window.

        Raises:
            ValueError: If no rows are dated before ``test_start``.
        """
        df = pd.read_csv(file)
        columns_excluded = ['Date', 'FIPS', 'MMWR_week', 'Recip_County', 'Recip_State', 'Death_Pct']

        # Dates are compared as strings, which is chronological only for
        # ISO-style 'YYYY-MM-DD' values (the format the defaults use).
        dates = df['Date']
        # .copy() gives each window its own frame so test_*() can add the
        # prediction column without a SettingWithCopyWarning.
        # One-month prediction
        self.test_1_df = df[(dates >= test_start) & (dates <= test_1_end)].copy()
        # Three-month prediction
        self.test_2_df = df[(dates >= test_start) & (dates <= test_2_end)].copy()

        # Fit and validate only on rows that precede the forecast windows;
        # splitting the whole file would put the test rows into the training
        # set and make the 1-/3-month metrics meaningless.
        history = df[dates < test_start]
        if history.empty:
            raise ValueError(f"no rows dated before {test_start} to train on")
        X = history.drop(columns_excluded, axis=1)
        y = history['Death_Pct']

        self.X_train, self.X_valid, self.y_train, self.y_valid = \
            train_test_split(X, y, test_size=0.2, random_state=42)

        self.X_test_1 = self.test_1_df.drop(columns_excluded, axis=1)
        self.X_test_2 = self.test_2_df.drop(columns_excluded, axis=1)
        self.y_test_1 = self.test_1_df['Death_Pct']
        self.y_test_2 = self.test_2_df['Death_Pct']

    def train(self):
        """Search hyper-parameters, refit the best forest and log it to MLflow."""
        rs_space = {
            'max_depth': list(np.arange(10, 100, step=10)) + [None],
            'n_estimators': np.arange(10, 500, step=50),
            # NOTE(review): draws 18..42; assumes the feature matrix has at
            # least 42 columns - confirm against the dataset.
            'max_features': randint(18, 43),
            'criterion': ['squared_error', 'poisson', 'absolute_error', 'friedman_mse'],
            'min_samples_leaf': randint(1, 4),
            'min_samples_split': np.arange(2, 10, step=2),
        }
        # 'accuracy' is a classification metric and cannot score continuous
        # targets; rank candidates by (negated) RMSE instead.
        rf_random = RandomizedSearchCV(
            RandomForestRegressor(),
            rs_space,
            n_iter=500,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            cv=3,
            random_state=42,
        )
        model_random = rf_random.fit(self.X_train, self.y_train)
        print('Best random search hyperparameters are: '+str(model_random.best_params_))
        print('Best random search score is: '+str(model_random.best_score_))

        # Unpack the winning settings as keyword arguments; the estimator has
        # no ``params`` argument.
        model = RandomForestRegressor(random_state=42, **model_random.best_params_)
        model.fit(self.X_train, self.y_train)
        self.model = model
        mlflow.log_params(model.get_params())

        rmse, r_squared = self._score(self.y_valid, model.predict(self.X_valid))
        mlflow.log_metrics({"RMSE": rmse, "R2": r_squared})
        mlflow.sklearn.log_model(
            sk_model=model,
            signature=infer_signature(self.X_train, model.predict(self.X_train)),
            artifact_path="model",
            # A handful of rows is enough to document the input schema; the
            # full training frame would be stored with the model otherwise.
            input_example=self.X_train.head(5),
            registered_model_name="RandomForestModel-Tuned",
        )

        # Rank features by impurity-based importance, highest first.
        feature_importances = sorted(
            zip(self.X_train.columns, model.feature_importances_),
            key=lambda pair: pair[1],
            reverse=True,
        )
        print(f"{'Top 5 Features':<35} {'Significance':<15}\n")
        # Exactly five rows, matching the header above.
        for name, importance in feature_importances[:5]:
            print(f"{name:<35} {round(importance, 3):<15}")

    def valid(self):
        """Score the fitted model on the validation split; return ``(rmse, r2)``."""
        rmse, r_squared = self._score(self.y_valid, self.model.predict(self.X_valid))

        # Log results in MLflow
        mlflow.log_metric("Validation RMSE", rmse)
        mlflow.log_metric("Validation R2", r_squared)

        return rmse, r_squared

    def test_1mo(self):
        """Predict the one-month window; return its frame with ``Predicted_Pct`` added."""
        predictions = self.model.predict(self.X_test_1)
        self.test_1_df['Predicted_Pct'] = predictions

        rmse, r_squared = self._score(self.y_test_1, predictions)
        print("1-Month Prediction RMSE: ", rmse)
        print("1-Month Prediction R2: ", r_squared)

        # Log results in MLflow
        mlflow.log_metric("1-Month Prediction RMSE", rmse)
        mlflow.log_metric("1-Month Prediction R2", r_squared)

        return self.test_1_df

    def test_3mo(self):
        """Predict the three-month window; return its frame with ``Predicted_Pct`` added."""
        predictions = self.model.predict(self.X_test_2)
        self.test_2_df['Predicted_Pct'] = predictions

        rmse, r_squared = self._score(self.y_test_2, predictions)
        print("3-Month Prediction RMSE: ", rmse)
        print("3-Month Prediction R2: ", r_squared)

        # Log results in MLflow
        mlflow.log_metric("3-Month Prediction RMSE", rmse)
        mlflow.log_metric("3-Month Prediction R2", r_squared)

        return self.test_2_df
    
with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "RandomForestModel Tuned")
    model = RandomForestModel()
    model.load_data('merged_data_new.csv')
    model.train()

    print("\nValidation:")
    rmse, r2 = model.valid()
    print(f"RMSE: {rmse}\nR2 = {r2}\n")

    # Identifier columns plus actual and predicted values, as written to disk.
    report_columns = ['Date', 'FIPS', 'Recip_County', 'Recip_State', 'Death_Pct', 'Predicted_Pct']

    # This section scores the one-month test window, so label it as such
    # (same headings as the linear-regression cell).
    print("1-Month Prediction:")
    df_1mo = model.test_1mo()[report_columns]

    print("3-Month Prediction:")
    # Wall-clock time of the three-month prediction step (scoring included).
    start_time = time.time()
    df_3mo = model.test_3mo()
    end = time.time() - start_time
    mlflow.log_metric("3-Month Prediction Time", end)
    df_3mo = df_3mo[report_columns]

    df_1mo.to_csv('Random-Forest-Tuned-prediction_1mo.csv', index=False)
    df_3mo.to_csv('Random-Forest-Tuned-prediction_3mo.csv', index=False)




In [None]:
# Load the merged dataset on its own to inspect its dimensions.
df = pd.read_csv('merged_data_new.csv')

# Bare last expression: the notebook displays the (rows, columns) tuple.
df.shape