In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge, RidgeCV


In [2]:
df = pd.read_csv('COVID-19_Vaccination_Age_and_Sex_Trends_in_the_United_States__National_and_Jurisdictional.csv')

In [3]:
df

Unnamed: 0,Date,Location,Demographic_Category,census,Administered_Dose1,Series_Complete_Yes,Booster_Doses,Second_Booster,Administered_Dose1_pct_agegroup,Series_Complete_Pop_pct_agegroup,Booster_Doses_Vax_pct_agegroup,Second_Booster_Vax_pct_agegroup
0,12/07/2022 12:00:00 AM,SD,Female_Ages_25-39_yrs,82026.0,70565.0,54893.0,19864.0,,86.0,66.9,36.2,
1,12/07/2022 12:00:00 AM,WI,Female_Ages_25-39_yrs,545614.0,425661.0,380299.0,194014.0,,78.0,69.7,51.0,
2,12/07/2022 12:00:00 AM,IN,Sex_Female,3411859.0,2257339.0,2040656.0,1069660.0,296972.0,66.2,59.8,52.5,43.3
3,12/07/2022 12:00:00 AM,UT,Sex_Female,1591041.0,1208999.0,1081849.0,556474.0,135224.0,76.0,68.0,51.6,49.9
4,12/07/2022 12:00:00 AM,RI,Female_Ages_65+_yrs,105649.0,127869.0,109840.0,89614.0,55490.0,95.0,95.0,81.6,61.9
...,...,...,...,...,...,...,...,...,...,...,...,...
1870495,12/13/2020 12:00:00 AM,VA,Female_Ages_<2yrs,96647.0,,,,,,,,
1870496,12/13/2020 12:00:00 AM,OK,Male_Ages_<2yrs,50289.0,,,,,,,,
1870497,12/13/2020 12:00:00 AM,IL,Male_Ages_<2yrs,147954.0,,,,,,,,
1870498,12/13/2020 12:00:00 AM,ME,Male_Ages_65-74_yrs,80965.0,35.0,11.0,,,0.0,0.0,,


In [4]:
#Re-inspect the columns once again
df.columns

Index(['Date', 'Location', 'Demographic_Category', 'census',
       'Administered_Dose1', 'Series_Complete_Yes', 'Booster_Doses',
       'Second_Booster', 'Administered_Dose1_pct_agegroup',
       'Series_Complete_Pop_pct_agegroup', 'Booster_Doses_Vax_pct_agegroup',
       'Second_Booster_Vax_pct_agegroup'],
      dtype='object')

In [5]:
#Check the data types of each columns
df.dtypes

Date                                 object
Location                             object
Demographic_Category                 object
census                              float64
Administered_Dose1                  float64
Series_Complete_Yes                 float64
Booster_Doses                       float64
Second_Booster                      float64
Administered_Dose1_pct_agegroup     float64
Series_Complete_Pop_pct_agegroup    float64
Booster_Doses_Vax_pct_agegroup      float64
Second_Booster_Vax_pct_agegroup     float64
dtype: object

In [6]:
df['Date'] = pd.to_numeric(df['Date'],errors='coerce')

In [7]:
# Check if there are any missing data
df.isna().sum()

Date                                1870500
Location                                  0
Demographic_Category                      0
census                                87000
Administered_Dose1                   341559
Series_Complete_Yes                  410480
Booster_Doses                       1003015
Second_Booster                      1492490
Administered_Dose1_pct_agegroup      410142
Series_Complete_Pop_pct_agegroup     465713
Booster_Doses_Vax_pct_agegroup      1003015
Second_Booster_Vax_pct_agegroup     1492490
dtype: int64

# Modeling 
### Using the pipeline that encompasses standardscaler, polynomialfeatures, RFE and Ridge regression

In [22]:
df.dropna(subset = 'Series_Complete_Yes', inplace = True)

In [23]:
# Selecting the features to be put into the model
features = df[[ 'census', 'Administered_Dose1', 'Booster_Doses', 'Second_Booster',
                       'Administered_Dose1_pct_agegroup', 'Series_Complete_Pop_pct_agegroup', 'Booster_Doses_Vax_pct_agegroup',
                       'Second_Booster_Vax_pct_agegroup']]

#Set our X (features to be considered) and y (The target column that we want to predict)
X = features
y = df['Series_Complete_Yes']

In [24]:
pipe = Pipeline([
    ('ss',StandardScaler()),
    ('polynomial', PolynomialFeatures()),
    ('rfe', RFE(Ridge())),
    ('ridge', Ridge(max_iter=10_000))
])

In [25]:
# Set up the pipeline
#pipe = make_pipeline(StandardScaler(),PolynomialFeatures(), RFE(Ridge()), Ridge(max_iter=10_000))

In [26]:
#Train, test split
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.33, random_state=42)

# Impute missing values

In [27]:
#Fill in the missing values in the training set
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer(missing_values = np.nan, strategy ='constant', fill_value=0)
 
# Fitting the data to the imputer object
imputed_X = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)

In [28]:
#Check the train.shape one more time
y_train.shape

(978213,)

In [29]:
np.isnan(y_train).values.sum()

0

In [30]:
imputed_X = pd.DataFrame(imputed_X, columns = X_train.columns)

In [31]:
imputed_X.isna().sum()

census                              0
Administered_Dose1                  0
Booster_Doses                       0
Second_Booster                      0
Administered_Dose1_pct_agegroup     0
Series_Complete_Pop_pct_agegroup    0
Booster_Doses_Vax_pct_agegroup      0
Second_Booster_Vax_pct_agegroup     0
dtype: int64

In [32]:
imputed_X.describe()

Unnamed: 0,census,Administered_Dose1,Booster_Doses,Second_Booster,Administered_Dose1_pct_agegroup,Series_Complete_Pop_pct_agegroup,Booster_Doses_Vax_pct_agegroup,Second_Booster_Vax_pct_agegroup
count,978213.0,978213.0,978213.0,978213.0,978213.0,978213.0,978213.0,978213.0
mean,1284219.0,833247.6,228275.7,30132.92,60.237662,52.53818,22.772254,4.814985
std,7088527.0,4857733.0,1714938.0,369922.4,33.128321,32.336799,26.709765,12.627493
min,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,75500.0,22711.0,0.0,0.0,35.1,23.1,0.0,0.0
50%,268860.0,136289.0,2561.0,0.0,70.0,59.9,7.9,0.0
75%,687423.0,445349.0,88662.0,27.0,90.6,80.8,44.0,0.2
max,168546300.0,138390600.0,62660480.0,21090180.0,95.0,95.0,91.0,74.2


# Check if there are infinity values

In [33]:
imputed_X = imputed_X.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

In [34]:
#Check the test.shape one more time
imputed_X_test.shape

(481807, 8)

# Fit the pipeline into the data

In [35]:
#Fit the pipe into our data
pipe.fit(imputed_X, y_train)

Pipeline(steps=[('ss', StandardScaler()), ('polynomial', PolynomialFeatures()),
                ('rfe', RFE(estimator=Ridge())),
                ('ridge', Ridge(max_iter=10000))])

# The coefficients

In [38]:
pipe.named_steps['ridge'].coef_

array([ 9.76791415e+05,  2.26401095e+06,  1.53724477e+06, -1.96768517e+05,
        3.24847212e+05, -1.72045961e+05,  9.23959411e+02, -1.54352852e+06,
        2.30234324e+06,  4.92324888e+04, -6.66245707e+02,  2.30503268e+04,
       -2.37947015e+04, -1.06740581e+06,  6.44460564e+05, -7.08037673e+05,
        1.14014438e+04, -4.78205471e+04,  5.49356768e+04, -2.16613331e+03,
       -5.14916033e+03,  4.43383945e+03])

# Model Evaluation / Metrics

In [45]:
pipe.score(imputed_X, y_train)

0.9999964621922691

In [47]:
pipe.score(imputed_X_test, y_test)



0.9999963999692836

In [None]:
# For every standard deviation increases in the column, the target increases by the coefficients in value. 

# Prediction of y

In [48]:
#y_pred = pipe.predict(imputed_X)
y_pred = pipe.predict(imputed_X)

In [49]:
y_pred

array([274069.47193028,   4965.88553539,   4092.90534251, ...,
        53181.86107163, 107399.47391694,    386.39395037])

# Dummy Regressor -> Find out the Baseline Mean Square Error

In [50]:
from sklearn.dummy import DummyRegressor

In [52]:
#lr = LinearRegression().fit(imputed_X, y_train)
lr = LinearRegression().fit(imputed_X, y_train)

In [53]:
lr_dummy_mean = DummyRegressor(strategy = 'mean').fit(imputed_X, y_train)
 
lr_dummy_median = DummyRegressor(strategy = 'median').fit(imputed_X, y_train)


In [55]:
#y_predict = lr.predict(imputed_X_test)
y_predict = lr.predict(imputed_X_test)



In [56]:
y_predict_dummy_mean = lr_dummy_mean.predict(imputed_X_test)
y_predict_dummy_median = lr_dummy_median.predict(imputed_X_test)


In [57]:
print('Linear model, coefficients: ', lr.coef_)
print("Mean squared error (dummy): {:.2f}".format(mean_squared_error(y_test, y_predict_dummy_mean, squared=False)))

Linear model, coefficients:  [-5.38175526e-02  9.41706190e-01 -3.96130598e-02  5.16513508e-02
 -6.02510092e+03  6.38197057e+03 -1.49198670e+02 -5.39140102e+02]
Mean squared error (dummy): 4168354.72


# Root Mean Squared Error of Our Pipeline

In [58]:
# Training RMSE
print('train:', mean_squared_error(y_train, pipe.predict(imputed_X), squared=False))

# Testing RMSE
print('test:', mean_squared_error(y_test, pipe.predict(imputed_X_test), squared=False))

train: 7864.034843923834




test: 7908.930523191321


# Cross val score

In [59]:
scores = cross_val_score(estimator=pipe, X=imputed_X, y=y_train, cv=5)
scores

array([0.9999964 , 0.99999651, 0.99999628, 0.99999673, 0.99999628])

In [60]:
# Generting cross validated predictions 
scores[:5]

array([0.9999964 , 0.99999651, 0.99999628, 0.99999673, 0.99999628])

In [61]:
# get the standard deviation from cross validated testing score
scores.std()

1.657324108579051e-07

In [62]:
# Running cross validation on the training subset
# cv changes the number of folds
cross_val_score(pipe, imputed_X, y_train, cv=7).mean()

0.9999964442743714

# R2 score

In [63]:
from sklearn import metrics

print('r2 score of the ridge regression: ',metrics.r2_score(y_train, y_pred))

print('mean squared error on the training: ', metrics.mean_squared_error(y_train, y_pred,squared=False))

r2 score of the ridge regression:  0.9999964621922691
mean squared error on the training:  7864.034843923834


# Baseline for y

In [64]:
# Baseline
y.mean()

706570.0009369735