In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from xgboost import XGBRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge, RidgeCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
#Read in the cleaned csv
cleaned_df = pd.read_csv('Data/df2.csv')

In [3]:
#Inspect the columns and rows
cleaned_df

Unnamed: 0.1,Unnamed: 0,Date,Location,Demographic_Category,census,Not_Vaccinated,Administered_Dose1,Series_Complete_Yes
0,0,2021-12-30,1,1,52055.0,3116.0,48939.0,42161.0
1,1,2021-12-30,2,1,481323.0,123150.0,358173.0,262023.0
2,2,2021-12-30,35,26,135005.0,103698.0,31307.0,20149.0
3,3,2021-12-30,36,26,133254.0,120857.0,12397.0,6750.0
4,4,2021-12-30,37,28,526217.0,105711.0,420506.0,358267.0
...,...,...,...,...,...,...,...,...
77395,77395,2021-12-01,45,11,10184.0,10184.0,0.0,0.0
77396,77396,2021-12-01,17,46,3005.0,795.0,2210.0,2075.0
77397,77397,2021-12-01,50,46,6524.0,1830.0,4694.0,4018.0
77398,77398,2021-12-01,14,9,40789796.0,8044258.0,32745538.0,29013930.0


In [4]:
# Drop unneccessary column
cleaned_df.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [5]:
#Re-inspect the columns once again
cleaned_df.columns

Index(['Date', 'Location', 'Demographic_Category', 'census', 'Not_Vaccinated',
       'Administered_Dose1', 'Series_Complete_Yes'],
      dtype='object')

In [6]:
#Check the data types of each column
cleaned_df.dtypes

Date                     object
Location                  int64
Demographic_Category      int64
census                  float64
Not_Vaccinated          float64
Administered_Dose1      float64
Series_Complete_Yes     float64
dtype: object

In [7]:
# Transforming the Date column from object to float
cleaned_df['Date'] = pd.to_numeric(cleaned_df['Date'],errors='coerce')

In [8]:
# Check if there are any missing data
cleaned_df.isna().sum()

Date                    77400
Location                    0
Demographic_Category        0
census                   3600
Not_Vaccinated           3600
Administered_Dose1          0
Series_Complete_Yes         0
dtype: int64

In [9]:
#Check the data types again
cleaned_df.dtypes

Date                    float64
Location                  int64
Demographic_Category      int64
census                  float64
Not_Vaccinated          float64
Administered_Dose1      float64
Series_Complete_Yes     float64
dtype: object

In [10]:
#Check the shape again
cleaned_df.shape

(77400, 7)

# Modeling 
### Using the pipeline that encompasses standardscaler, polynomialfeatures, RFE and Ridge regression

In [46]:
# Selecting the features to be put into the model
features = cleaned_df[['census','Location','Demographic_Category','Not_Vaccinated','Date']]

#Set our X (features to be considered) and y (The target column that we want to predict)
X = features
y = cleaned_df['Series_Complete_Yes']

In [47]:
# Set up the pipeline
pipe = make_pipeline(StandardScaler(),PolynomialFeatures(), RFE(Ridge()), Ridge(max_iter=10_000))

In [48]:
#Train, test split
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.33, random_state=42)

In [49]:
#Double check to make sure if there are any missing values
X.isna().sum()

census                  3600
Location                   0
Demographic_Category       0
Not_Vaccinated             0
Date                       0
dtype: int64

In [50]:
cleaned_df['Not_Vaccinated'].replace({np.nan:0},inplace=True) 

In [51]:
cleaned_df['Date'].replace({np.nan:0},inplace=True) 

In [52]:
#Fill in the missing values in the training set
my_imputer = SimpleImputer(missing_values = np.nan, strategy ='constant', fill_value=0)
 
# Fitting the data to the imputer object
imputed_X = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)

In [53]:
#Double check to make sure if there are any missing values
y.isna().sum()

0

In [54]:
#Check the train.shape one more time
imputed_X.shape

(51858, 5)

In [55]:
#Check the test.shape one more time
imputed_X_test.shape

(25542, 5)

In [56]:
#Fit the pipe into our data
pipe.fit(imputed_X, y_train)

# Model Evalaution / Metrics

In [57]:
# Training score
pipe.score(imputed_X, y_train)

0.9997409821159169

In [58]:
#Testing score
pipe.score(imputed_X_test, y_test)

0.9998066665714277

# Prediction of y

In [59]:
y_pred = pipe.predict(imputed_X)

In [60]:
y_pred[:10]

array([2.03502322e+05, 9.35516249e+04, 2.11010520e+05, 2.43546696e+05,
       1.69372718e+03, 2.99265630e+06, 8.72223110e+03, 3.62528997e+05,
       6.66055768e+04, 1.72340406e+05])

# Dummy Regressor -> Find out the Baseline Mean Square Error

In [61]:
from sklearn.dummy import DummyRegressor

In [62]:
lr = LinearRegression().fit(imputed_X, y_train)

In [63]:
lr_dummy_mean = DummyRegressor(strategy = 'mean').fit(imputed_X, y_train)
  
lr_dummy_median = DummyRegressor(strategy = 'median').fit(imputed_X, y_train)

In [64]:
y_predict = lr.predict(imputed_X_test)

In [65]:
y_predict_dummy_mean = lr_dummy_mean.predict(imputed_X_test)
y_predict_dummy_median = lr_dummy_median.predict(imputed_X_test)

In [66]:
print('Linear model, coefficients: ', lr.coef_)
print("Mean squared error (dummy): {:.2f}".format(mean_squared_error(y_test, y_predict_dummy_mean, squared=False)))

Linear model, coefficients:  [ 0.8842284  -8.91044845 26.94832732 -0.9292501   0.        ]
Mean squared error (dummy): 4033958.63


# Root Mean Squared Error of the Pipeline

In [67]:
# Training RMSE
print('train:', mean_squared_error(y_train, pipe.predict(imputed_X), squared=False))

# Testing RMSE
print('test:', mean_squared_error(y_test, pipe.predict(imputed_X_test), squared=False))

train: 67150.65489625283
test: 56089.29249179605


# Cross val score

In [68]:
scores = cross_val_score(estimator=pipe, X=imputed_X, y=y_train, cv=5)
scores

array([0.9997245 , 0.99978726, 0.99967498, 0.99966866, 0.9997964 ])

In [69]:
# Generting cross validated predictions 
scores[:5]

array([0.9997245 , 0.99978726, 0.99967498, 0.99966866, 0.9997964 ])

In [70]:
# get the standard deviation from cross validated testing score
scores.std()

5.386410461939648e-05

In [71]:
# Running cross validation on the training subset
# cv changes the number of folds
cross_val_score(pipe, imputed_X, y_train, cv=7).mean()

0.9997144238972137

# R2 score

In [72]:
from sklearn import metrics

print('r2 score of the ridge regression: ',metrics.r2_score(y_train, y_pred))

print('mean squared error on the training: ', metrics.mean_squared_error(y_train, y_pred,squared=False))

r2 score of the ridge regression:  0.9997409821159169
mean squared error on the training:  67150.65489625283


# Baseline for y

In [73]:
# Baseline
y.mean()

666125.5059302326