In [2]:
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import mean_squared_error

In [3]:
# Read the merged indicator table from disk and preview its first 30 rows.
data = pd.read_csv('merged_df.csv')
data.head(30)

Unnamed: 0,LocationAbbr,DataValue_dia01,YearStart,DataValue_dia02,DataValue_dia03,DataValue_dia04
0,AR,13.6,2019,11.9,105.4,1.9
1,AL,14.0,2019,6.8,67.1,2.3
2,CT,9.6,2019,11.9,61.6,1.3
3,CO,7.0,2019,7.4,78.1,1.5
4,GA,12.0,2019,10.2,70.4,1.4
5,FL,11.7,2019,9.9,80.7,1.2
6,IL,11.3,2019,9.9,66.5,1.3
7,LA,12.6,2019,7.9,87.1,1.6
8,KS,10.8,2019,10.9,85.4,1.8
9,KY,13.3,2019,10.9,155.9,3.4


In [91]:
# Pairwise correlations (pandas' default .corr()) between the four indicator columns.
indicator_cols = [f'DataValue_dia0{k}' for k in range(1, 5)]
correlation_matrix = data[indicator_cols].corr()
correlation_matrix


Unnamed: 0,DataValue_dia01,DataValue_dia02,DataValue_dia03,DataValue_dia04
DataValue_dia01,1.0,0.072711,0.384904,0.261314
DataValue_dia02,0.072711,1.0,0.066009,-0.063419
DataValue_dia03,0.384904,0.066009,1.0,0.683671
DataValue_dia04,0.261314,-0.063419,0.683671,1.0


In [92]:
# Calculate VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Design matrix: the three predictor indicators plus a constant column.
predictor_data = (
    data[['DataValue_dia02', 'DataValue_dia03', 'DataValue_dia04']]
    .assign(const=1)
)

# One VIF per design-matrix column (the constant included), in column order.
design = predictor_data.values
vif_values = []
for col_idx in range(design.shape[1]):
    vif_values.append(variance_inflation_factor(design, col_idx))

vifs = pd.DataFrame({'Variable': predictor_data.columns, 'VIF': vif_values})

vifs


Unnamed: 0,Variable,VIF
0,DataValue_dia02,1.027201
1,DataValue_dia03,1.92092
2,DataValue_dia04,1.920274
3,const,41.695021


# Linear Regression

In [93]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Features: three indicator values plus location abbreviation and year.
# Target: DataValue_dia01.
feature_cols = ['DataValue_dia02', 'DataValue_dia03', 'DataValue_dia04', 'LocationAbbr', 'YearStart']
X_train = data[feature_cols]
y_train = data['DataValue_dia01']

# One-hot encode the location column; every other column is passed through as-is.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), ['LocationAbbr'])],
    remainder='passthrough',
)

# Plain least-squares regression on top of the preprocessing step.
lr_model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LinearRegression())])
lr_model.fit(X_train, y_train)

# Single hand-written observation used to probe each fitted model.
test_data = pd.DataFrame([
    {
        'LocationAbbr': 'AL',
        'YearStart': 2019,
        'DataValue_dia02': 20,
        'DataValue_dia03': 70,
        'DataValue_dia04': 2,
    }
])

# Predict the target for the probe row and report it.
predicted_value = lr_model.predict(test_data)
print(f'Predicted DataValue_dia01: {predicted_value[0]}')


Predicted DataValue_dia01: 15.276269903247254


# Random Forest

In [94]:
from sklearn.ensemble import RandomForestRegressor

# Random forest (100 trees, fixed seed) behind the shared preprocessing step.
rf_steps = [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=0)),
]
rf_model = Pipeline(steps=rf_steps)
rf_model.fit(X_train, y_train)

# Prediction for the single probe row.
predicted_value = rf_model.predict(test_data)
print(f'Predicted DataValue_dia01: {predicted_value[0]}')


Predicted DataValue_dia01: 13.570999999999993


# Ridge Regression

In [95]:
from sklearn.linear_model import Ridge

# Ridge regression (alpha=1.0) behind the shared preprocessing step.
ridge_regressor = Ridge(alpha=1.0)
ridge_model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', ridge_regressor)])
ridge_model.fit(X_train, y_train)

# Prediction for the single probe row.
predicted_ridge = ridge_model.predict(test_data)
print(f'Predicted DataValue_dia01 using Ridge: {predicted_ridge[0]}')


Predicted DataValue_dia01 using Ridge: 14.083300703862164


# Lasso Regression

In [96]:
from sklearn.linear_model import Lasso
# Lasso regression (alpha=0.1) behind the shared preprocessing step.
lasso_regressor = Lasso(alpha=0.1)
lasso_model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', lasso_regressor)])
lasso_model.fit(X_train, y_train)

# Prediction for the single probe row.
predicted_lasso = lasso_model.predict(test_data)
print(f'Predicted DataValue_dia01 using Lasso: {predicted_lasso[0]}')


Predicted DataValue_dia01 using Lasso: 11.723822457711151


# Support Vector Regression

In [97]:
from sklearn.svm import SVR

# Support vector regression with an RBF kernel behind the shared preprocessing step.
# NOTE(review): the preprocessor passes the numeric columns through unscaled;
# RBF SVR is usually scale-sensitive, so consider adding a scaler - confirm.
svr_steps = [
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf')),
]
svr_model = Pipeline(steps=svr_steps)
svr_model.fit(X_train, y_train)

# Prediction for the single probe row.
predicted_svr = svr_model.predict(test_data)
print(f'Predicted DataValue_dia01 using SVR: {predicted_svr[0]}')


Predicted DataValue_dia01 using SVR: 10.88714132358064


# Decision Tree Regression

In [98]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree (fixed seed) behind the shared preprocessing step.
tree_regressor = DecisionTreeRegressor(random_state=0)
tree_model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', tree_regressor)])
tree_model.fit(X_train, y_train)

# Prediction for the single probe row.
predicted_tree = tree_model.predict(test_data)
print(f'Predicted DataValue_dia01 using Decision Tree: {predicted_tree[0]}')


Predicted DataValue_dia01 using Decision Tree: 14.0


# Evaluation with R-Squared (computed on the training data)

In [99]:
# R^2 of the linear-regression pipeline, scored on the same rows it was fitted on.
r2_score_lr = lr_model.score(X_train, y_train)
# The label names the model actually scored here (lr_model, i.e. linear regression).
print("R^2 Score for Linear Regression:", r2_score_lr)

R^2 Score for Random Forest: 0.9835831924634394


In [100]:
# R^2 of the random-forest pipeline, scored on the rows it was fitted on.
r2_score_rf = rf_model.score(X=X_train, y=y_train)
print("R^2 Score for Random Forest:", r2_score_rf)

R^2 Score for Random Forest: 0.922726026507687


In [101]:
# R^2 of the ridge pipeline, scored on the rows it was fitted on.
r2_score_ridge = ridge_model.score(X=X_train, y=y_train)
print("R^2 Score for Ridge Regression:", r2_score_ridge)

R^2 Score for Ridge Regression: 0.9148073098173102


In [102]:
# R^2 of the lasso pipeline, scored on the rows it was fitted on.
r2_score_lasso = lasso_model.score(X=X_train, y=y_train)
print("R^2 Score for Lasso Regression:", r2_score_lasso)

R^2 Score for Lasso Regression: 0.2311490719181214


In [103]:
# R^2 of the SVR pipeline, scored on the rows it was fitted on.
r2_score_svr = svr_model.score(X=X_train, y=y_train)
print("R^2 Score for Support Vector Regression:", r2_score_svr)

R^2 Score for Support Vector Regression: 0.003719411814264384


In [104]:
# R^2 of the decision-tree pipeline, scored on the rows it was fitted on.
r2_score_tree = tree_model.score(X=X_train, y=y_train)
print("R^2 Score for Decision Tree Regression:", r2_score_tree)

R^2 Score for Decision Tree Regression: 1.0


# Evaluation with MSE

In [105]:
# Hold out four rows (selected by position) as a small test set and keep the
# remaining rows for fitting.
# `X` / `y` are not defined anywhere in this notebook, so the split is taken
# from the feature frame and target built for the models above.
# NOTE(review): assumes the lost `X` / `y` were the same as `X_train` / `y_train` - confirm.
test_indices = [0, 20, 40, 60]
X_test2 = X_train.iloc[test_indices]
y_test2 = y_train.iloc[test_indices]

# Drop by the index labels found at those positions, so the rows removed here
# are exactly the rows selected above even if the index is not 0..n-1.
held_out_labels = X_train.index[test_indices]
X_train2 = X_train.drop(index=held_out_labels)
y_train2 = y_train.drop(index=held_out_labels)


## (1)Linear Regression

In [106]:
# Linear regression refit on the reduced training split, then scored by MSE on
# the held-out rows.
# The preprocessing step is cloned: a Pipeline fits its steps in place, so
# reusing the shared `preprocessor` object would refit the encoder that the
# pipelines fitted on the full data still point to.
lr_model2 = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', LinearRegression())
])
lr_model2.fit(X_train2, y_train2)
y_pred_linear2 = lr_model2.predict(X_test2)
mse_linear = mean_squared_error(y_test2, y_pred_linear2)
print(f"Mean Squared Error for Linear Regression: {mse_linear}")

Mean Squared Error for Linear Regression: 0.5154938336771008


## (2)Random Forest

In [107]:
# Random forest refit on the reduced training split, then scored by MSE on the
# held-out rows.
# The preprocessing step is cloned: a Pipeline fits its steps in place, so
# reusing the shared `preprocessor` object would refit the encoder that the
# pipelines fitted on the full data still point to.
rf_model2 = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=0))
])
rf_model2.fit(X_train2, y_train2)
y_pred_rf2 = rf_model2.predict(X_test2)
mse_rf = mean_squared_error(y_test2, y_pred_rf2)
print(f"Mean Squared Error for Random Forest: {mse_rf}")

Mean Squared Error for Random Forest: 3.2372675000000144


## (3)Ridge Regression

In [108]:
# Ridge regression refit on the reduced training split, then scored by MSE on
# the held-out rows.
# The preprocessing step is cloned: a Pipeline fits its steps in place, so
# reusing the shared `preprocessor` object would refit the encoder that the
# pipelines fitted on the full data still point to.
ridge_model2 = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', Ridge(alpha=1.0))
])
ridge_model2.fit(X_train2, y_train2)
y_pred_ridge2 = ridge_model2.predict(X_test2)
mse_ridge = mean_squared_error(y_test2, y_pred_ridge2)
print(f"Mean Squared Error for Ridge Regression: {mse_ridge}")

Mean Squared Error for Ridge Regression: 1.1676445861914055


## (4)Lasso Regression

In [109]:
# Lasso regression refit on the reduced training split, then scored by MSE on
# the held-out rows.
# The preprocessing step is cloned: a Pipeline fits its steps in place, so
# reusing the shared `preprocessor` object would refit the encoder that the
# pipelines fitted on the full data still point to.
lasso_model2 = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', Lasso(alpha=0.1))
])
lasso_model2.fit(X_train2, y_train2)
y_pred_lasso2 = lasso_model2.predict(X_test2)
mse_lasso = mean_squared_error(y_test2, y_pred_lasso2)
print(f"Mean Squared Error for Lasso Regression: {mse_lasso}")

Mean Squared Error for Lasso Regression: 4.613816752929684


## (5)Support Vector Regression

In [110]:
# RBF support vector regression refit on the reduced training split, then
# scored by MSE on the held-out rows.
# The preprocessing step is cloned: a Pipeline fits its steps in place, so
# reusing the shared `preprocessor` object would refit the encoder that the
# pipelines fitted on the full data still point to.
svr_model2 = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', SVR(kernel='rbf'))
])
svr_model2.fit(X_train2, y_train2)
y_pred_svr2 = svr_model2.predict(X_test2)
mse_svr = mean_squared_error(y_test2, y_pred_svr2)
print(f"Mean Squared Error for Support Vector Regression: {mse_svr}")

Mean Squared Error for Support Vector Regression: 4.907649816155945


## (6)Decision Tree

In [111]:
# Decision tree refit on the reduced training split, then scored by MSE on the
# held-out rows.
# The preprocessing step is cloned: a Pipeline fits its steps in place, so
# reusing the shared `preprocessor` object would refit the encoder that the
# pipelines fitted on the full data still point to.
tree_model2 = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', DecisionTreeRegressor(random_state=0))
])
tree_model2.fit(X_train2, y_train2)
y_pred_tree2 = tree_model2.predict(X_test2)
mse_tree = mean_squared_error(y_test2, y_pred_tree2)
print(f"Mean Squared Error for Decision Tree: {mse_tree}")

Mean Squared Error for Decision Tree: 7.452499999999999


# Model comparison

In [112]:
import plotly.graph_objects as go

In [113]:
# Map each model's display name to its R^2 score, in plotting order.
r2_scores = dict(zip(
    ['Linear Regression', 'Random Forest', 'Ridge', 'Lasso', 'SVR', 'Decision Tree'],
    [r2_score_lr, r2_score_rf, r2_score_ridge, r2_score_lasso, r2_score_svr, r2_score_tree],
))


In [115]:

# Bar chart of the R^2 score per model, one bar per entry of r2_scores.
r2_model_names = list(r2_scores)
r2_values = [r2_scores[name] for name in r2_model_names]

fig = go.Figure(data=[go.Bar(x=r2_model_names, y=r2_values, marker_color='pink')])
fig.update_layout(
    title='Comparison of R^2 Scores Across Different Regression Models',
    xaxis_title='Model Type',
    yaxis_title='R^2 Score',
    template='plotly_white',  # swap for another plotly template if desired
)

fig.show()


In [116]:
# Map each model's display name to its held-out MSE, in plotting order.
mse_scores = dict(zip(
    ['Linear Regression', 'Random Forest', 'Ridge', 'Lasso', 'SVR', 'Decision Tree'],
    [mse_linear, mse_rf, mse_ridge, mse_lasso, mse_svr, mse_tree],
))

In [117]:
# Bar chart of the MSE per model, one bar per entry of mse_scores.
mse_model_names = list(mse_scores)
mse_values = [mse_scores[name] for name in mse_model_names]

fig = go.Figure(data=[go.Bar(x=mse_model_names, y=mse_values, marker_color='lightblue')])
fig.update_layout(
    title='Comparison of MSE Across Different Regression Models',
    xaxis_title='Model Type',
    yaxis_title='MSE',
    template='plotly_white',  # swap for another plotly template if desired
)

fig.show()