In [2]:
import pandas as pd
import numpy as np

## Correlation matrix

In [3]:
def select_indicator_for_year(frame, question_id, value_type_id, year=2019):
    """Return the 'OVERALL'-stratum rows of one indicator for a single year.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing the columns ``QuestionID``,
        ``StratificationCategoryID1``, ``DataValueTypeID``, ``YearStart``,
        ``LocationAbbr`` and ``DataValue``.
    question_id : str
        Value to match in ``QuestionID`` (e.g. ``'DIA01'``).
    value_type_id : str
        Value to match in ``DataValueTypeID`` (e.g. ``'CRDPREV'``).
    year : int, default 2019
        Value to match in ``YearStart``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, restricted to ``LocationAbbr`` and ``DataValue``.
        The original row index is preserved.
    """
    keep = (
        (frame['QuestionID'] == question_id)
        & (frame['StratificationCategoryID1'] == 'OVERALL')
        & (frame['DataValueTypeID'] == value_type_id)
        & (frame['YearStart'] == year)
    )
    # A single .loc call selects rows and columns together instead of
    # chaining two indexing operations.
    return frame.loc[keep, ['LocationAbbr', 'DataValue']]


cleaned_data = pd.read_csv('cleaned_data.csv')

# DIA01/DIA02 are selected on the CRDPREV value type, DIA03/DIA04 on CRDRATE.
dia01 = select_indicator_for_year(cleaned_data, 'DIA01', 'CRDPREV')
dia02 = select_indicator_for_year(cleaned_data, 'DIA02', 'CRDPREV')
dia03 = select_indicator_for_year(cleaned_data, 'DIA03', 'CRDRATE')
dia04 = select_indicator_for_year(cleaned_data, 'DIA04', 'CRDRATE')



In [4]:
# Inner-join the four per-indicator tables on location.
# Suffixes are only applied when column names clash: after the first merge the
# frame holds DataValue_dia01/DataValue_dia02, so dia03's 'DataValue' joins
# without a clash and keeps its plain name until the last merge, where it
# clashes with dia04's 'DataValue' and the ('_dia03', '_dia04') suffixes apply.
final_df = (
    dia01.merge(dia02, on='LocationAbbr', suffixes=('_dia01', '_dia02'))
    .merge(dia03, on='LocationAbbr')
    .merge(dia04, on='LocationAbbr', suffixes=('_dia03', '_dia04'))
)

# Pairwise correlation between the four indicator value columns.
value_columns = ['DataValue_dia01', 'DataValue_dia02', 'DataValue_dia03', 'DataValue_dia04']
correlation_matrix = final_df[value_columns].corr()

# Last expression of the cell: display the matrix as a plain array.
np.array(correlation_matrix)

array([[1.        , 0.09190151, 0.35802157, 0.36094717],
       [0.09190151, 1.        , 0.16244198, 0.0399327 ],
       [0.35802157, 0.16244198, 1.        , 0.51685044],
       [0.36094717, 0.0399327 , 0.51685044, 1.        ]])

In [6]:
import numpy as np
import plotly.graph_objects as go

# Draw the correlation matrix as a heatmap.
heatmap_trace = go.Heatmap(z=np.array(correlation_matrix), colorscale='reds')
fig = go.Figure(data=heatmap_trace)

# Both axes use the same configuration: tick positions 0..3 are the
# row/column indices of the matrix, relabelled with the indicator IDs.
axis_settings = dict(
    title='Variables',
    tickmode='array',
    tickvals=[0, 1, 2, 3],
    ticktext=['DIA01', 'DIA02', 'DIA03', 'DIA04'],
)

fig.update_layout(
    title='Correlation Matrix',
    xaxis=axis_settings,
    yaxis=dict(axis_settings),  # separate copy so the axes do not share one dict
    height=500,  # figure height in pixels
    width=500,   # figure width in pixels
)

# Render the figure
fig.show()

## Machine learning

In [17]:
def select_indicator_all_years(frame, question_id, value_type_id):
    """Return the 'OVERALL'-stratum rows of one indicator across all years.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing the columns ``QuestionID``,
        ``StratificationCategoryID1``, ``DataValueTypeID``, ``LocationAbbr``,
        ``DataValue`` and ``YearStart``.
    question_id : str
        Value to match in ``QuestionID`` (e.g. ``'DIA01'``).
    value_type_id : str
        Value to match in ``DataValueTypeID`` (e.g. ``'CRDPREV'``).

    Returns
    -------
    pandas.DataFrame
        The matching rows, restricted to ``LocationAbbr``, ``DataValue`` and
        ``YearStart``. The original row index is preserved.
    """
    keep = (
        (frame['QuestionID'] == question_id)
        & (frame['StratificationCategoryID1'] == 'OVERALL')
        & (frame['DataValueTypeID'] == value_type_id)
    )
    # A single .loc call selects rows and columns together instead of
    # chaining two indexing operations.
    return frame.loc[keep, ['LocationAbbr', 'DataValue', 'YearStart']]


# Re-read the source table so this section can be run on its own.
cleaned_data = pd.read_csv('cleaned_data.csv')

# NOTE: these rebind dia01..dia04 from the correlation section; here the
# frames are not restricted to one year and carry YearStart as a column.
# DIA01/DIA02 are selected on the CRDPREV value type, DIA03/DIA04 on CRDRATE.
dia01 = select_indicator_all_years(cleaned_data, 'DIA01', 'CRDPREV')
dia02 = select_indicator_all_years(cleaned_data, 'DIA02', 'CRDPREV')
dia03 = select_indicator_all_years(cleaned_data, 'DIA03', 'CRDRATE')
dia04 = select_indicator_all_years(cleaned_data, 'DIA04', 'CRDRATE')



In [18]:
# Inner-join the four indicator tables on (location, year) so that each row
# holds all four indicator values for one location in one year.
# Suffixes only apply to clashing column names: dia03's 'DataValue' joins
# without a clash and is renamed to DataValue_dia03 only by the final merge,
# where it clashes with dia04's 'DataValue'.
join_keys = ['LocationAbbr', 'YearStart']
merged_df = (
    dia01.merge(dia02, on=join_keys, suffixes=('_dia01', '_dia02'))
    .merge(dia03, on=join_keys)
    .merge(dia04, on=join_keys, suffixes=('_dia03', '_dia04'))
)


### Model training

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, cross_val_score

# Features: every merged column except the regression target 'DataValue_dia01'
x_train = merged_df.drop(['DataValue_dia01'], axis=1)

# Define columns to be one-hot encoded
columns_to_encode = ['LocationAbbr']

# One-hot encode the location column; all other columns pass through unchanged
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), columns_to_encode)], remainder='passthrough')
x_train_encoded = ct.fit_transform(x_train)


# Fit a linear model on all available rows
model = LinearRegression()
model.fit(x_train_encoded, merged_df['DataValue_dia01'])  # Using 'DataValue_dia01' as y_train for the model

# In-sample R^2: scored on the same rows the model was fitted on, so it is an
# optimistic estimate (each location gets its own dummy coefficient).
r2_score = model.score(x_train_encoded, merged_df['DataValue_dia01'])
print("R^2 Score:", r2_score)

# Held-out estimate: shuffled K-fold cross-validation of the same model on the
# same encoded features. The fold count is capped by the row count so that
# very small tables do not raise; the fixed random_state keeps it reproducible.
n_splits = min(5, len(merged_df))
if n_splits >= 2:
    cv_r2_scores = cross_val_score(
        LinearRegression(),
        x_train_encoded,
        merged_df['DataValue_dia01'],
        cv=KFold(n_splits=n_splits, shuffle=True, random_state=0),
        scoring='r2',
    )
    print("Cross-validated R^2 (mean over", n_splits, "folds):", cv_r2_scores.mean())

R^2 Score: 0.9835831924634222


### Model testing

In [19]:
# One hand-written observation to score with the fitted model. The columns are
# the feature columns the transformer receives at fit time (all merged columns
# except the target) -- presumably in the same order; verify against x_train.
sample_row = {
    'LocationAbbr': 'AL',
    'YearStart': 2019,
    'DataValue_dia02': 20,
    'DataValue_dia03': 70,
    'DataValue_dia04': 2,
}
test_data = pd.DataFrame([sample_row])

# Apply the already-fitted encoder (transform only, no refit), then predict.
test_data_encoded = ct.transform(test_data)
prediction = model.predict(test_data_encoded)

# predict() returns one value per input row; show the single prediction.
print("Prediction:", prediction[0])

Prediction: 15.276270126864688


## Line chart

In [6]:
# import plotly.graph_objs as go
# import plotly.io as pio

# # Create a scatter trace
# trace = go.Scatter(
#     x=mortality_AL['YearStart'],
#     y=mortality_AL['DataValue'],
#     mode='lines+markers',
#     name='Mortality Rate for AL'
# )

# # Create the figure
# fig = go.Figure(data=[trace])

# # Update layout
# fig.update_layout(
#     title='Mortality Rate for Alabama (2019-2021)',
#     xaxis_title='Year',
#     yaxis_title='DataValue',
#     xaxis=dict(
#         tickmode='array',
#         tickvals=[2019, 2020, 2021]
#     )    
# )

# # Show the plot
# pio.show(fig)

In [48]:
# import plotly.graph_objs as go
# import plotly.io as pio

# # Create a bar plot
# fig = go.Figure(data=[go.Bar(
#     x=npw_data['LocationAbbr'],  # LocationAbbr on x-axis
#     y=npw_data['DataValue'],     # DataValue on y-axis
#     marker_color='skyblue'       # Set color of the bars
# )])

# # Update layout
# fig.update_layout(
#     title='Diabetes among adults',
#     xaxis_title='Location',
#     yaxis_title='Percentage',
#     xaxis=dict(tickangle=45)     # Rotate x-axis labels for better readability
# )

# # Show the plot
# pio.show(fig)