In [10]:
import pandas as pd
import altair as alt

# Per-incident mass shooting records (one row per incident, with a 'State' column).
mass_shootings_with_coordinates = pd.read_csv("data/corrected_filled_mass_shootings.csv")

# Per-state socioeconomic features. The file names its key column 'state';
# add a capitalised 'State' alias so it can be joined with the incident data.
state_features = pd.read_csv("data/state_features.csv").assign(
    State=lambda features: features['state']
)

In [11]:
# Count incidents per state across all years: one row per state with its total.
incidents_per_state = mass_shootings_with_coordinates.groupby('State').size()
total_mass_shootings = incidents_per_state.rename('Mass Shootings Count').reset_index()
total_mass_shootings.head()

Unnamed: 0,State,Mass Shootings Count
0,Alabama,114
1,Alaska,5
2,Arizona,54
3,Arkansas,54
4,California,419


In [12]:
# Attach the state-level features to the per-state counts.
# Inner join: states missing from either table are dropped.
merged_data = total_mass_shootings.merge(state_features, how='inner', on='State')
merged_data.head()

Unnamed: 0.1,State,Mass Shootings Count,Unnamed: 0,state,Unemployment Rate,Arrested Ratio,GDP per capita,Population Density,Poverty Rate,id
0,Alabama,114,5,Alabama,0.029,0.509,61846,39.0,0.146,1
1,Alaska,5,40,Alaska,0.046,0.4,95147,0.5,0.104,2
2,Arizona,54,24,Arizona,0.036,0.333,73203,25.0,0.124,4
3,Arkansas,54,15,Arkansas,0.033,0.556,60276,23.0,0.157,5
4,California,419,48,California,0.054,0.277,104916,97.0,0.117,6


In [13]:
# Features that can be plotted against the per-state mass shooting count.
target_features = ['Unemployment Rate', 'Arrested Ratio', 'GDP per capita', 'Population Density', 'Poverty Rate']

# create a dropdown selection
# The dropdown is bound to a point selection on the folded 'Selected Feature'
# field; the first feature is preselected.
dropdown_selection = alt.binding_select(options=target_features, name='Feature:')
feature_select = alt.selection_point(fields=['Selected Feature'], bind=dropdown_selection, value=target_features[0])

# Shared pipeline for both layers: fold the feature columns into long form
# ('Selected Feature', 'Value') and keep only the rows of the chosen feature.
selected_feature_base = alt.Chart(merged_data).transform_fold(
    target_features,
    as_=['Selected Feature', 'Value']
).transform_filter(
    feature_select
)

# One point per state. No colour encoding is used here: an encoding channel
# would override the mark-level colour, and with a single visible feature and
# no legend it carries no information.
scatter_plot = selected_feature_base.mark_point(filled=True, size=100, color='#7f92b2').encode(
    x=alt.X('Value:Q', title='Selected Feature'),
    y=alt.Y('Mass Shootings Count:Q', title='Number of Mass Shootings'),
    tooltip=['State:N', 'Selected Feature:N', 'Value:Q', 'Mass Shootings Count:Q']
).add_params(
    feature_select
)

# Regression line, fitted after the filter so it follows the dropdown.
trend_line = selected_feature_base.transform_regression(
    'Value', 'Mass Shootings Count'
).mark_line(color='#BA0C2F').encode(
    x='Value:Q',
    y='Mass Shootings Count:Q'
)

final_chart = (scatter_plot + trend_line).properties(
    width=600,
    height=400,
    title='Mass Shootings Count vs Selected Feature by State'
)

final_chart.show()


To remove the three most extreme outliers (ranked by z-score) for each feature and then redraw the regression trend line:

* Determine the standard for outliers: We can use the z-score method to identify outliers. This involves calculating each point's deviation from the mean in units of standard deviations. If the absolute z-score exceeds a certain threshold, that point is considered an outlier. Because we want to remove exactly three outliers, we do not fix a threshold; instead we select the three points with the highest absolute z-scores.
* Remove outliers for each feature: Calculate the absolute z-score of every state for each feature and remove the three points with the highest values.
* Redraw the scatter plot and trend line: After removing the outliers, recreate the scatter plot and calculate the regression trend line again.

In [14]:
import numpy as np
target_features = ['Unemployment Rate', 'Arrested Ratio', 'GDP per capita', 'Population Density', 'Poverty Rate']

# Number of most extreme points (by absolute z-score) dropped per feature.
N_OUTLIERS = 3

filtered_data_list = []
outliers_removed_list = []
for feature in target_features:
    feature_data = merged_data[['State', feature, 'Mass Shootings Count']].dropna()

    # compute z-score (absolute distance from the feature mean, in standard deviations)
    deviation = feature_data[feature] - feature_data[feature].mean()
    feature_data = feature_data.assign(
        z_score=(deviation / feature_data[feature].std()).abs(),
        **{'Selected Feature': feature},
    )
    # Keep the feature's own value in a common column so the concatenated
    # long-form tables show it regardless of which feature a row belongs to.
    feature_data['Value'] = feature_data[feature]

    # Rank once with a stable sort and split that single ranking. This makes
    # the removed rows and the kept rows exactly complementary even when
    # several states tie on z-score at the cut-off.
    ranked = feature_data.sort_values(by='z_score', ascending=False, kind='stable')

    outliers_removed = ranked.head(N_OUTLIERS)
    outliers_removed_list.append(outliers_removed)

    # Everything after the top N_OUTLIERS, returned in ascending z-score order.
    filtered_data = ranked.iloc[N_OUTLIERS:].sort_values(by='z_score', kind='stable')
    filtered_data_list.append(filtered_data)

outliers_removed_all_features = pd.concat(outliers_removed_list, ignore_index=True)

filtered_data_all_features = pd.concat(filtered_data_list, ignore_index=True)
print(outliers_removed_all_features)



                   State  Unemployment Rate  Mass Shootings Count   z_score  \
0                 Nevada              0.057                    36  2.205972   
1   District of Columbia              0.057                    76  2.205972   
2           South Dakota              0.019                     4  2.077769   
3                Wyoming                NaN                     1  2.950777   
4                Vermont                NaN                     1  2.950777   
5                 Hawaii                NaN                     1  2.950777   
6   District of Columbia                NaN                    76  6.148223   
7               New York                NaN                   206  1.141113   
8            Mississippi                NaN                   104  1.064771   
9   District of Columbia                NaN                    76  6.892942   
10            New Jersey                NaN                   105  0.542055   
11          Rhode Island                NaN         

In [15]:
def _selected_feature_value(row):
    """Return the value stored in the column named by the row's 'Selected Feature'."""
    return row[row['Selected Feature']]


# Each row only has a value in the column of its own feature (the others are
# NaN after concatenation); collect that value into a single 'Value' column.
selected_values = filtered_data_all_features.apply(_selected_feature_value, axis=1)
filtered_data_all_features['Value'] = selected_values

filtered_data_all_features['Value']

0      0.037
1      0.037
2      0.038
3      0.036
4      0.036
       ...  
235    0.149
236    0.067
237    0.152
238    0.156
239    0.157
Name: Value, Length: 240, dtype: float64

In [16]:
# Click a point to highlight it. With no selection every point counts as
# selected; `empty=True` is the Altair 5 spelling of the legacy `empty='all'`.
single_select = alt.selection_point(empty=True, on='click')

# Shared pipeline for both layers: the outlier-free long-form data restricted
# to the feature chosen in the dropdown (`feature_select`, defined earlier).
filtered_feature_base = alt.Chart(filtered_data_all_features).transform_filter(
    feature_select
)

# One point per state; points outside the click selection are greyed out.
scatter_plot2 = filtered_feature_base.mark_point(filled=True, size=130).encode(
    x=alt.X('Value:Q', title='Selected Feature'),
    y=alt.Y('Mass Shootings Count:Q', title='Number of Mass Shootings'),
    color=alt.condition(
        single_select,
        alt.value('#7f92b2'),
        alt.value('lightgray')
    ),
    tooltip=['State:N', 'Selected Feature:N', 'Value:Q', 'Mass Shootings Count:Q']
).add_params(
    feature_select,
    single_select
)

# Regression line, fitted after the filter so it follows the dropdown.
trend_line2 = filtered_feature_base.transform_regression(
    'Value', 'Mass Shootings Count'
).mark_line(color='#BA0C2F').encode(
    x='Value:Q',
    y='Mass Shootings Count:Q'
)


final_chart2 = (scatter_plot2 + trend_line2).properties(
    width=400,
    height=300,
    title='Mass Shootings Count vs Selected Feature by State (3 Outliers Removed)'
)
final_chart2

Now we test whether the relationship between each feature and a state's number of mass shootings is statistically significant.

In [17]:
import statsmodels.api as sm
import pandas as pd

# p-values below this threshold are labelled 'Significant'.
significance_threshold = 0.1

regression_results = []

# Fit one simple OLS regression per feature:
#   Mass Shootings Count ~ const + Value
for feature in target_features:
    is_feature = filtered_data_all_features['Selected Feature'] == feature
    feature_data = filtered_data_all_features.loc[is_feature, ['Value', 'Mass Shootings Count']].dropna()

    y = feature_data['Mass Shootings Count']
    # add_constant prepends the intercept column, so position 0 is the
    # intercept and position 1 is the slope of 'Value'.
    X = sm.add_constant(feature_data['Value'])

    model = sm.OLS(y, X).fit()

    intercept, coefficient = model.params.iloc[0], model.params.iloc[1]
    p_value = model.pvalues.iloc[1]

    if p_value < significance_threshold:
        significance = 'Significant'
    else:
        significance = 'Not Significant'

    regression_results.append(dict([
        ('Feature', feature),
        ('Coefficient', coefficient),
        ('Intercept', intercept),
        ('p-value', p_value),
        ('Significance', significance),
    ]))

regression_results_df = pd.DataFrame(regression_results)
print(regression_results_df)

              Feature  Coefficient   Intercept   p-value     Significance
0   Unemployment Rate  6410.992908 -142.019238  0.001134      Significant
1      Arrested Ratio  -236.553835  188.989437  0.040982      Significant
2      GDP per capita     0.001125   -0.817022  0.372587  Not Significant
3  Population Density     0.360094   70.390370  0.088629      Significant
4        Poverty Rate  1303.545943  -46.115261  0.046175      Significant


In [18]:
# Export the outlier-free interactive chart to an HTML file.
viz_output_path = 'Viz4.html'
final_chart2.save(viz_output_path)