In [1]:
import csv
import datetime
import numpy as np

# Read every row of the CSV into memory. The context manager closes the file
# handle as soon as the rows are materialised, and newline="" lets the csv
# module handle line endings itself, as its documentation recommends.
with open("guns.csv", "r", newline="") as f:
    data = list(csv.reader(f))
print(data[:5])

[['', 'year', 'month', 'intent', 'police', 'sex', 'age', 'race', 'hispanic', 'place', 'education'], ['1', '2012', '01', 'Suicide', '0', 'M', '34', 'Asian/Pacific Islander', '100', 'Home', '4'], ['2', '2012', '01', 'Suicide', '0', 'F', '21', 'White', '100', 'Street', '3'], ['3', '2012', '01', 'Suicide', '0', 'M', '60', 'White', '100', 'Other specified', '4'], ['4', '2012', '02', 'Suicide', '0', 'M', '64', 'White', '100', 'Home', '4']]


In [2]:
# An apparently overly specialized function. Will split a dictionary based
# on given bounds. Would be more useful if it calculated a representative
# split and output the hi, lo or middle depending on user input.
def value_bound(data, up_bound = None, lo_bound = None, mid_return = False):
    """Split a dictionary of numeric values around one or two bounds.

    Parameters
    ----------
    data : dict
        Mapping of key -> numeric value.
    up_bound : number, optional
        Entries whose value is strictly greater than this count as "outside".
    lo_bound : number, optional
        Entries whose value is strictly less than this count as "outside".
    mid_return : bool, optional
        When true, return the entries that are NOT outside either bound
        instead of the ones that are.

    Returns
    -------
    dict
        The outside entries (or the remaining ones if ``mid_return``) when
        at least one bound is given, or when ``data`` is empty.
    list or str
        When no bound is given and ``data`` is non-empty, the user is
        prompted: answering ``1`` returns the list of values above the mean,
        ``2`` the list of values below it, and anything else returns the
        string ``"Invalid input."``.
    """
    if up_bound is None and lo_bound is None and len(data) > 0:
        no_bound_vals = list(data.values())
        print("No bounds given. Splitting about the mean... \n")
        mean = np.mean(no_bound_vals)

        hi_lo = input("Input 1 to return vals > mean. Input 2 to return vals < mean...\n>")
        # input() returns text, so compare against the string forms; the
        # previous comparison with the integers 1 and 2 could never match.
        choice = str(hi_lo).strip()
        if choice == "1":
            return [x for x in no_bound_vals if x > mean]
        if choice == "2":
            return [x for x in no_bound_vals if x < mean]
        return "Invalid input."

    vals = {}
    mid_vals = {}
    for key, value in data.items():
        above = up_bound is not None and value > up_bound
        below = lo_bound is not None and value < lo_bound
        if above or below:
            vals[key] = value
        else:
            mid_vals[key] = value

    if mid_return:
        return mid_vals
    return vals


In [3]:
# A formatting function, used to output the mean, variance, and standard
# deviation of a given dictionary of values. 
def summary(data, output = True, title = None, data_print = False):
    """Print the mean, variance and standard deviation of a dict's values.

    Parameters
    ----------
    data : dict
        Mapping of key -> numeric value. ``datetime.datetime`` keys are
        shown as abbreviated month plus year (``"%b %Y"``) when the dataset
        is printed; other keys are shown unchanged.
    output : bool, optional
        Print the statistics line when equal to ``True``.
    title : str, optional
        Heading printed before anything else when not ``None``.
    data_print : bool, optional
        Also print the (re-keyed) dataset when equal to ``True``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Re-key for display only; the statistics use the values as given.
    explicit_count = {
        (key.strftime("%b %Y") if isinstance(key, datetime.datetime) else key): value
        for key, value in data.items()
    }
    values = list(data.values())

    packet = dict(avg=np.mean(values), var=np.var(values), std=np.std(values))

    if title is not None:
        print(title)

    if output == True:
        report = "\n\tAvg: {avg:.2f} \n\tVar: {var:.2f} \n\tStd. Dev: {std:.2f} \n"
        print(report.format(**packet))

    if data_print == True:
        print("Dataset: \n", explicit_count, "\n\n")

In [4]:
# A function to generate dictionary counts from a given list
def counts(data, i = None):
    """Count how often each distinct item occurs.

    Parameters
    ----------
    data : iterable or dict
        Either an iterable of hashable items, or a dict whose values are
        indexable rows.
    i : optional
        Only used when ``data`` is a dict: the index read from every row
        to obtain the item that is counted.

    Returns
    -------
    dict
        Mapping of item -> number of occurrences.
    """
    if isinstance(data, dict):
        # Reduce the dict of rows to the single column being counted.
        data = [row[i] for row in data.values()]

    col_count = {}
    for item in data:
        col_count[item] = col_count.get(item, 0) + 1

    return col_count

In [5]:
# data_record is a list, 
# col is the value of the column in the list you want to consider
# key_value is the category you are isolating
# If excl_set is set to True, the method will return the excluded set
def category_split(data_record, col, key_value, inner_col = None, excl_set=False):
    """Select the rows whose ``col`` entry matches (or does not match) a value.

    Parameters
    ----------
    data_record : iterable of rows
        Each row must be indexable by ``col``.
    col : index
        Position of the column that is compared with ``key_value``.
    key_value :
        The category being isolated.
    inner_col : index, optional
        When given, the selected rows are additionally tallied on this
        column with ``counts``.
    excl_set : bool, optional
        When equal to ``True``, keep the rows that do NOT match
        ``key_value`` instead of the ones that do.

    Returns
    -------
    dict
        Selected rows keyed 0, 1, 2, ... in encounter order, when
        ``inner_col`` is ``None``.
    tuple of (dict, dict)
        The selected rows and the ``inner_col`` tally, when ``inner_col``
        is given.
    """
    if excl_set == True:
        selected = [row for row in data_record if row[col] != key_value]
    else:
        selected = [row for row in data_record if row[col] == key_value]

    cat_split = dict(enumerate(selected))

    if inner_col is None:
        return cat_split

    return cat_split, counts(cat_split, i = inner_col)

In [6]:
# Removing the header row from the dataset and storing it for reference.
# list.pop mutates `data` in place, so only pop while the first row still
# carries the 'year' column label: re-running this cell must not silently
# discard a real record.
if data and data[0][1:2] == ['year']:
    headers = data.pop(0)
print(headers)
print(data[:5])

['', 'year', 'month', 'intent', 'police', 'sex', 'age', 'race', 'hispanic', 'place', 'education']
[['1', '2012', '01', 'Suicide', '0', 'M', '34', 'Asian/Pacific Islander', '100', 'Home', '4'], ['2', '2012', '01', 'Suicide', '0', 'F', '21', 'White', '100', 'Street', '3'], ['3', '2012', '01', 'Suicide', '0', 'M', '60', 'White', '100', 'Other specified', '4'], ['4', '2012', '02', 'Suicide', '0', 'M', '64', 'White', '100', 'Home', '4'], ['5', '2012', '02', 'Suicide', '0', 'M', '31', 'White', '100', 'Other specified', '2']]


In [7]:
# Column 1 holds the year (as a string); tally the records per year.
years = [record[1] for record in data]
year_counts = counts(years)
print(year_counts)

{'2014': 33599, '2013': 33636, '2012': 33563}


In [8]:
# Columns 1 and 2 hold year and month as strings. Pinning every record to the
# first day of its month gives all records of one month the same datetime.
dates = [
    datetime.datetime(int(record[1]), int(record[2]), 1)
    for record in data
]
print(dates[:5])

[datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 2, 1, 0, 0), datetime.datetime(2012, 2, 1, 0, 0)]


In [9]:
# Tally the records per month: date_counts maps each first-of-month datetime
# to the number of records that fall in that month.
date_counts = counts(dates)

In [10]:
# Bucket the monthly counts with hand-picked thresholds:
#   hi_vals  - months with more than 2900 records
#   lo_vals  - months with fewer than 2750 records
#   mid_vals - everything in between (2750 to 2900 inclusive)
hi_vals = value_bound(date_counts, up_bound=2900)
lo_vals = value_bound(date_counts, lo_bound=2750)
mid_vals = value_bound(date_counts, up_bound=2900, lo_bound=2750, mid_return=True)

In [11]:
# Print mean / variance / standard deviation plus the raw months for each bucket.
summary(hi_vals, title = "Hi Count Summary", data_print = True)
summary(lo_vals, title = "Lo Count Summary", data_print = True)
summary(mid_vals, title = "Mid Count Summary", data_print = True)

Hi Count Summary

	Avg: 2974.12 
	Var: 2891.86 
	Std. Dev: 53.78 

Dataset: 
 {'Jul 2012': 3026, 'Aug 2014': 2970, 'Jun 2014': 2931, 'Jul 2013': 3079, 'Jun 2013': 2920, 'May 2012': 2999, 'Aug 2012': 2954, 'Sep 2014': 2914} 


Lo Count Summary

	Avg: 2597.22 
	Var: 27925.06 
	Std. Dev: 167.11 

Dataset: 
 {'Jan 2014': 2651, 'Sep 2013': 2742, 'Feb 2012': 2357, 'Nov 2012': 2729, 'Mar 2012': 2743, 'Mar 2014': 2684, 'Feb 2014': 2361, 'Oct 2012': 2733, 'Feb 2013': 2375} 


Mid Count Summary

	Avg: 2822.63 
	Var: 1812.55 
	Std. Dev: 42.57 

Dataset: 
 {'Jul 2014': 2884, 'Oct 2013': 2808, 'May 2014': 2864, 'Aug 2013': 2859, 'Apr 2012': 2795, 'Jan 2012': 2758, 'Nov 2014': 2756, 'Jan 2013': 2864, 'Apr 2014': 2862, 'Dec 2012': 2791, 'Oct 2014': 2865, 'May 2013': 2806, 'Dec 2014': 2857, 'Apr 2013': 2798, 'Mar 2013': 2862, 'Nov 2013': 2758, 'Dec 2013': 2765, 'Sep 2012': 2852, 'Jun 2012': 2826} 




## My self-imposed lower-bound seemed ill defined ##
I believed this because the standard deviation for the low count is far higher than that of the middle and upper bounds. Looking more closely, I realized that the three outliers were in February of each year, the only months with fewer than 2400 gun deaths. The next fewest gun deaths occurred in Jan. & Mar. 2014, at 2651 and 2684 respectively.

I decided to compare the total data with and without February.

In [1]:
# Requires value_bound, summary and date_counts from the cells above; run
# those first on a fresh kernel.
# value_bound(up_bound=X) keeps the months whose count is greater than X.
# Every month shown above has more than 2300 records, so the first call keeps
# all months; only the three Februaries fall below 2400, so the second call
# keeps everything except February.
with_feb = value_bound(date_counts, up_bound = 2300)
summary(with_feb, title = "Gun Deaths 2012 thru 2014")
without_feb = value_bound(date_counts, up_bound=2400)
summary(without_feb, title = "Gun Deaths Less February")

NameError: name 'value_bound' is not defined

## Excluding February brings average gun deaths per month up by 40 ##
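This tells us that February is a month where gun deaths drop drastically. Part of the drop is expected simply because February has only 28 or 29 days, so a per-day comparison would be a fairer test.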

In [13]:
# Column 7 holds the race category; tally the records per category.
races = [record[7] for record in data]
race_counts = counts(races)
print(race_counts)

{'Hispanic': 9022, 'White': 66237, 'Native American/Native Alaskan': 917, 'Black': 23296, 'Asian/Pacific Islander': 1326}


In [14]:
# Column 5 holds the sex code ('M' / 'F'); tally the records per code.
sexes = [record[5] for record in data]
sex_counts = counts(sexes)
print(sex_counts)

{'M': 86349, 'F': 14449}


## A breakdown of gun deaths by race shows that White victims drastically outnumber every other category ##
Without comparison to the percentage of the population in each racial category, the number of victims doesn't necessarily imply anything surprising. The gender breakdown shows a much greater number of males suffering gun deaths than females. Since slightly more than half of the U.S. population is female, this gap is far larger than population share alone would explain.

I was curious whether there were any other notable outliers in February compared to other data values.

In [15]:
# Split the records on the race column (index 7): one dict of rows per race
# category, plus the complement of 'Black' (excl_set = True).
black_split = category_split(data, 7, 'Black')
non_black_split = category_split(data, 7, 'Black', excl_set = True)
white_split = category_split(data, 7, 'White')
native_am_split = category_split(data, 7, 'Native American/Native Alaskan')
hispanic_split = category_split(data, 7, 'Hispanic')
asian_split = category_split(data, 7, 'Asian/Pacific Islander')

# Same per-race splits again, this time also tallying the intent column
# (index 3) within each race; each call returns (rows, intent tally).
as_split, as_intent = category_split(data, 7, 'Asian/Pacific Islander', inner_col = 3)
na_split, na_intent = category_split(data, 7, 'Native American/Native Alaskan', inner_col = 3)
bl_split, bl_intent =  category_split(data, 7, 'Black', inner_col = 3)
wh_split, wh_intent =  category_split(data, 7, 'White', inner_col = 3)
hs_split, hs_intent = category_split(data, 7, 'Hispanic', inner_col = 3)

# Leftover debugging output, kept disabled:
#print(black_intent_counts)
#print(black_intent_counts_2)
#print("Asian Split: \n", as_split, "\nAsian Intent: \n", as_intent)
#summary(as_intent, data_print = True)
#print("Black Intent: \n", bl_intent)
#summary(bl_intent)
#print("White Intent: \n", wh_intent)
#summary(wh_intent)
#print("Hispanic Intent: \n", hs_intent)
#summary(hs_intent)
#print("Native American Intent: \n", na_intent)
#summary(na_intent)

In [16]:
# Sanity check: the 'Black' split and its complement should add up to the
# full dataset, and both ways of building the 'Black' split should agree.
print(len(black_split))
print(len(non_black_split))
print(len(data))
print(len(bl_split))
print(bl_intent)

23296
77502
100798
23296
{'Suicide': 3332, 'Homicide': 19510, 'Accidental': 328, 'Undetermined': 126}


In [17]:
# Read the census table into memory. The context manager closes the file
# handle once the rows are materialised, and newline="" lets the csv module
# handle line endings itself, as its documentation recommends.
with open("census.csv", "r", newline="") as f2:
    census = list(csv.reader(f2))

In [18]:
# Display the census table: one header row followed by one data row.
census

[['Id',
  'Year',
  'Id',
  'Sex',
  'Id',
  'Hispanic Origin',
  'Id',
  'Id2',
  'Geography',
  'Total',
  'Race Alone - White',
  'Race Alone - Hispanic',
  'Race Alone - Black or African American',
  'Race Alone - American Indian and Alaska Native',
  'Race Alone - Asian',
  'Race Alone - Native Hawaiian and Other Pacific Islander',
  'Two or More Races'],
 ['cen42010',
  'April 1, 2010 Census',
  'totsex',
  'Both Sexes',
  'tothisp',
  'Total',
  '0100000US',
  '',
  'United States',
  '308745538',
  '197318956',
  '44618105',
  '40250635',
  '3739506',
  '15159516',
  '674625',
  '6984195']]

In [19]:
# Recall the per-race record counts to line them up with the census columns.
race_counts

{'Asian/Pacific Islander': 1326,
 'Black': 23296,
 'Hispanic': 9022,
 'Native American/Native Alaskan': 917,
 'White': 66237}

In [20]:
# Columns 10-15 of the census data row hold the population totals in this
# order: White, Hispanic, Black, American Indian and Alaska Native, Asian,
# Native Hawaiian and Other Pacific Islander.
race_totals = [int(total) for total in census[1][10:16]]
print(race_totals)

# Map each race label used in the gun data to its census population. The gun
# data has a single 'Asian/Pacific Islander' category, so the two matching
# census columns are summed.
mapping = {
    "White": race_totals[0],
    "Black": race_totals[2],
    "Hispanic": race_totals[1],
    "Native American/Native Alaskan": race_totals[3],
    "Asian/Pacific Islander": race_totals[4] + race_totals[5],
}

[197318956, 44618105, 40250635, 3739506, 15159516, 674625]


In [21]:
# Will hold, per race, the number of gun-death records per 100,000 people.
race_per_hundredk = {}

In [22]:
# Rate per 100,000 = records for the race / census population * 100000.
for race, value in race_counts.items():
    race_per_hundredk[race] = float(value) / mapping[race] * 100000

In [23]:
# Display the per-race rates per 100,000 people.
race_per_hundredk

{'Asian/Pacific Islander': 8.374309664161762,
 'Black': 57.8773477735196,
 'Hispanic': 20.220491210910907,
 'Native American/Native Alaskan': 24.521955573811088,
 'White': 33.56849303419181}

In [24]:
# Column 3 holds the intent category of each record.
intents = [record[3] for record in data]

In [25]:
# Column 7 holds the race category of each record.
races = [record[7] for record in data]

In [26]:
# Spot-check one entry of the extracted race column.
races[1]

'White'

In [27]:
# Count the homicide records per race. At this point the dictionary holds raw
# counts; a later cell converts them to rates per 100,000 in place.
homicide_rate_per_hundredk = {}

# races and intents are parallel lists built from the same rows, so walk them
# together instead of indexing one by the position in the other.
for race, intent in zip(races, intents):
    if intent == 'Homicide':
        homicide_rate_per_hundredk[race] = homicide_rate_per_hundredk.get(race, 0) + 1

In [33]:
# Convert the homicide counts to rates per 100,000 people, overwriting the
# counts in the same dictionary.
# WARNING: not idempotent - running this cell twice divides by the population
# a second time. Re-run the counting cell first if this cell must be re-run.
for race, value in homicide_rate_per_hundredk.items():
    homicide_rate_per_hundredk[race] = float(value) / mapping[race] * 100000

In [31]:
# Column 6 holds the age of each record, still as a string.
ages = [record[6] for record in data]

In [34]:
# Display the per-race homicide rates per 100,000 people.
homicide_rate_per_hundredk

{'Asian/Pacific Islander': 3.530346230970155,
 'Black': 48.471284987180944,
 'Hispanic': 12.627161104219914,
 'Native American/Native Alaskan': 8.717729026240365,
 'White': 4.6356417981453335}

## The rate of gun homicide victims per hundred thousand is about 10 times higher for Blacks than for Whites (48.5 vs. 4.6). ##

They're 4 times greater for Blacks than Hispanics. Of interest is the average age of homicide victims, and the race of the perpetrator in homicide shootings. It would also be worthwhile to explore location beyond just at home or out. One assumes most black gun victims due to homicide were in inner city areas. It would be interesting to explore how intent in black shootings 