In [1]:
import csv
import datetime
import math
import numpy as np

# Read the whole CSV into memory as a list of rows (each row is a list of
# strings, header row included). The context manager closes the file handle
# as soon as reading is finished; newline="" is what the csv module
# recommends for file objects handed to csv.reader.
with open("guns.csv", "r", newline="") as f:
    data = list(csv.reader(f))
print(data[:5])

[['', 'year', 'month', 'intent', 'police', 'sex', 'age', 'race', 'hispanic', 'place', 'education'], ['1', '2012', '01', 'Suicide', '0', 'M', '34', 'Asian/Pacific Islander', '100', 'Home', '4'], ['2', '2012', '01', 'Suicide', '0', 'F', '21', 'White', '100', 'Street', '3'], ['3', '2012', '01', 'Suicide', '0', 'M', '60', 'White', '100', 'Other specified', '4'], ['4', '2012', '02', 'Suicide', '0', 'M', '64', 'White', '100', 'Home', '4']]


In [2]:
# Tally how many times each distinct item occurs in a given list
def counts(data):
    """Return a dict mapping each distinct item of ``data`` to its frequency.

    ``data`` may be any iterable of hashable items. Keys are inserted in the
    order in which each item is first encountered.
    """
    tally = {}
    for entry in data:
        # .get() supplies 0 the first time an entry is seen
        tally[entry] = tally.get(entry, 0) + 1
    return tally

In [10]:
# An apparently overly specialized function. Will split a dictionary based
# on given bounds. Would be more useful if it calculated a representative
# split and output the hi, lo or middle depending on user input.
def value_bound(data, up_bound = None, lo_bound = None, mid_return = False):
    """Split a dict of numeric values on an upper and/or lower bound.

    Parameters
    ----------
    data : dict
        Mapping of key -> numeric value.
    up_bound : number, optional
        Entries whose value is strictly greater than this count as outliers.
    lo_bound : number, optional
        Entries whose value is strictly less than this count as outliers.
    mid_return : bool
        When true, return the non-outlier entries instead of the outliers.

    Returns
    -------
    dict
        When at least one bound is given (or ``data`` is empty): the outlier
        entries, or the remaining entries if ``mid_return`` is true.
    list or str
        When neither bound is given and ``data`` is non-empty, the user is
        prompted to pick a side of the mean. The result is a list of the
        values strictly above ("1") or strictly below ("2") the mean, or the
        string "Invalid input." for any other answer.
    """
    if up_bound is None and lo_bound is None and len(data) > 0:
        no_bound_vals = list(data.values())
        print("No bounds given. Splitting about the mean... \n")
        mean = np.mean(no_bound_vals)

        # input() always returns a str, so the answer has to be compared
        # against "1"/"2"; comparing against the ints 1/2 can never match.
        hi_lo = input("Input 1 to return vals > mean. Input 2 to return vals < mean...\n>").strip()
        if hi_lo == "1":
            return [x for x in no_bound_vals if x > mean]
        if hi_lo == "2":
            return [x for x in no_bound_vals if x < mean]
        return "Invalid input."

    vals = {}
    mid_vals = {}
    for key, value in data.items():
        above = up_bound is not None and value > up_bound
        below = lo_bound is not None and value < lo_bound
        if above or below:
            vals[key] = value
        else:
            mid_vals[key] = value

    return mid_vals if mid_return else vals


In [11]:
# A formatting function, used to output the mean, variance, and standard
# deviation of a given dictionary of int/float values
def summary(data, output = True, title = None, data_print = False):
    """Print summary statistics for a dict of numeric values and return them.

    Parameters
    ----------
    data : dict
        Mapping of key -> int/float value. Only the values enter the
        statistics; the keys are used when the dataset itself is printed.
    output : bool
        When true, print the average, variance and standard deviation.
    title : str, optional
        Heading printed before the statistics.
    data_print : bool
        When true, also print the dataset, with ``datetime`` keys rendered
        as "Mon YYYY".

    Returns
    -------
    dict
        ``{"avg": ..., "var": ..., "std": ...}`` as plain floats, computed
        with ``np.mean`` / ``np.var`` / ``np.std`` at their default settings.
        All three are NaN when ``data`` is empty.
    """
    values = list(data.values())

    if values:
        packet = {
            "avg": float(np.mean(values)),
            "var": float(np.var(values)),
            "std": float(np.std(values)),
        }
    else:
        # NumPy would emit an "empty slice" warning and yield NaN anyway;
        # produce the NaNs directly so empty input stays quiet.
        nan = float("nan")
        packet = {"avg": nan, "var": nan, "std": nan}

    if title is not None:
        print(title)

    if output:
        print("\n\tAvg: {avg:.2f} \n\tVar: {var:.2f} \n\tStd. Dev: {std:.2f} \n".format(**packet))

    if data_print:
        # If a key is a datetime, format it to show month and year
        explicit_count = {
            (key.strftime("%b %Y") if isinstance(key, datetime.datetime) else key): value
            for key, value in data.items()
        }
        print("Dataset: \n", explicit_count, "\n\n")

    # Hand the statistics back so callers can assign and reuse them.
    return packet

In [3]:
# Removing the header row from the dataset and storing it for reference.
# data.pop(0) mutates `data` in place, so re-running this cell would strip a
# real record each time. Only pop while the first row still looks like the
# header, i.e. its "year" column (index 1) is not numeric.
if data and not data[0][1].isdigit():
    headers = data.pop(0)
print(headers)
print(data[:5])

['', 'year', 'month', 'intent', 'police', 'sex', 'age', 'race', 'hispanic', 'place', 'education']
[['1', '2012', '01', 'Suicide', '0', 'M', '34', 'Asian/Pacific Islander', '100', 'Home', '4'], ['2', '2012', '01', 'Suicide', '0', 'F', '21', 'White', '100', 'Street', '3'], ['3', '2012', '01', 'Suicide', '0', 'M', '60', 'White', '100', 'Other specified', '4'], ['4', '2012', '02', 'Suicide', '0', 'M', '64', 'White', '100', 'Home', '4'], ['5', '2012', '02', 'Suicide', '0', 'M', '31', 'White', '100', 'Other specified', '2']]


In [4]:
# Tally the records per calendar year
YEAR_COL = 1  # position of the "year" field in each row
years = [record[YEAR_COL] for record in data]
year_counts = counts(years)
print(year_counts)

{'2014': 33599, '2013': 33636, '2012': 33563}


In [5]:
# Build one datetime per record from its year and month columns. The rows
# have no day-of-month field, so every date is pinned to the 1st.
YEAR_COL, MONTH_COL = 1, 2
dates = [
    datetime.datetime(int(record[YEAR_COL]), int(record[MONTH_COL]), 1)
    for record in data
]
print(dates[:5])

[datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 2, 1, 0, 0), datetime.datetime(2012, 2, 1, 0, 0)]


In [6]:
# Count how many records fall on each (year, month) date
date_counts = counts(dates)

{datetime.datetime(2012, 8, 1, 0, 0): 2954, datetime.datetime(2014, 2, 1, 0, 0): 2361, datetime.datetime(2013, 7, 1, 0, 0): 3079, datetime.datetime(2012, 9, 1, 0, 0): 2852, datetime.datetime(2012, 12, 1, 0, 0): 2791, datetime.datetime(2013, 10, 1, 0, 0): 2808, datetime.datetime(2013, 8, 1, 0, 0): 2859, datetime.datetime(2014, 3, 1, 0, 0): 2684, datetime.datetime(2013, 9, 1, 0, 0): 2742, datetime.datetime(2013, 1, 1, 0, 0): 2864, datetime.datetime(2013, 2, 1, 0, 0): 2375, datetime.datetime(2013, 6, 1, 0, 0): 2920, datetime.datetime(2013, 4, 1, 0, 0): 2798, datetime.datetime(2012, 5, 1, 0, 0): 2999, datetime.datetime(2013, 11, 1, 0, 0): 2758, datetime.datetime(2014, 12, 1, 0, 0): 2857, datetime.datetime(2014, 1, 1, 0, 0): 2651, datetime.datetime(2014, 4, 1, 0, 0): 2862, datetime.datetime(2012, 2, 1, 0, 0): 2357, datetime.datetime(2013, 12, 1, 0, 0): 2765, datetime.datetime(2013, 5, 1, 0, 0): 2806, datetime.datetime(2012, 10, 1, 0, 0): 2733, datetime.datetime(2014, 9, 1, 0, 0): 2914, date

In [7]:
# Collect and print every month whose count is greater than 2900. The
# matching entries are kept in a dict so the name holds usable data, rather
# than the list of None values produced by calling print() inside a
# comprehension.
dates_over_2900 = {month: total for month, total in date_counts.items() if total > 2900}
for month, total in dates_over_2900.items():
    print(month, ": ", total)

2012-08-01 00:00:00 :  2954
2013-07-01 00:00:00 :  3079
2013-06-01 00:00:00 :  2920
2012-05-01 00:00:00 :  2999
2014-09-01 00:00:00 :  2914
2014-06-01 00:00:00 :  2931
2012-07-01 00:00:00 :  3026
2014-08-01 00:00:00 :  2970


In [8]:
# Collect and print every month whose count is less than 2750. The matching
# entries are kept in a dict instead of calling print() inside a
# comprehension, which only yields a list of None values.
dates_under_2750 = {month: total for month, total in date_counts.items() if total < 2750}
# Backward-compatible alias: this cell originally used the name
# dates_under_2800 even though the threshold applied is 2750.
dates_under_2800 = dates_under_2750
for month, total in dates_under_2750.items():
    print(month, ": ", total)

2014-02-01 00:00:00 :  2361
2014-03-01 00:00:00 :  2684
2013-09-01 00:00:00 :  2742
2013-02-01 00:00:00 :  2375
2014-01-01 00:00:00 :  2651
2012-02-01 00:00:00 :  2357
2012-10-01 00:00:00 :  2733
2012-11-01 00:00:00 :  2729
2012-03-01 00:00:00 :  2743


In [12]:
# Partition the monthly counts into three bands: above the high threshold,
# below the low threshold, and everything that is outside neither.
HI_BOUND, LO_BOUND = 2900, 2750
hi_vals = value_bound(date_counts, up_bound=HI_BOUND)
lo_vals = value_bound(date_counts, lo_bound=LO_BOUND)
mid_vals = value_bound(
    date_counts, up_bound=HI_BOUND, lo_bound=LO_BOUND, mid_return=True
)

In [13]:
# Print the statistics and the underlying entries for each band in turn
for label, subset in (
    ("Hi Count Summary", hi_vals),
    ("Lo Count Summary", lo_vals),
    ("Mid Count Summary", mid_vals),
):
    summary(subset, title = label, data_print = True)

Hi Count Summary

	Avg: 2974.12 
	Var: 2891.86 
	Std. Dev: 53.78 

Dataset: 
 {'Aug 2014': 2970, 'Jul 2012': 3026, 'Jun 2013': 2920, 'Jun 2014': 2931, 'Jul 2013': 3079, 'Aug 2012': 2954, 'Sep 2014': 2914, 'May 2012': 2999} 


Lo Count Summary

	Avg: 2597.22 
	Var: 27925.06 
	Std. Dev: 167.11 

Dataset: 
 {'Mar 2012': 2743, 'Oct 2012': 2733, 'Feb 2014': 2361, 'Mar 2014': 2684, 'Nov 2012': 2729, 'Jan 2014': 2651, 'Feb 2012': 2357, 'Feb 2013': 2375, 'Sep 2013': 2742} 


Mid Count Summary

	Avg: 2822.63 
	Var: 1812.55 
	Std. Dev: 42.57 

Dataset: 
 {'Nov 2014': 2756, 'May 2013': 2806, 'Sep 2012': 2852, 'Oct 2014': 2865, 'Dec 2012': 2791, 'Dec 2014': 2857, 'Mar 2013': 2862, 'Dec 2013': 2765, 'Jun 2012': 2826, 'Oct 2013': 2808, 'Jan 2012': 2758, 'Jan 2013': 2864, 'Jul 2014': 2884, 'Apr 2013': 2798, 'Nov 2013': 2758, 'May 2014': 2864, 'Apr 2012': 2795, 'Aug 2013': 2859, 'Apr 2014': 2862} 




In [14]:
# Tally the records by sex
SEX_COL = 5  # position of the "sex" field in each row
sex = [record[SEX_COL] for record in data]
sex_counts = counts(sex)
print(sex_counts)

{'F': 14449, 'M': 86349}


In [15]:
# Tally the records by race
RACE_COL = 7  # position of the "race" field in each row
races = [record[RACE_COL] for record in data]
race_counts = counts(races)
print(race_counts)

{'Asian/Pacific Islander': 1326, 'White': 66237, 'Black': 23296, 'Hispanic': 9022, 'Native American/Native Alaskan': 917}


In [17]:
# Print avg/var/std across the per-race totals; race_sum keeps whatever
# summary() returns.
race_sum = summary(race_counts, output = True)


	Avg: 20159.60 
	Var: 596398414.64 
	Std. Dev: 24421.27 

