In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
shooters = pd.read_csv('Data/Stanford_MSA_Database.csv')

OK, so there's lots of things we can do with the dataset given that it's a full 335 rows with 55 columns of descriptive text data each. Let's start by coming up with some basic questions we want answered:
- What kinds of weapons are most popular, and where?
- Where are guns most readily accessible in the country?
- How do warning signs change with age and other factors?
- Does lethality change with age?
- Have shootings become, on the whole, more lethal over time?
- How do the number of fatalities compare to the total number of victims?
- Has the number of guns used in a shooting changed on average over time?
- What happens to the shooter, and how does that change with age and race? Does their fate change substantially in incidences of school shootings?
- How do the motivations of mass shooters differ with geography?
- Does military experience alter the motive?

In [4]:
shooters

Unnamed: 0,CaseID,Title,Location,City,State,Latitude,Longitude,Number of Civilian Fatalities,Number of Civilian Injured,Number of Enforcement Fatalities,...,Data Source 3,Data Source 4,Data Source 5,Data Source 6,Data Source 7,Military Experience,Class,Depreciation,Notes,Edit Date
0,1,University of Texas at Austin,"Austin, Texas",Austin,Texas,30.198887,-97.844159,15,32,1,...,http://news.google.com/newspapers?id=lkk0AAAAI...,http://news.google.com/newspapers?id=PPUjAAAAI...,http://books.google.com/books?id=ClYEAAAAMBAJ&...,,,Yes,SPK,1,,6/8/2016
1,2,Rose-Mar College of Beauty,"Mesa, Arizona",Mesa,Arizona,33.422687,-111.816320,5,1,0,...,http://www.nydailynews.com/news/crime/beauty-s...,http://books.google.com/books?id=Cre7qsswRiwC&...,,,,Unknown,MS,1,,6/8/2016
2,3,New Orleans Police Shootings,"New Orleans, Louisiana",New Orleans,Louisiana,30.068724,-89.931474,4,8,5,...,http://www.trutv.com/library/crime/notorious_m...,http://books.google.com/books?id=TfEDmROcZwEC&...,,,,Unknown,SPK,1,,6/22/2016
3,4,Clara Barton Elementary School,"Chicago, Illinois",Chicago,Illinois,41.839280,-87.688181,1,3,0,...,http://www.leagle.com/xmlResult.aspx?page=1&xm...,,,,,Unknown,MS,1,,6/21/2016
4,5,Olean High School,"Olean, New York",Olean,New York,42.081854,-78.432139,3,7,0,...,http://www.newswithviews.com/Erica/Carle10.htm,http://books.google.com/books?id=ZuKoSskEWyIC&...,"""Olean High School Shooting"". Larrie Benton Za...",,,Unknown,MS,1,,6/21/2016
5,6,Los Angeles Computer Learning Center,"Los Angeles, California",Los Angeles,California,34.176221,-118.539954,1,6,1,...,http://www.newspapers.com/newspage/15323499/,,,,,Unknown,MS,1,,6/22/2016
6,7,Cal State Fullerton,"Fullerton, California",Fullerton,California,33.884042,-117.927850,7,2,0,...,http://www.dailytitan.com/2011/11/csuf-massacr...,http://criminalminds.wikia.com/wiki/Edward_All...,http://www.dailytitan.com/2011/11/csuf-massacr...,,,Unknown,MS,1,,6/21/2016
7,8,Grover Cleveland Elementary School,"San Diego, California",San Diego,California,32.863573,-117.128163,2,8,0,...,http://www.examiner.com/article/the-san-diego-...,http://signofthetimes.yuku.com/topic/1258#.Uvb...,https://www.mail-archive.com/seeknfind@ashlist...,,,Unknown,MS,1,,6/22/2016
8,9,University of South Carolina,"Columbia, South Carolina",Columbia,South Carolina,34.050988,-80.820775,2,5,0,...,http://news.google.com/newspapers?nid=1891&dat...,http://news.google.com/newspapers?nid=1338&dat...,,,,Unknown,MS,1,,6/21/2016
9,10,Valley High School,"Las Vegas, Nevada",Las Vegas,Nevada,36.189319,-115.326487,1,2,0,...,http://www.leagle.com/xmlResult.aspx?page=1&xm...,http://news.google.com/newspapers?nid=1345&dat...,http://www.lasvegassun.com/news/1999/apr/23/sl...,,,Unknown,MS,1,,6/21/2016


In [5]:
list(shooters)

['CaseID',
 'Title',
 'Location',
 'City',
 'State',
 'Latitude',
 'Longitude',
 'Number of Civilian Fatalities',
 'Number of Civilian Injured',
 'Number of Enforcement Fatalities',
 'Number of Enforcement Injured',
 'Total Number of Fatalities',
 'Total Number of Victims',
 'Description',
 'Date',
 'Day of Week',
 'Date - Detailed',
 'Shooter Name',
 'Number of shooters',
 'Shooter Age(s)',
 'Average Shooter Age',
 'Shooter Sex',
 'Shooter Race',
 'Type of Gun - Detailed',
 'Type of Gun - General',
 'Number of Shotguns',
 'Number of Rifles',
 'Number of Handguns',
 'Total Number of Guns',
 'Number of Automatic Guns',
 'Number of Semi-Automatic Guns',
 'Fate of Shooter at the scene',
 'Fate of Shooter',
 "Shooter's Cause of Death",
 'School Related',
 'Place Type',
 'Relationship to Incident Location',
 'Targeted Victim/s - Detailed',
 'Targeted Victim/s - General',
 'Possible Motive - Detailed',
 'Possible Motive - General',
 'History of Mental Illness - Detailed',
 'History of Mental

I'm curious what's going to happen if I run a market basket analysis. Let's clean up the data a little more and see what happens. I'm working off of the following tutorial:
http://pbpython.com/market-basket-analysis.html

In [75]:
cleaned_shooters = shooters.copy(deep=True)

#Clean gun types
cleaned_shooters['Type of Gun - General'] = cleaned_shooters['Type of Gun - General'].str.strip()
cleaned_shooters['Type of Gun - General'] = cleaned_shooters['Type of Gun - General'].str.lower()
cleaned_shooters['Type of Gun - General'].unique()

array(['multiple guns', 'handgun', 'shotgun', 'rifle', 'unknown',
       'semi-automatic rifle', '9-mm'], dtype=object)

In [76]:
#Clean gun counts
guncount_types = ['Number of Shotguns','Number of Handguns','Number of Rifles']
for guncount_type in guncount_types:
    cleaned_shooters[guncount_type].replace('Unknown', 0, inplace=True)

In [77]:
#Clean military experience
cleaned_shooters['Military Experience'] = cleaned_shooters['Military Experience'].str.lower()
cleaned_shooters['Military Experience'].replace('yes', True, inplace=True)
cleaned_shooters['Military Experience'].replace(['no', 'unknown'], False, inplace=True)

In [78]:
#Set civilian + law enforcement casualties, injuries to bools
cleaned_shooters['Number of Civilian Fatalities'] = cleaned_shooters['Number of Civilian Fatalities'] > 0
cleaned_shooters['Number of Civilian Injured'] = cleaned_shooters['Number of Civilian Injured'] > 0
cleaned_shooters['Number of Enforcement Fatalities'] = cleaned_shooters['Number of Enforcement Fatalities'] > 0
cleaned_shooters['Number of Enforcement Injured'] = cleaned_shooters['Number of Enforcement Injured'] > 0
cleaned_shooters['High Civilian Casualties'] = (shooters['Number of Civilian Fatalities'] + shooters['Number of Civilian Injured']) > 10

In [79]:
#Corner cases (from errors)
cleaned_shooters['Number of Shotguns'].replace('Handgun', 0, inplace=True)
cleaned_shooters['Number of Rifles'].replace('0 (1)', 0, inplace=True)
cleaned_shooters['Number of Handguns'].replace('2 (1)', 0, inplace=True)

In [80]:
#Set gun types to bools
cleaned_shooters['Number of Shotguns'] = pd.to_numeric(cleaned_shooters['Number of Shotguns']) > 0
cleaned_shooters['Number of Rifles'] = pd.to_numeric(cleaned_shooters['Number of Rifles']) > 0
cleaned_shooters['Number of Handguns'] = pd.to_numeric(cleaned_shooters['Number of Handguns']) > 0

In [82]:
#We'll iterate over our desired features and flatten them s.t.
#each unique value has a column, and if the incident has that value
#we'll just assign the matching column a 1 and the other ones a 0
marketbasket = cleaned_shooters[[
    'Number of Civilian Fatalities',
    'Number of Civilian Injured',
    'Number of Enforcement Fatalities',
    'Number of Enforcement Injured',
    'Military Experience',
    'Number of Shotguns',
    'Number of Rifles',
    'Number of Handguns',
    'High Civilian Casualties'
]]


#Get the logical inverses as well
marketbasket['No Civilian Fatalities'] = ~marketbasket['Number of Civilian Fatalities']
marketbasket['No Civilian Injured'] = ~marketbasket['Number of Civilian Injured']
marketbasket['No Enforcement Fatalities'] = ~marketbasket['Number of Enforcement Fatalities']
marketbasket['No Enforcement Injured'] = ~marketbasket['Number of Enforcement Injured']


##TODO: Need to finish generating the rest of the dataframe by expanding out unique values across columns
desired_features = [
    'Military Experience',
    'Number of En'
]

marketbasket

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Number of Civilian Fatalities,Number of Civilian Injured,Number of Enforcement Fatalities,Number of Enforcement Injured,Military Experience,Number of Shotguns,Number of Rifles,Number of Handguns,High Civilian Casualties,No Civilian Fatalities,No Civilian Injured,No Enforcement Fatalities,No Enforcement Injured
0,True,True,True,False,True,True,True,True,True,False,False,False,True
1,True,True,False,False,False,False,False,True,False,False,False,True,True
2,True,True,True,True,False,False,True,True,True,False,False,False,False
3,True,True,False,False,False,False,False,True,False,False,False,True,True
4,True,True,False,False,False,True,True,False,False,False,False,True,True
5,True,True,True,False,False,True,False,False,False,False,False,False,True
6,True,True,False,False,False,False,True,False,False,False,False,True,True
7,True,True,False,True,False,False,True,False,False,False,False,True,False
8,True,True,False,False,False,False,False,True,False,False,False,True,True
9,True,True,False,False,False,False,False,True,False,False,False,True,True


In [86]:
#Apply apriori market basket algorithm
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
apriori_res = apriori(marketbasket, min_support=0.1, use_colnames=True)

In [87]:
apriori_res['itemsets'] = apriori_res['itemsets'].apply(lambda x: ''.join(e + ', ' for e in x))

In [88]:
pd.options.display.max_colwidth = 300
apriori_res.sort_values('support')

Unnamed: 0,support,itemsets
122,0.101493,"Number of Civilian Fatalities, Number of Civilian Injured, Number of Handguns, High Civilian Casualties, No Enforcement Fatalities, No Enforcement Injured,"
120,0.101493,"Number of Civilian Fatalities, Number of Handguns, No Civilian Injured, No Enforcement Fatalities, No Enforcement Injured,"
77,0.101493,"Number of Rifles, Number of Handguns, No Enforcement Fatalities,"
119,0.101493,"Number of Civilian Fatalities, Number of Handguns, High Civilian Casualties, No Enforcement Fatalities, No Enforcement Injured,"
78,0.101493,"Number of Rifles, Number of Handguns, No Enforcement Injured,"
88,0.101493,"Number of Civilian Fatalities, Number of Civilian Injured, Number of Shotguns, No Enforcement Injured,"
113,0.101493,"Number of Handguns, No Civilian Injured, No Enforcement Fatalities, No Enforcement Injured,"
102,0.104478,"Number of Civilian Fatalities, Number of Handguns, No Civilian Injured, No Enforcement Injured,"
82,0.104478,"Number of Handguns, No Civilian Injured, No Enforcement Fatalities,"
121,0.104478,"Number of Civilian Injured, Number of Handguns, High Civilian Casualties, No Enforcement Fatalities, No Enforcement Injured,"


In [None]:
#Group the dataset by year
shooters['Date'] = pd.to_datetime(shooters['Date'])
dategroups = shooters.groupby(shooters.Date.dt.year)


#Iterate over groups, count each gun type
for name, grouped in dategroups:
    gun_counts = grouped['Type of Gun - General'].count().transpose()
    print(name)
    print(gun_counts)
    


In [None]:
from altair import *
chart = Chart(data).mark_area(
    stacked='center',
).encode(
    color=Color('series:N',
        scale=Scale(
            range='category20b',
        ),
    ),
    x=X('date:T',
        axis=Axis(
            axisWidth=0.0,
            format='%Y',
            labelAngle=0.0,
            tickSize=0.0,
        ),
        scale=Scale(
            nice='month',
        ),
        timeUnit='yearmonth',
    ),
    y=Y('sum(count):Q',
        axis=False,
    ),
).configure_cell(
    height=200.0,
    width=300.0,
)