# US Gun Violence

## Importing required libraries

In [1]:
# importing the required libraries

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import os

## Accessing Directory and files paths

In [2]:
# location of the downloaded datasets (LOCAL)

path = r'C:\Users\pryns\OneDrive\Desktop\Dataset'

In [3]:
# get the list of all the data files from location

for i in os.walk(path,topdown = False):
    files = i[2]

In [4]:
# list of all the files available in the dataset directory

files

['2014_mass_shootings.csv',
 '2015_mass_shootings.csv',
 '2016_mass_shootings.csv',
 '2017_mass_shootings.csv',
 '2018_mass_shootings.csv',
 '2019_mass_shootings.csv',
 '2020_mass_shootings.csv',
 '2020_to_2023_mass_shootings.csv',
 '2021_mass_shootings.csv',
 '2022_mass_shootings.csv',
 'us_census_bureau_regions_divisions.csv',
 'us_state_code.csv']

- **Individual Year Datasets**: '2014_mass_shootings.csv', '2015_mass_shootings.csv', '2016_mass_shootings.csv', '2017_mass_shootings.csv', '2018_mass_shootings.csv', '2019_mass_shootings.csv', '2020_mass_shootings.csv', '2021_mass_shootings.csv', '2022_mass_shootings.csv'
- **Combined data file for 4 years** - '2020_to_2023_mass_shootings.csv' from which we are going to extract data for 2023

In [5]:
# Creating filepath(s) for each file in order to access each data separately

filepaths = []

for i in files:
    filepaths.append(os.path.join(path,i))

In [6]:
filepaths

['C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\2014_mass_shootings.csv',
 'C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\2015_mass_shootings.csv',
 'C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\2016_mass_shootings.csv',
 'C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\2017_mass_shootings.csv',
 'C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\2018_mass_shootings.csv',
 'C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\2019_mass_shootings.csv',
 'C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\2020_mass_shootings.csv',
 'C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\2020_to_2023_mass_shootings.csv',
 'C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\2021_mass_shootings.csv',
 'C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\2022_mass_shootings.csv',
 'C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\us_census_bureau_regions_divisions.csv',
 'C:\\Users\\pryns\\OneDrive\\Desktop\\Dataset\\us_state_code.csv']

## Looking into one data file

In [7]:
# reading and checking 2014 data

data_2014 = pd.read_csv(filepaths[0])

In [8]:
data_2014.head()

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,Victims Killed,Victims Injured,Suspects Killed,Suspects Injured,Suspects Arrested,Operations
0,271363,"December 29, 2014",Louisiana,New Orleans,Poydras and Bolivar,0,4,0,0,0,
1,269679,"December 27, 2014",California,Los Angeles,8800 block of South Figueroa Street,1,3,0,0,0,
2,270036,"December 27, 2014",California,Sacramento,4000 block of May Street,0,4,0,0,0,
3,269167,"December 26, 2014",Illinois,East St. Louis,2500 block of Summit Avenue,1,3,0,0,0,
4,268598,"December 24, 2014",Missouri,Saint Louis,18th and Pine,1,3,0,0,0,


In [9]:
data_2014.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Incident ID        272 non-null    int64  
 1   Incident Date      272 non-null    object 
 2   State              272 non-null    object 
 3   City Or County     272 non-null    object 
 4   Address            272 non-null    object 
 5   Victims Killed     272 non-null    int64  
 6   Victims Injured    272 non-null    int64  
 7   Suspects Killed    272 non-null    int64  
 8   Suspects Injured   272 non-null    int64  
 9   Suspects Arrested  272 non-null    int64  
 10  Operations         0 non-null      float64
dtypes: float64(1), int64(6), object(4)
memory usage: 23.5+ KB


In [11]:
data_2014.columns

Index(['Incident ID', 'Incident Date', 'State', 'City Or County', 'Address',
       'Victims Killed', 'Victims Injured', 'Suspects Killed',
       'Suspects Injured', 'Suspects Arrested', 'Operations'],
      dtype='object')

In [12]:
# check negative values
# `.values` to convert to numpy array and apply any() across rows

(data_2014.select_dtypes('int64').values < 0).any()

False

- no null or negative values found for the required columns, except "operations" which is not required.

In [13]:
# lets verfiy the shapes of the current data set

data_2014.shape

(272, 11)

- there are 11 columns and 272 rows

## Verifying all the data sets

- checking the no. of columns, rows and header labels

In [19]:
columns_check = []
rows_check = []
headers = set()

for i in filepaths[0:10]:
    df = pd.read_csv(i)
    columns_check.append(df.shape[1]) # check the no. of columns
    rows_check.append(df.shape[0]) # check no. of rows
    
            # check unique headers
    headers.update(df.columns)

In [23]:
print(len(headers),headers)
print(columns_check)
print(rows_check)

11 {'Suspects Injured', 'Victims Injured', 'Incident Date', 'Operations', 'City Or County', 'Incident ID', 'Victims Killed', 'Suspects Arrested', 'Address', 'State', 'Suspects Killed'}
[11, 11, 11, 11, 11, 11, 11, 11, 11, 11]
[272, 333, 383, 347, 335, 414, 610, 2000, 689, 645]


In [49]:
sum(rows_check)

6028

* The headers are good and have the same headers for all the data sets

## Combining required data into a single data set - 2014 to 2023

In [28]:
# creating an empty dataset

df0 = pd.DataFrame([],columns = data_2014.columns)

In [29]:
# combining all the data till 2023

for i in filepaths[0:10]:
    df = pd.read_csv(i)
    df0 = pd.concat([df0,df], ignore_index = True)

In [31]:
# 2014 to 2022 data

df0.head()

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,Victims Killed,Victims Injured,Suspects Killed,Suspects Injured,Suspects Arrested,Operations
0,271363,"December 29, 2014",Louisiana,New Orleans,Poydras and Bolivar,0,4,0,0,0,
1,269679,"December 27, 2014",California,Los Angeles,8800 block of South Figueroa Street,1,3,0,0,0,
2,270036,"December 27, 2014",California,Sacramento,4000 block of May Street,0,4,0,0,0,
3,269167,"December 26, 2014",Illinois,East St. Louis,2500 block of Summit Avenue,1,3,0,0,0,
4,268598,"December 24, 2014",Missouri,Saint Louis,18th and Pine,1,3,0,0,0,


In [32]:
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6028 entries, 0 to 6027
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Incident ID        6028 non-null   object
 1   Incident Date      6028 non-null   object
 2   State              6028 non-null   object
 3   City Or County     6028 non-null   object
 4   Address            6005 non-null   object
 5   Victims Killed     6028 non-null   object
 6   Victims Injured    6028 non-null   object
 7   Suspects Killed    6028 non-null   object
 8   Suspects Injured   6028 non-null   object
 9   Suspects Arrested  6028 non-null   object
 10  Operations         0 non-null      object
dtypes: object(11)
memory usage: 518.2+ KB


- 23 address columns are empty
- Operations columns is not required

In [33]:
# keeping the original combined data intact and creating a copy

df = df0.copy()

In [35]:
df.shape

(6028, 11)

## Analysing and Cleaning the combined data

In [38]:
# dropping the Operations column

df.drop("Operations",axis=1,inplace = True)

In [106]:
df.head(2)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,Victims Killed,Victims Injured,Suspects Killed,Suspects Injured,Suspects Arrested
0,271363,"December 29, 2014",Louisiana,New Orleans,Poydras and Bolivar,0,4,0,0,0
1,269679,"December 27, 2014",California,Los Angeles,8800 block of South Figueroa Street,1,3,0,0,0


- since all the data was combined, there will be duplicate rows.

In [108]:
# dropping duplicate values

df.drop_duplicates(inplace = True)

In [110]:
df.head(2)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,Victims Killed,Victims Injured,Suspects Killed,Suspects Injured,Suspects Arrested
0,271363,"December 29, 2014",Louisiana,New Orleans,Poydras and Bolivar,0,4,0,0,0
1,269679,"December 27, 2014",California,Los Angeles,8800 block of South Figueroa Street,1,3,0,0,0


In [112]:
# after removing duplicates, there are unique values and the no. of rows have reduced from 6028 to 4521

df.shape

(4521, 10)

In [113]:
df.head()

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,Victims Killed,Victims Injured,Suspects Killed,Suspects Injured,Suspects Arrested
0,271363,"December 29, 2014",Louisiana,New Orleans,Poydras and Bolivar,0,4,0,0,0
1,269679,"December 27, 2014",California,Los Angeles,8800 block of South Figueroa Street,1,3,0,0,0
2,270036,"December 27, 2014",California,Sacramento,4000 block of May Street,0,4,0,0,0
3,269167,"December 26, 2014",Illinois,East St. Louis,2500 block of Summit Avenue,1,3,0,0,0
4,268598,"December 24, 2014",Missouri,Saint Louis,18th and Pine,1,3,0,0,0


In [114]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4521 entries, 0 to 4520
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Incident ID        4521 non-null   object
 1   Incident Date      4521 non-null   object
 2   State              4521 non-null   object
 3   City Or County     4521 non-null   object
 4   Address            4507 non-null   object
 5   Victims Killed     4521 non-null   object
 6   Victims Injured    4521 non-null   object
 7   Suspects Killed    4521 non-null   object
 8   Suspects Injured   4521 non-null   object
 9   Suspects Arrested  4521 non-null   object
dtypes: object(10)
memory usage: 388.5+ KB


### Datetime conversion

* Converting the dates that are in string to datetime type

In [116]:
df['Incident Date'] = pd.to_datetime(df['Incident Date'])

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4521 entries, 0 to 4520
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Incident ID        4521 non-null   object        
 1   Incident Date      4521 non-null   datetime64[ns]
 2   State              4521 non-null   object        
 3   City Or County     4521 non-null   object        
 4   Address            4507 non-null   object        
 5   Victims Killed     4521 non-null   object        
 6   Victims Injured    4521 non-null   object        
 7   Suspects Killed    4521 non-null   object        
 8   Suspects Injured   4521 non-null   object        
 9   Suspects Arrested  4521 non-null   object        
dtypes: datetime64[ns](1), object(9)
memory usage: 388.5+ KB


In [118]:
df.head()

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,Victims Killed,Victims Injured,Suspects Killed,Suspects Injured,Suspects Arrested
0,271363,2014-12-29,Louisiana,New Orleans,Poydras and Bolivar,0,4,0,0,0
1,269679,2014-12-27,California,Los Angeles,8800 block of South Figueroa Street,1,3,0,0,0
2,270036,2014-12-27,California,Sacramento,4000 block of May Street,0,4,0,0,0
3,269167,2014-12-26,Illinois,East St. Louis,2500 block of Summit Avenue,1,3,0,0,0
4,268598,2014-12-24,Missouri,Saint Louis,18th and Pine,1,3,0,0,0


- checking if the incident ID is distinct

In [131]:
# Incident ID is unique; no duplicates

df["Incident ID"].unique().shape

(4521,)

## spliting the dataset for POWER BI consumption

In [143]:
df.columns

Index(['Incident ID', 'Incident Date', 'State', 'City Or County', 'Address',
       'Victims Killed', 'Victims Injured', 'Suspects Killed',
       'Suspects Injured', 'Suspects Arrested'],
      dtype='object')

In [145]:
df_1 = df[['Incident ID', 'State', 'City Or County', 'Address']]
df_2 = df[['Incident ID', 'Incident Date','Victims Killed', 'Victims Injured', 'Suspects Killed','Suspects Injured', 'Suspects Arrested']]

In [147]:
df_1.head()

Unnamed: 0,Incident ID,State,City Or County,Address
0,271363,Louisiana,New Orleans,Poydras and Bolivar
1,269679,California,Los Angeles,8800 block of South Figueroa Street
2,270036,California,Sacramento,4000 block of May Street
3,269167,Illinois,East St. Louis,2500 block of Summit Avenue
4,268598,Missouri,Saint Louis,18th and Pine


In [148]:
df_2.head()

Unnamed: 0,Incident ID,Incident Date,Victims Killed,Victims Injured,Suspects Killed,Suspects Injured,Suspects Arrested
0,271363,2014-12-29,0,4,0,0,0
1,269679,2014-12-27,1,3,0,0,0
2,270036,2014-12-27,0,4,0,0,0
3,269167,2014-12-26,1,3,0,0,0
4,268598,2014-12-24,1,3,0,0,0


## exporting the datasets to .csv for further analysis in POWER BI

In [149]:
# exporting data to local storage

df.to_csv(r'C:\Users\pryns\OneDrive\Desktop\Dataset\Processed Dataset for POWER BI\incidents_master.csv',index = False)
df_1.to_csv(r'C:\Users\pryns\OneDrive\Desktop\Dataset\Processed Dataset for POWER BI\incidents_geographic_location.csv',index = False)
df_2.to_csv(r'C:\Users\pryns\OneDrive\Desktop\Dataset\Processed Dataset for POWER BI\incidents_data.csv',index = False)