In [1]:
# Import required packages
import requests 
from bs4 import BeautifulSoup as bs
import pandas as pd

In [2]:
# Load the webpage
url = "https://en.wikipedia.org/wiki/List_of_school_shootings_in_the_United_States"
r = requests.get(url)

# Convert to a beautiful soup object
webpage = bs(r.content)

In [3]:
# Get all tables on the webpage
all_tables = webpage.select("table")
# Select only the 3 tables on school shootings
tables = all_tables[1:4]

In [4]:
# Get each individual table
table_2000s = tables[0]
table_2010s = tables[1]
table_2020s = tables[2]

In [5]:
# Note: all three tables have the same column names
# Get the table columns
columns = table_2000s.find("tr").find_all("th")
# Get the column names 
column_names = [c.string.strip() for c in columns]
print(column_names)

['Date', 'Location', 'Deaths', 'Injuries', 'Description']


In [6]:
# Function to create a pandas dataframe from table data
# Input: table to scrape data from, list of column names
# Output: pandas dataframe containing data

def create_df(table,colnames):
    # get all table rows except for header
    table_rows = table.select("tr")[1:]
    # create empty list of list to store rows
    ls = []

    # get data for each table row as list and store in another list
    for tr in table_rows:
        td = tr.find_all("td")
        row = [tr.get_text().strip() for tr in td]
        ls.append(row)

    # create a pandas dataframe from list of list
    df = pd.DataFrame(ls, columns = colnames)
    return(df)

In [7]:
df_2000s = create_df(table_2000s, column_names)
df_2000s.head()

Unnamed: 0,Date,Location,Deaths,Injuries,Description
0,"February 29, 2000","Flint, Michigan",1,0,Shooting of Kayla Rolland: At Buell Elementary...
1,"May 26, 2000","Lake Worth, Florida",1,0,"13-year-old honor student, Nathaniel Brazill w..."
2,"June 28, 2000","Seattle, Washington",2[n 1],0,58-year-old Director of the Division of Pathol...
3,"August 28, 2000","Fayetteville, Arkansas",2[n 1],0,"36-year-old James Easton Kelly, a PhD candidat..."
4,"September 26, 2000","New Orleans, Louisiana",0,2[n 1],13 year-olds Darrel Johnson and Alfred Anderso...


In [8]:
df_2010s = create_df(table_2010s, column_names)
df_2010s.head()

Unnamed: 0,Date,Location,Deaths,Injuries,Description
0,"February 5, 2010","Madison, Alabama",1,0,"14-year-old student, Hammad Memon, killed 14-y..."
1,"February 19, 2010","DeKalb, Illinois",0,1,Less than a week after Northern Illinois Unive...
2,"February 23, 2010","Littleton, Colorado",0,2,"At Deer Creek Middle School, 32-year-old Bruco..."
3,"May 11, 2010","Bladenboro, North Carolina",0,1,A West Bladen High School student was shot on ...
4,"September 8, 2010","Detroit, Michigan",0,2,Two students were wounded in front of Mumford ...


In [9]:
df_2020s = create_df(table_2020s, column_names)
df_2020s.head()

Unnamed: 0,Date,Location,Deaths,Injuries,Description
0,"January 8, 2020","Belle Glade, Florida",0,1,An individual who was not a student accidental...
1,"January 11, 2020","Dallas, Texas",1,1,A 15-year-old was arrested after two people we...
2,"January 14, 2020","Bellaire, Texas",1,0,A 19-year-old student was shot in the chest an...
3,"January 14, 2020","Fort Worth, Texas",0,2,An adult male and a 10-year-old child were inj...
4,"January 23, 2020","Oxnard, California",0,1,A stray bullet fired during a street altercati...


In [10]:
result_df = pd.concat([df_2000s, df_2010s, df_2020s])
result_df

Unnamed: 0,Date,Location,Deaths,Injuries,Description
0,"February 29, 2000","Flint, Michigan",1,0,Shooting of Kayla Rolland: At Buell Elementary...
1,"May 26, 2000","Lake Worth, Florida",1,0,"13-year-old honor student, Nathaniel Brazill w..."
2,"June 28, 2000","Seattle, Washington",2[n 1],0,58-year-old Director of the Division of Pathol...
3,"August 28, 2000","Fayetteville, Arkansas",2[n 1],0,"36-year-old James Easton Kelly, a PhD candidat..."
4,"September 26, 2000","New Orleans, Louisiana",0,2[n 1],13 year-olds Darrel Johnson and Alfred Anderso...
...,...,...,...,...,...
68,"May 18, 2022","Murfreesboro, Tennessee",1,1,Following a graduation ceremony for Riverdale ...
69,"May 19, 2022","Hammond, Louisiana",0,3,Three people were hospitalized after a shootin...
70,"May 19, 2022","Kentwood, Michigan",0,2,Two people were shot outside of East Kentwood ...
71,"May 24, 2022","Uvalde, Texas",22[n 1],18,Robb Elementary School shooting: An 18-year-ol...


In [11]:
# Save the final result df as a csv file in the current directory
result_df.to_csv("school_shootings_US_data_v01.csv", index = False, header = True)