

## Datra Wrangling Projects Part 2:  Web scraping

## Housing Prices in Massachusetts (2010-2019) with relation to Socioeconomic and Demographic Data

### Pavel Makarov

### Web source 1 - Wikipedia

In [77]:
# Import necessary libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [79]:
# Assign the wqebpage adress to a variable
url_1 = "https://en.wikipedia.org/wiki/Demographics_of_Massachusetts"

In [81]:
# Send the request for the assigned url
page = requests.get(url_1)

In [83]:
# Create a soup object from the requested html using BeaitifulSoup package
soup = BeautifulSoup(page.text, 'html')

In [84]:
# Find all tables by looking for <table> object in html
list_of_tables = soup.find_all('table')

In [85]:
# Find the length of the list of tables
len(list_of_tables)

10

In [86]:
# Find Massachusetts demographic data table and create a variable for it 
demog_table = soup.find("table", attrs = {"class": 'us-census-pop us-census-pop-right' })
demog_table

<table class="us-census-pop us-census-pop-right">
<caption>Historical population</caption>
<tbody><tr><th scope="col">Census</th><th scope="col"><abbr title="Population">Pop.</abbr></th><th scope="col"><style data-mw-deduplicate="TemplateStyles:r1152813436">.mw-parser-output .sr-only{border:0;clip:rect(0,0,0,0);clip-path:polygon(0px 0px,0px 0px,0px 0px);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px;white-space:nowrap}</style><span class="sr-only">Note</span></th><th scope="col"><abbr title="Percent change">%±</abbr></th></tr>
<tr><th scope="row"><a href="/wiki/1790_United_States_census" title="1790 United States census">1790</a></th><td>378,787</td><td></td><td>—</td></tr><tr><th scope="row"><a href="/wiki/1800_United_States_census" title="1800 United States census">1800</a></th><td>422,845</td><td></td><td>11.6%</td></tr><tr><th scope="row"><a href="/wiki/1810_United_States_census" title="1810 United States census">1810</a></th><td>472,040</td><td></td><

In [87]:
# Create a variable that will store all found table rows
table_rows = demog_table.find_all('tr')

In [91]:
# Inspect all extracted rows
table_rows

[<tr><th scope="col">Census</th><th scope="col"><abbr title="Population">Pop.</abbr></th><th scope="col"><style data-mw-deduplicate="TemplateStyles:r1152813436">.mw-parser-output .sr-only{border:0;clip:rect(0,0,0,0);clip-path:polygon(0px 0px,0px 0px,0px 0px);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px;white-space:nowrap}</style><span class="sr-only">Note</span></th><th scope="col"><abbr title="Percent change">%±</abbr></th></tr>,
 <tr><th scope="row"><a href="/wiki/1790_United_States_census" title="1790 United States census">1790</a></th><td>378,787</td><td></td><td>—</td></tr>,
 <tr><th scope="row"><a href="/wiki/1800_United_States_census" title="1800 United States census">1800</a></th><td>422,845</td><td></td><td>11.6%</td></tr>,
 <tr><th scope="row"><a href="/wiki/1810_United_States_census" title="1810 United States census">1810</a></th><td>472,040</td><td></td><td>11.6%</td></tr>,
 <tr><th scope="row"><a href="/wiki/1820_United_States_census" title=

In [94]:
# Exctract table row that contains the column names. It comes right after tbody element
if demog_table:
    header_row = demog_table.find('tbody').find('tr') if demog_table.find('tbody') else demog_table.find('tr')
    # From the first table row exctract text from table header element
    column_names = [th.text.strip() for th in header_row.find_all('th')]

In [95]:
# Check if column names were exctracted properly
column_names

['Census', 'Pop.', 'Note', '%±']

In [96]:
# Remove unnecessary column
column_names.remove('Note')

In [97]:
# Change the names for the columns
column_names[0] = 'Census_Year'
column_names[1] = 'Population'
column_names[2] = 'Percent change'

In [98]:
# Using for loop exctract all data table rows that are contained in td elements. Skip first one because it is column names.
parsed_data = []
for rows in table_rows[1:]:
    row_data = rows.find_all('td')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        parsed_data.append(line)


In [99]:
# Check the results
parsed_data

[['378,787', '', '—'],
 ['422,845', '', '11.6%'],
 ['472,040', '', '11.6%'],
 ['523,287', '', '10.9%'],
 ['610,408', '', '16.6%'],
 ['737,699', '', '20.9%'],
 ['994,514', '', '34.8%'],
 ['1,231,066', '', '23.8%'],
 ['1,457,351', '', '18.4%'],
 ['1,783,085', '', '22.4%'],
 ['2,238,947', '', '25.6%'],
 ['2,805,346', '', '25.3%'],
 ['3,366,416', '', '20.0%'],
 ['3,852,356', '', '14.4%'],
 ['4,249,614', '', '10.3%'],
 ['4,316,721', '', '1.6%'],
 ['4,690,514', '', '8.7%'],
 ['5,148,578', '', '9.8%'],
 ['5,689,170', '', '10.5%'],
 ['5,737,037', '', '0.8%'],
 ['6,016,425', '', '4.9%'],
 ['6,349,097', '', '5.5%'],
 ['6,547,629', '', '3.1%'],
 ['7,029,917', '', '7.4%'],
 ['6,981,974', '', '−0.7%'],
 ['Sources:[1][2][3]']]

##### The first column is missing becuase it is stored under differnet element type - 'th'

In [101]:
# Usiing for loop exctract all information correpsonding to years
years = []
for rows in table_rows[1:]:
    row_data = rows.find_all('th')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        years.append(line)

In [103]:
# Check the results
years

[['1790'],
 ['1800'],
 ['1810'],
 ['1820'],
 ['1830'],
 ['1840'],
 ['1850'],
 ['1860'],
 ['1870'],
 ['1880'],
 ['1890'],
 ['1900'],
 ['1910'],
 ['1920'],
 ['1930'],
 ['1940'],
 ['1950'],
 ['1960'],
 ['1970'],
 ['1980'],
 ['1990'],
 ['2000'],
 ['2010'],
 ['2020'],
 ['2022 (est.)']]

In [104]:
# create a data frame using the list of years
df_years = pd.DataFrame(data = years, columns = ['Year'])

In [105]:
# Check the data table
df_years

Unnamed: 0,Year
0,1790
1,1800
2,1810
3,1820
4,1830
5,1840
6,1850
7,1860
8,1870
9,1880


In [106]:
# Create a data frame for the rest of the information
df = pd.DataFrame(data = parsed_data)

In [108]:
# Check the data frame
df

Unnamed: 0,0,1,2
0,378787,,—
1,422845,,11.6%
2,472040,,11.6%
3,523287,,10.9%
4,610408,,16.6%
5,737699,,20.9%
6,994514,,34.8%
7,1231066,,23.8%
8,1457351,,18.4%
9,1783085,,22.4%


### Transformation 1 - Data frames concatenation. Since inforamtion for years and population are stored in different data frames it is necessary to combine them into one data frame

In [115]:
# Combine the data frames
df_combined = pd.concat([df_years, df], axis = 1)

In [116]:
# Check combined data frame
df_combined

Unnamed: 0,Year,0,1,2
0,1790,378787,,—
1,1800,422845,,11.6%
2,1810,472040,,11.6%
3,1820,523287,,10.9%
4,1830,610408,,16.6%
5,1840,737699,,20.9%
6,1850,994514,,34.8%
7,1860,1231066,,23.8%
8,1870,1457351,,18.4%
9,1880,1783085,,22.4%


### Transformation 2  - Drop column. There is one empty column exctracted from HTML that has to be dropped.

In [119]:
# Drop empty column 
df_combined = df_combined.drop(columns = 1)

In [120]:
# Check the table
df_combined

Unnamed: 0,Year,0,2
0,1790,378787,—
1,1800,422845,11.6%
2,1810,472040,11.6%
3,1820,523287,10.9%
4,1830,610408,16.6%
5,1840,737699,20.9%
6,1850,994514,34.8%
7,1860,1231066,23.8%
8,1870,1457351,18.4%
9,1880,1783085,22.4%


### Transformation 3 - Drop bottom rows. Two bottom rows of the table were exctracted from HTML source but do not have any meaningful information.

In [123]:
# Remove last two rows
df_combined = df_combined.iloc[:-2]

In [126]:
df_combined

Unnamed: 0,Year,0,2
0,1790,378787,—
1,1800,422845,11.6%
2,1810,472040,11.6%
3,1820,523287,10.9%
4,1830,610408,16.6%
5,1840,737699,20.9%
6,1850,994514,34.8%
7,1860,1231066,23.8%
8,1870,1457351,18.4%
9,1880,1783085,22.4%


### Transformation 4 - Rename columns. Create column names mapping and rename the column with desired names

In [129]:
# Create a column mapping dictionary
column_mapping = {df_combined.columns[0]: column_names[0], df_combined.columns[1]: column_names[1],df_combined.columns[2]: column_names[2] }

In [131]:
# Rename the data frame using mapped column names
df_combined.rename(columns=column_mapping, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined.rename(columns=column_mapping, inplace=True)


In [132]:
# Check the data frame
df_combined

Unnamed: 0,Census_Year,Population,Percent change
0,1790,378787,—
1,1800,422845,11.6%
2,1810,472040,11.6%
3,1820,523287,10.9%
4,1830,610408,16.6%
5,1840,737699,20.9%
6,1850,994514,34.8%
7,1860,1231066,23.8%
8,1870,1457351,18.4%
9,1880,1783085,22.4%


###  Transformation 5 - Data frame slicing. Since this table does cover wide range of years we can sliced to only years of interest.

In [134]:
# Slice only time period that covers 2011-2019
df_combined_sliced = df_combined.loc[22:23]

In [135]:
# Check sliced data frame
df_combined_sliced

Unnamed: 0,Census_Year,Population,Percent change
22,2010,6547629,3.1%
23,2020,7029917,7.4%


In [136]:
# Save the data frame
df_combined_sliced.to_csv('Population_MA.csv')

## Web source 2 - FBI crime data

## 2010 crime

In [139]:
# Copy the address of the wikipedia page and store it as url variable
url = "https://ucr.fbi.gov/crime-in-the-u.s/2010/crime-in-the-u.s.-2010/tables/table-8/10tbl08ma.xls"

In [140]:
# Using requests package get the request from url
page = requests.get(url)

In [142]:
# Create a soup object from the requested html using BeaitifulSoup package 
soup = BeautifulSoup(page.text, 'html')

In [143]:
# Find the length of the list of tables
print(len(soup))

2


In [145]:
# Find specififc table corresponding to crime statistics
report_table = soup.find("table", attrs = {"class": 'data' })
report_table

<table border="0" cellpadding="0" cellspacing="0" class="data" summary="Efforts have been made to make this data table accessible for screen readers; however, if your reader has difficulty with this table, the Excel spreadsheet version is available. Access Key D will take you to the download area.">
<thead>
<tr>
<th class="even group0 alignleft valignmentbottom subthead1" colspan="1" headers="" id="cell40" rowspan="1" scope="col">
City</th>
<th class="odd group1 aligncenter valignmentbottom subthead2" colspan="1" headers="" id="cell41" rowspan="1" scope="col">
Population</th>
<th class="even group2 bold aligncenter valignmentbottom subthead1" colspan="1" headers="" id="cell42" rowspan="1" scope="col">
<b>Violent<br/>crime</b>
</th>
<th class="odd group3 aligncenter valignmentbottom subthead2" colspan="1" headers="" id="cell43" rowspan="1" scope="col">
Murder and<br/>nonnegligent<br/>manslaughter</th>
<th class="even group4 aligncenter valignmentbottom subthead1" colspan="1" headers="" 

In [39]:
# Exctract heraders from the table
header_row = report_table.find('thead').find('tr') if report_table.find('thead') else first_table.find('tr')
column_names = [th.text.strip() for th in header_row.find_all('th')]    

In [40]:
# Check headers
column_names

['City',
 'Population',
 'Violentcrime',
 'Murder andnonnegligentmanslaughter',
 'Forciblerape',
 'Robbery',
 'Aggravatedassault',
 'Propertycrime',
 'Burglary',
 'Larceny-theft',
 'Motorvehicletheft',
 'Arson1']

In [41]:
# Find all table rows in the table
report_rows  = report_table.find_all('tr')
report_rows

[<tr>
 <th class="even group0 alignleft valignmentbottom subthead1" colspan="1" headers="" id="cell40" rowspan="1" scope="col">
 City</th>
 <th class="odd group1 aligncenter valignmentbottom subthead2" colspan="1" headers="" id="cell41" rowspan="1" scope="col">
 Population</th>
 <th class="even group2 bold aligncenter valignmentbottom subthead1" colspan="1" headers="" id="cell42" rowspan="1" scope="col">
 <b>Violent<br/>crime</b>
 </th>
 <th class="odd group3 aligncenter valignmentbottom subthead2" colspan="1" headers="" id="cell43" rowspan="1" scope="col">
 Murder and<br/>nonnegligent<br/>manslaughter</th>
 <th class="even group4 aligncenter valignmentbottom subthead1" colspan="1" headers="" id="cell44" rowspan="1" scope="col">
 Forcible<br/>rape</th>
 <th class="odd group5 aligncenter valignmentbottom subthead2" colspan="1" headers="" id="cell45" rowspan="1" scope="col">
 Robbery</th>
 <th class="even group6 aligncenter valignmentbottom subthead1" colspan="1" headers="" id="cell46" r

In [42]:
# Using for loop exctract all data table rows
parsed_df1 = []
for rows in report_rows:
    row_data = rows.find_all('td')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        parsed_df1.append(line)

In [43]:
# Check first few rows of the list containing data from the HTML table
parsed_df1[0:10]

[['16,836', '46', '0', '2', '9', '35', '346', '112', '213', '21', '3'],
 ['21,090', '24', '1', '2', '1', '20', '217', '43', '167', '7', '0'],
 ['10,275', '21', '0', '2', '2', '17', '146', '61', '75', '10', '0'],
 ['8,120', '35', '0', '4', '1', '30', '249', '65', '175', '9', '2'],
 ['28,215', '47', '0', '6', '5', '36', '260', '75', '155', '30', '7'],
 ['16,549', '44', '0', '5', '0', '39', '266', '52', '201', '13', '2'],
 ['36,004', '81', '0', '20', '2', '59', '560', '274', '257', '29', '1'],
 ['33,505', '15', '0', '2', '4', '9', '331', '75', '239', '17', '3'],
 ['41,212', '96', '0', '0', '9', '87', '635', '154', '460', '21', '8'],
 ['6,050', '13', '0', '2', '0', '11', '83', '23', '56', '4', '1']]

In [44]:
# Column corresponding to the town names is stored under table header element. Parse from 'th' elements of the table
towns = []
for rows in report_rows:
    row_data = rows.find_all('th')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        towns.append(line)

In [45]:
# Check first few rows to verify city names list
towns[0:10]

[['City',
  'Population',
  'Violentcrime',
  'Murder andnonnegligentmanslaughter',
  'Forciblerape',
  'Robbery',
  'Aggravatedassault',
  'Propertycrime',
  'Burglary',
  'Larceny-theft',
  'Motorvehicletheft',
  'Arson1'],
 ['Abington'],
 ['Acton'],
 ['Acushnet'],
 ['Adams'],
 ['Agawam'],
 ['Amesbury'],
 ['Amherst'],
 ['Andover'],
 ['Arlington']]

In [46]:
# Remove first row containing the table hraders
towns = towns [1:]

In [47]:
# Create a data frame containing town names
towns1 = pd.DataFrame(data=towns, columns=['City'])

In [48]:
# Check if data frame was created properly
towns1.head()

Unnamed: 0,City
0,Abington
1,Acton
2,Acushnet
3,Adams
4,Agawam


In [49]:
# Create the data frame containing the other information from the HTML
parsed_df1 = pd.DataFrame(data = parsed_df1)

In [50]:
# Check data frame
parsed_df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,16836,46,0,2,9,35,346,112,213,21,3
1,21090,24,1,2,1,20,217,43,167,7,0
2,10275,21,0,2,2,17,146,61,75,10,0
3,8120,35,0,4,1,30,249,65,175,9,2
4,28215,47,0,6,5,36,260,75,155,30,7


### Transformation 1 - Data frames concatenation. Combine town names data frame and the other infromation into one consolidated data frame using concat() methd from Pandas

In [51]:
# Combine both towns df and parsed df together
combined_report_2010 = pd.concat([towns1, parsed_df1], axis = 1)

In [52]:
# Check the result
combined_report_2010.head()

Unnamed: 0,City,0,1,2,3,4,5,6,7,8,9,10
0,Abington,16836,46,0,2,9,35,346,112,213,21,3
1,Acton,21090,24,1,2,1,20,217,43,167,7,0
2,Acushnet,10275,21,0,2,2,17,146,61,75,10,0
3,Adams,8120,35,0,4,1,30,249,65,175,9,2
4,Agawam,28215,47,0,6,5,36,260,75,155,30,7


### Transformation 2 - Rename columns. Create column mapping dictionary to map out each existing column name wiith desired name. Then apply rename() method with this dictionary

In [53]:
# Create dictionary for column names mapping
column_mapping = {combined_report_2010.columns[0]: column_names[0], 
                  combined_report_2010.columns[1]: column_names[1],
                 combined_report_2010.columns[2]: column_names[2],
                 combined_report_2010.columns[3]: column_names[3],
                 combined_report_2010.columns[4]: column_names[4],
                 combined_report_2010.columns[5]: column_names[5],
                 combined_report_2010.columns[6]: column_names[6],
                 combined_report_2010.columns[7]: column_names[7],
                 combined_report_2010.columns[8]: column_names[8],
                 combined_report_2010.columns[9]: column_names[9],
                 combined_report_2010.columns[10]: column_names[10],
                 combined_report_2010.columns[11]: column_names[11]}

In [54]:
# Rename column using mapped column names from the dictionary
combined_report_2010.rename(columns = column_mapping, inplace = True )

In [55]:
# Check the results
combined_report_2010.head()

Unnamed: 0,City,Population,Violentcrime,Murder andnonnegligentmanslaughter,Forciblerape,Robbery,Aggravatedassault,Propertycrime,Burglary,Larceny-theft,Motorvehicletheft,Arson1
0,Abington,16836,46,0,2,9,35,346,112,213,21,3
1,Acton,21090,24,1,2,1,20,217,43,167,7,0
2,Acushnet,10275,21,0,2,2,17,146,61,75,10,0
3,Adams,8120,35,0,4,1,30,249,65,175,9,2
4,Agawam,28215,47,0,6,5,36,260,75,155,30,7


## 2011 Crime

In [56]:
# Apply the same steps to exctract data frome 2011 crime report HTML source
url = "https://ucr.fbi.gov/crime-in-the-u.s/2011/crime-in-the-u.s.-2011/tables/table8statecuts/table_8_offenses_known_to_law_enforcement_massachusetts_by_city_2011.xls"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')
report_table = soup.find("table", attrs = {"class": 'data' })
report_rows  = report_table.find_all('tr')
parsed_df1 = []
for rows in report_rows:
    row_data = rows.find_all('td')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        parsed_df1.append(line)
parsed_df1 = pd.DataFrame(data = parsed_df1)
combined_report_2011 = pd.concat([towns1, parsed_df1], axis = 1)
combined_report_2011.rename(columns = column_mapping, inplace = True )
combined_report_2011.head()

Unnamed: 0,City,Population,Violentcrime,Murder andnonnegligentmanslaughter,Forciblerape,Robbery,Aggravatedassault,Propertycrime,Burglary,Larceny-theft,Motorvehicletheft,Arson1
0,Abington,16082,22,0,1,6,15,351,102,226,23,1
1,Acton,22058,33,0,0,2,31,201,23,173,5,0
2,Acushnet,10366,16,0,0,3,13,120,43,72,5,0
3,Adams,8537,26,0,6,1,19,200,65,124,11,0
4,Agawam,28611,41,0,5,3,33,199,55,114,30,5


## 2012 Crime

In [57]:
# Apply the same steps to exctract data frome 2012 crime report HTML source
url = "https://ucr.fbi.gov/crime-in-the-u.s/2012/crime-in-the-u.s.-2012/tables/8tabledatadecpdf/table-8-state-cuts/table_8_offenses_known_to_law_enforcement_by_massachuetts_by_city_2012.xls"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')
report_table = soup.find("table", attrs = {"class": 'data' })
report_rows  = report_table.find_all('tr')
parsed_df1 = []
for rows in report_rows:
    row_data = rows.find_all('td')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        parsed_df1.append(line)
parsed_df1 = pd.DataFrame(data = parsed_df1)
combined_report_2012 = pd.concat([towns1, parsed_df1], axis = 1)
combined_report_2012.rename(columns = column_mapping, inplace = True )
combined_report_2012.head()

Unnamed: 0,City,Population,Violentcrime,Murder andnonnegligentmanslaughter,Forciblerape,Robbery,Aggravatedassault,Propertycrime,Burglary,Larceny-theft,Motorvehicletheft,Arson1
0,Abington,16215,41,0,1,6,34,358,97,244,17,1
1,Acton,22349,23,0,4,1,18,241,37,202,2,0
2,Acushnet,10408,13,0,0,1,12,139,72,57,10,2
3,Adams,8508,41,0,11,4,26,189,49,135,5,3
4,Agawam,28708,34,0,5,5,24,272,108,140,24,0


## 2013 Crime

In [58]:
# Apply the same steps to exctract data frome 2013 crime report HTML source
url = "https://ucr.fbi.gov/crime-in-the-u.s/2013/crime-in-the-u.s.-2013/tables/table-8/table-8-state-cuts/table_8_offenses_known_to_law_enforcement_massachusetts_by_city_2013.xls"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')
report_table = soup.find("table", attrs = {"class": 'data' })
report_rows  = report_table.find_all('tr')
parsed_df1 = []
for rows in report_rows:
    row_data = rows.find_all('td')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        parsed_df1.append(line)
parsed_df1 = pd.DataFrame(data = parsed_df1)
combined_report_2013 = pd.concat([towns1, parsed_df1], axis = 1)
combined_report_2013.rename(columns = column_mapping, inplace = True )
combined_report_2013.head()

Unnamed: 0,City,Population,Violentcrime,Murder andnonnegligentmanslaughter,Forciblerape,Robbery,Aggravatedassault,Propertycrime,Burglary,Larceny-theft,Motorvehicletheft,Arson1,11
0,Abington,16121,48,0,5,,7,36,297,82,201,14,1
1,Acton,22871,8,0,1,,2,5,178,31,144,3,0
2,Acushnet,10362,19,0,2,,0,17,74,27,40,7,2
3,Adams,8352,21,0,3,,3,15,157,40,109,8,1
4,Agawam,28659,41,0,12,,3,26,270,105,130,35,1


## 2014 Crime

In [59]:
url = "https://ucr.fbi.gov/crime-in-the-u.s/2014/crime-in-the-u.s.-2014/tables/table-8/table-8-by-state/Table_8_Offenses_Known_to_Law_Enforcement_by_Massachusetts_by_City_2014.xls"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')
report_table = soup.find("table", attrs = {"class": 'data' })
report_rows  = report_table.find_all('tr')
parsed_df1 = []
for rows in report_rows:
    row_data = rows.find_all('td')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        parsed_df1.append(line)
parsed_df1 = pd.DataFrame(data = parsed_df1)
combined_report_2014 = pd.concat([towns1, parsed_df1], axis = 1)
combined_report_2014.rename(columns = column_mapping, inplace = True )
combined_report_2014.head()

Unnamed: 0,City,Population,Violentcrime,Murder andnonnegligentmanslaughter,Forciblerape,Robbery,Aggravatedassault,Propertycrime,Burglary,Larceny-theft,Motorvehicletheft,Arson1,11
0,Abington,16200,28,0,4,,6,18,338,56,272,10,1
1,Acton,23193,17,0,2,,0,15,179,27,147,5,0
2,Acushnet,10392,23,0,3,,1,19,91,36,50,5,1
3,Adams,8296,30,0,7,,1,22,120,37,78,5,3
4,Agawam,38977,69,0,21,,7,41,284,95,171,18,4


## 2015 Crime

In [60]:
# Apply the same steps to exctract data frome 2015 crime report HTML source
url = "https://ucr.fbi.gov/crime-in-the-u.s/2015/crime-in-the-u.s.-2015/tables/table-8/table-8-state-pieces/table_8_offenses_known_to_law_enforcement_massachusetts_by_city_2015.xls"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')
report_table = soup.find("table", attrs = {"class": 'data' })
report_rows  = report_table.find_all('tr')
parsed_df1 = []
for rows in report_rows:
    row_data = rows.find_all('td')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        parsed_df1.append(line)
parsed_df1 = pd.DataFrame(data = parsed_df1)
combined_report_2015 = pd.concat([towns1, parsed_df1], axis = 1)
combined_report_2015.rename(columns = column_mapping, inplace = True )
combined_report_2015.head()

Unnamed: 0,City,Population,Violentcrime,Murder andnonnegligentmanslaughter,Forciblerape,Robbery,Aggravatedassault,Propertycrime,Burglary,Larceny-theft,Motorvehicletheft,Arson1,11
0,Abington,16252,28,0,2,,3,23,293,52,228,13,0
1,Acton,23573,23,0,2,,1,20,224,61,156,7,0
2,Acushnet,10441,28,0,2,,0,26,101,32,60,9,4
3,Adams,8221,31,0,4,,1,26,150,33,110,7,3
4,Agawam,28853,76,0,13,,5,58,521,193,294,34,1


## 2016 Crime - WAS NOT REPORTED FOR MA

## 2017 Crime

In [61]:
# Apply the same steps to exctract data frome 2017 crime report HTML source
url = "https://ucr.fbi.gov/crime-in-the-u.s/2017/crime-in-the-u.s.-2017/tables/table-8/table-8-state-cuts/massachusetts.xls"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')
report_table = soup.find("table", attrs = {"class": 'data' })
report_rows  = report_table.find_all('tr')
parsed_df1 = []
for rows in report_rows:
    row_data = rows.find_all('td')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        parsed_df1.append(line)
parsed_df1 = pd.DataFrame(data = parsed_df1)
combined_report_2017 = pd.concat([towns1, parsed_df1], axis = 1)
combined_report_2017.rename(columns = column_mapping, inplace = True )
combined_report_2017.head()

Unnamed: 0,City,Population,Violentcrime,Murder andnonnegligentmanslaughter,Forciblerape,Robbery,Aggravatedassault,Propertycrime,Burglary,Larceny-theft,Motorvehicletheft,Arson1
0,Abington,16369,33,0,4,6,23,158,26,120,12,0
1,Acton,23937,23,0,1,1,21,170,33,134,3,1
2,Acushnet,10547,32,0,4,3,25,101,28,62,11,0
3,Adams,8051,20,0,10,1,9,114,46,60,8,0
4,Agawam,28839,66,0,3,9,54,519,178,313,28,3


## 2018 Crime

In [62]:
# Apply the same steps to exctract data frome 2018 crime report HTML source
url = "https://ucr.fbi.gov/crime-in-the-u.s/2018/crime-in-the-u.s.-2018/tables/table-8/table-8-state-cuts/massachusetts.xls"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')
report_table = soup.find("table", attrs = {"class": 'data' })
report_rows  = report_table.find_all('tr')
parsed_df1 = []
for rows in report_rows:
    row_data = rows.find_all('td')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        parsed_df1.append(line)
parsed_df1 = pd.DataFrame(data = parsed_df1)
combined_report_2018 = pd.concat([towns1, parsed_df1], axis = 1)
combined_report_2018.rename(columns = column_mapping, inplace = True )
combined_report_2018.head()

Unnamed: 0,City,Population,Violentcrime,Murder andnonnegligentmanslaughter,Forciblerape,Robbery,Aggravatedassault,Propertycrime,Burglary,Larceny-theft,Motorvehicletheft,Arson1
0,Abington,16443,41,0,5,3,33,147,18,107,22,0
1,Acton,24038,26,1,7,2,16,156,27,125,4,0
2,Acushnet,10576,19,0,5,1,13,69,21,42,6,0
3,Adams,8036,23,1,1,2,19,96,32,60,4,1
4,Agawam,28955,105,0,14,12,79,428,142,268,18,0


## 2019 Crime

In [63]:
# Apply the same steps to exctract data frome 2019 crime report HTML source
url = "https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/tables/table-8/table-8-state-cuts/massachusetts.xls"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')
report_table = soup.find("table", attrs = {"class": 'data' })
report_rows  = report_table.find_all('tr')
parsed_df1 = []
for rows in report_rows:
    row_data = rows.find_all('td')
    if row_data:
        line = [data.get_text(strip=True) for data in row_data]
        parsed_df1.append(line)
parsed_df1 = pd.DataFrame(data = parsed_df1)
combined_report_2019 = pd.concat([towns1, parsed_df1], axis = 1)
combined_report_2019.rename(columns = column_mapping, inplace = True )
combined_report_2019.head()

Unnamed: 0,City,Population,Violentcrime,Murder andnonnegligentmanslaughter,Forciblerape,Robbery,Aggravatedassault,Propertycrime,Burglary,Larceny-theft,Motorvehicletheft,Arson1
0,Abington,16448,23,4,5,3,11,153,23,122,8,1
1,Acton,23780,32,0,6,2,24,66,13,50,3,0
2,Acushnet,10533,12,0,5,0,7,35,14,19,2,0
3,Adams,8028,26,0,10,2,14,94,34,59,1,2
4,Agawam,28736,82,0,13,8,61,376,133,228,15,1


### Transformation 3 - Drop unnecessary columns. Population column is not necessary for this study it can be removed from the data frame using drop() method from pandas.

In [64]:
# Drop population column from all data frames 
combined_report_2010.drop('Population', axis = 1, inplace = True)
combined_report_2011.drop('Population', axis = 1, inplace = True)
combined_report_2012.drop('Population', axis = 1, inplace = True)
combined_report_2013.drop('Population', axis = 1, inplace = True)
combined_report_2014.drop('Population', axis = 1, inplace = True)
combined_report_2015.drop('Population', axis = 1, inplace = True)
combined_report_2017.drop('Population', axis = 1, inplace = True)
combined_report_2018.drop('Population', axis = 1, inplace = True)
combined_report_2019.drop('Population', axis = 1, inplace = True)

### Transformation 4 -  Find and remove missing values. Find if there are anyh missing values and apply dropna to each data frame

In [65]:
# Check if there are any missing values in 2017 crime table
combined_report_2017.isnull().sum()

City                                  0
Violentcrime                          2
Murder andnonnegligentmanslaughter    2
Forciblerape                          2
Robbery                               2
Aggravatedassault                     2
Propertycrime                         2
Burglary                              2
Larceny-theft                         2
Motorvehicletheft                     2
Arson1                                2
dtype: int64

#### Each column has missing values

In [66]:
# Drop rows with missing values from all reports
combined_report_2010 =combined_report_2010.dropna()
combined_report_2011 =combined_report_2011.dropna()
combined_report_2012 =combined_report_2012.dropna()
combined_report_2013 =combined_report_2013.dropna()
combined_report_2014 =combined_report_2014.dropna()
combined_report_2015 =combined_report_2015.dropna()
combined_report_2017 =combined_report_2017.dropna()
combined_report_2018 =combined_report_2018.dropna()
combined_report_2019 =combined_report_2019.dropna()

In [67]:
# Check the report for 2017 to verify there is no missing values
combined_report_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 274 entries, 0 to 273
Data columns (total 11 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   City                                274 non-null    object
 1   Violentcrime                        274 non-null    object
 2   Murder andnonnegligentmanslaughter  274 non-null    object
 3   Forciblerape                        274 non-null    object
 4   Robbery                             274 non-null    object
 5   Aggravatedassault                   274 non-null    object
 6   Propertycrime                       274 non-null    object
 7   Burglary                            274 non-null    object
 8   Larceny-theft                       274 non-null    object
 9   Motorvehicletheft                   274 non-null    object
 10  Arson1                              274 non-null    object
dtypes: object(11)
memory usage: 25.7+ KB


### Transformation 5 - Combine all data frames into one. This step will allow to easily manipulate and summarize total number of crimes for each year in each city.

In [114]:
year = ['2010', '2011', '2012', '2013', '2014', '2015', '2017', '2018', '2019']

In [68]:
# Create a list of all crime reports
dataframes_list = [combined_report_2010, combined_report_2011, combined_report_2012, combined_report_2013, combined_report_2014, combined_report_2015, combined_report_2017, combined_report_2018, combined_report_2019]


In [127]:
# Combine all data frames together and label columns for each year
combined_df = pd.concat(dataframes_list, keys=['2010', '2011', '2012', '2013', '2014', '2015', '2017', '2018', '2019'], axis=0)


In [128]:
# check the results
combined_df.head()

Unnamed: 0,Unnamed: 1,City,Violentcrime,Murder andnonnegligentmanslaughter,Forciblerape,Robbery,Aggravatedassault,Propertycrime,Burglary,Larceny-theft,Motorvehicletheft,Arson1,11
2010,0,Abington,46,0,2,9,35,346,112,213,21,3,
2010,1,Acton,24,1,2,1,20,217,43,167,7,0,
2010,2,Acushnet,21,0,2,2,17,146,61,75,10,0,
2010,3,Adams,35,0,4,1,30,249,65,175,9,2,
2010,4,Agawam,47,0,6,5,36,260,75,155,30,7,


In [129]:
combined_df.to_csv('MA_Crime_reports_2010-2019.csv')