# FINAL PROJECT PHASE II
by Reed Walker, Harrison Melton

In [2]:
import csv
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

## Downloaded Dataset Requirement

In [30]:
'''
Source: https://www.ngdc.noaa.gov/hazard/eq-intensity.shtml
This data set is a replacement for our old dataset about political affiliation.
It is a dataset of earthquakes.
We are looking to see if there is a relationship between ufo sightings and earthquakes in the area.
Size: 157,016 rows, 18 columns, 12.4MB
'''

def data_parser():
    """Clean the downloaded earthquake spreadsheet and export it as CSV.

    Reads ``equake.xlsx`` from the working directory, removes the columns
    listed in ``unused_columns``, drops every row that still has a missing
    value, orders the rows by ``STATE`` and then ``YEAR``, filters out the
    rows whose ``COUNTRY`` is ``"CANADA"``, and writes the result to
    ``qr1.csv`` without the index column.

    Returns:
        pandas.DataFrame: the cleaned earthquake records.
    """
    # Columns that are removed before any other cleaning step
    unused_columns = [
        'HOUR', 'MINUTE', 'SECOND', 'LOCAL_TO_UTC', 'UNPUB_OR_GROUPED_INT',
        'LATITUDE', 'LONGITUDE', 'EQ_DEPTH', 'EPIDIST', 'CITY_LAT',
        'CITY_LON', 'MMI', 'SOURCE',
    ]

    quakes = pd.read_excel('equake.xlsx')

    # CLEAN - drop unnecessary columns, then null rows, then sort by state and year
    cleaned = (
        quakes
        .drop(columns=unused_columns)
        .dropna()
        .sort_values(by=["STATE", "YEAR"])
    )

    # CLEAN - keep only the rows that are not from Canada
    cleaned = cleaned[cleaned["COUNTRY"] != "CANADA"]

    # export to csv
    cleaned.to_csv('qr1.csv', index=False)

    return cleaned

############ Function Call ############
# Runs the earthquake cleaning step (reads equake.xlsx, writes qr1.csv);
# the returned DataFrame is what the notebook displays as this cell's output.
data_parser()

Unnamed: 0,YEAR,MONTH,DAY,MAGNITUDE,STATE,CITY,COUNTRY
3759,1899,9.0,10.0,7.8,AK,CAPE YAKATAGA,USA
4035,1899,9.0,10.0,8.6,AK,YAKUTAT BAY,USA
4036,1899,9.0,10.0,8.6,AK,SKAGWAY,USA
4037,1899,9.0,10.0,8.6,AK,BARTLETT BAY,USA
4038,1899,9.0,10.0,8.6,AK,DUNDAS BAY,USA
...,...,...,...,...,...,...,...
135006,1985,11.0,17.0,3.5,WY,OLD FAITHFUL,USA
135007,1985,11.0,17.0,3.1,WY,OLD FAITHFUL,USA
135008,1985,11.0,19.0,2.8,WY,MADISON JUNCTION,USA
135016,1985,11.0,20.0,3.1,WY,MADISON JUNCTION,USA


## Web Collection Requirement \#1


In [21]:
def api_parser():
    """Collect per-state drug-arrest totals from the FBI crime data API.

    Loads the state abbreviations from ``state_data.json``, requests the
    ``drug-grand-total`` arrest series for 2000-2019 for every state except
    Florida, and stores ``male_count + female_count`` under each record's
    ``data_year``.  The resulting dictionary is written to ``drug_data.json``
    and returned.

    Returns:
        dict: state abbreviation -> {data_year: male_count + female_count}.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
        requests.Timeout: if a request takes longer than the timeout.
    """
    # 1 - Base URL
    url = 'https://api.usa.gov/crime/fbi/sapi/'  # source

    # 2 - Request Parameters
    offense = 'drug-grand-total'
    variable = 'monthly'
    since = '2000'
    until = '2019'

    # 3 - API Key
    # NOTE(review): this credential is committed in plain text; consider
    # rotating it and loading it from configuration instead.
    key = r'?API_KEY=gw18Vys2MaENl190DI6O0I6bKYW0IaujhZK4bHYE'

    with open('state_data.json', 'r') as state_data:
        state_dict = json.load(state_data)

    # 4 - CLEAN: Florida is excluded because of its many missing values.
    # Dropping it before the loop avoids a request whose result would be
    # discarded, and the default keeps this from failing if 'FL' is absent.
    state_dict.pop('FL', None)

    for state, yearly_totals in state_dict.items():
        endpoint = f'api/data/arrest/states/{state}/{offense}/{variable}/{since}/{until}'
        r = requests.get(url + endpoint + key, timeout=30)
        r.raise_for_status()  # fail loudly instead of parsing an error payload

        # each item (dict) in 'results' corresponds to a year
        for year in r.json()['results']:
            yearly_totals[year['data_year']] = year['male_count'] + year['female_count']

    with open('drug_data.json', 'w') as drug_data:
        json.dump(state_dict, drug_data, indent=4)

    return state_dict

############ Function Call ############
# Runs the API collection step (reads state_data.json, writes drug_data.json);
# the returned dictionary is what the notebook displays as this cell's output.
api_parser()

{'AL': {2000: 12292,
  2001: 12932,
  2002: 14100,
  2003: 16133,
  2004: 18315,
  2005: 15704,
  2006: 14860,
  2007: 17309,
  2008: 16488,
  2009: 15180,
  2010: 8213,
  2011: 300,
  2012: 306,
  2013: 325,
  2014: 220,
  2015: 9411,
  2016: 10327,
  2017: 10541,
  2018: 10349,
  2019: 378},
 'AK': {2000: 1763,
  2001: 1592,
  2002: 1643,
  2003: 2132,
  2004: 1755,
  2005: 1885,
  2006: 1701,
  2007: 1759,
  2008: 1725,
  2009: 2060,
  2010: 2493,
  2011: 2349,
  2012: 1547,
  2013: 1475,
  2014: 1159,
  2015: 1212,
  2016: 1006,
  2017: 1004,
  2018: 1046,
  2019: 930},
 'AZ': {2000: 28613,
  2001: 30167,
  2002: 29278,
  2003: 32908,
  2004: 36502,
  2005: 36571,
  2006: 36498,
  2007: 36252,
  2008: 35382,
  2009: 35270,
  2010: 31533,
  2011: 30992,
  2012: 28686,
  2013: 31041,
  2014: 30222,
  2015: 25333,
  2016: 31044,
  2017: 35728,
  2018: 33306,
  2019: 30825},
 'AR': {2000: 13809,
  2001: 11500,
  2002: 9054,
  2003: 12047,
  2004: 12838,
  2005: 14848,
  2006: 15392,
  

## Web Collection Requirement \#2

In [27]:
def _clean_sighting(cells):
    """Return ``(time, city, shape)`` for a usable report row, else ``None``.

    ``cells`` is the list of ``<td>`` elements of one table row.  A row is
    rejected when it has fewer than seven cells, when any field is empty,
    when the date/time cell does not split into at least two parts, when
    the shape is "unknown", or when the duration contains no digit.
    """
    # Rows without the full set of columns cannot be indexed safely
    if len(cells) < 7:
        return None

    # split the time cell into its whitespace-separated parts (date, time)
    when = tuple(cells[0].text.split())
    if len(when) < 2:
        return None

    city, state, shape, duration, summary, posted = (
        cell.text.strip() for cell in cells[1:7]
    )

    # every field, including the ones that are not stored, must be non-empty
    if not all((city, state, shape, duration, summary, posted)):
        return None

    # check to see if the shape is known
    if shape.lower() == "unknown":
        return None

    # check to see if duration contains a number
    if not any(map(str.isdigit, duration)):
        return None

    return (when, city, shape)


def html_parser():
    """Scrape the NUFORC per-state report index pages for clean sightings.

    Loads the state abbreviations from ``state_data.json``, downloads the
    report index page of every state, skips the first table row of each
    page, and keeps ``(time, city, shape)`` for every remaining row that
    passes the checks in ``_clean_sighting``.  The result is written to
    ``ufo_data.json`` and returned.

    Returns:
        dict: state abbreviation -> list of ``(time, city, shape)`` tuples,
        where ``time`` is itself a tuple of the split date/time cell.

    Raises:
        requests.HTTPError: if a page answers with an error status code.
        requests.Timeout: if a request takes longer than the timeout.
    """
    with open('state_data.json', 'r') as state_data:
        ufo_dict = {abbr: [] for abbr in json.load(state_data)}

    for state_abbr, sightings in ufo_dict.items():
        url = f'http://www.nuforc.org/webreports/ndxl{state_abbr}.html'  # source
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # do not silently record an empty state on an error page
        soup = BeautifulSoup(r.text, 'html.parser')

        # the first <tr> is skipped, as in the table's header position
        for post in soup.find_all("tr")[1:]:
            sighting = _clean_sighting(post.find_all("td"))
            if sighting is not None:
                sightings.append(sighting)

    with open('ufo_data.json', 'w') as ufo_data:
        json.dump(ufo_dict, ufo_data, indent=4)

    return ufo_dict

############ Function Call ############
# Runs the HTML scraping step (reads state_data.json, writes ufo_data.json);
# the returned dictionary is what the notebook displays as this cell's output.
html_parser()

{'AL': [(('5/16/21', '09:54'), 'Vincent', 'Light'),
  (('5/9/21', '23:30'), 'Mobile', 'Circle'),
  (('4/27/21', '10:22'), 'Gulf Shores', 'Sphere'),
  (('4/11/21', '20:35'), 'Altoona', 'Other'),
  (('3/5/21', '23:00'), 'Bessemer', 'Triangle'),
  (('2/20/21', '10:15'), 'Fort Payne', 'Cylinder'),
  (('2/13/21', '19:53'), 'Citronelle', 'Other'),
  (('2/9/21', '19:30'), 'Silas', 'Sphere'),
  (('12/26/20', '18:20'), 'Decatur', 'Light'),
  (('12/1/20', '17:50'), 'Birmingham', 'Light'),
  (('11/30/20', '00:40'), 'Huntsville', 'Light'),
  (('10/16/20', '21:35'), 'Billingsley', 'Circle'),
  (('10/13/20', '20:30'), 'robertsdale', 'Changing'),
  (('10/8/20', '05:20'), 'Baileyton', 'Light'),
  (('10/1/20', '05:00'), 'Moundville', 'Changing'),
  (('9/28/20', '15:00'), 'Athens', 'Other'),
  (('9/26/20', '00:34'), 'Huntsville', 'Light'),
  (('9/11/20', '20:20'), 'Huntsville', 'Light'),
  (('9/4/20', '21:30'), 'Gulf Shores', 'Light'),
  (('8/19/20', '22:00'), 'Fort Morgan', 'Circle'),
  (('8/19/20', '1

## Additional Dataset Parsing/Cleaning Functions

Write any supplemental (optional) functions here.

In [8]:
# This must be run before the web-based functions, which read state_data.json
def create_state_dict():
    """Build the dictionary of state abbreviations and save it as JSON.

    Downloads the SSA table of states and abbreviations, takes the text of
    the second ``<td>`` of every table row as an abbreviation, leaves out
    the six non-state entries, and maps every remaining abbreviation to an
    empty dictionary.  The result is written to ``state_data.json`` and
    returned.

    Returns:
        dict: state abbreviation -> empty dict.

    Raises:
        requests.HTTPError: if the page answers with an error status code.
        requests.Timeout: if the request takes longer than the timeout.
    """
    # This website has a list of states and their abbreviations
    statesurl = 'https://www.ssa.gov/international/coc-docs/states.html'  # source
    r = requests.get(statesurl, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # CLEAN - entries of the table that are not states
    non_states = {
        "AS",  # American Samoa
        "DC",  # District of Columbia
        "GU",  # Guam
        "MP",  # Northern Mariana IS
        "PR",  # Puerto Rico
        "VI",  # Virgin Islands
    }

    # make into a dictionary, keeping the order of the table
    state_dict = {}
    for item in soup.find_all('tr'):
        info = item.find_all('td')
        if len(info) < 2:
            continue  # rows without an abbreviation cell (e.g. a header row)
        abbreviation = info[1].text.strip()
        if abbreviation not in non_states:
            state_dict[abbreviation] = {}

    # export to json
    with open('state_data.json', 'w') as state_data:
        json.dump(state_dict, state_data, indent=4)

    return state_dict
    
############ Function Call ############
# Builds and saves state_data.json; the returned dictionary is what the
# notebook displays as this cell's output.
create_state_dict()

{'AL': {},
 'AK': {},
 'AZ': {},
 'AR': {},
 'CA': {},
 'CO': {},
 'CT': {},
 'DE': {},
 'FL': {},
 'GA': {},
 'HI': {},
 'ID': {},
 'IL': {},
 'IN': {},
 'IA': {},
 'KS': {},
 'KY': {},
 'LA': {},
 'ME': {},
 'MD': {},
 'MA': {},
 'MI': {},
 'MN': {},
 'MS': {},
 'MO': {},
 'MT': {},
 'NE': {},
 'NV': {},
 'NH': {},
 'NJ': {},
 'NM': {},
 'NY': {},
 'NC': {},
 'ND': {},
 'OH': {},
 'OK': {},
 'OR': {},
 'PA': {},
 'RI': {},
 'SC': {},
 'SD': {},
 'TN': {},
 'TX': {},
 'UT': {},
 'VT': {},
 'VA': {},
 'WA': {},
 'WV': {},
 'WI': {},
 'WY': {}}

In [32]:
'''
Source: https://dataverse.harvard.edu/file.xhtml?fileId=4299753&version=6.0
'''

def political_parser():
    """Trim the election results file and export it as CSV.

    Reads ``elections.csv`` from the working directory, removes the columns
    listed in ``unused_columns``, drops every row that still has a missing
    value, and writes the result to ``party_affiliation.csv`` without the
    index column.

    Returns:
        pandas.DataFrame: the trimmed election records.
    """
    # Columns that are removed before the null check
    unused_columns = [
        'state', 'state_fips', 'state_cen', 'state_ic', 'office',
        'candidate', 'writein', 'version', 'notes',
    ]

    elections = pd.read_csv('elections.csv')
    trimmed = elections.drop(columns=unused_columns).dropna()
    trimmed.to_csv('party_affiliation.csv', index=False)

    return trimmed
    
############ Function Call ############
# Runs the election cleaning step (reads elections.csv, writes
# party_affiliation.csv); the returned DataFrame is this cell's output.
political_parser()

Unnamed: 0,year,state_po,party_detailed,candidatevotes,totalvotes,party_simplified
0,1976,AL,DEMOCRAT,659170,1182850,DEMOCRAT
1,1976,AL,REPUBLICAN,504070,1182850,REPUBLICAN
2,1976,AL,AMERICAN INDEPENDENT PARTY,9198,1182850,OTHER
3,1976,AL,PROHIBITION,6669,1182850,OTHER
4,1976,AL,COMMUNIST PARTY USE,1954,1182850,OTHER
...,...,...,...,...,...,...
4278,2020,WI,INDEPENDENT,52,3298041,OTHER
4280,2020,WY,DEMOCRAT,73491,278503,DEMOCRAT
4281,2020,WY,REPUBLICAN,193559,278503,REPUBLICAN
4282,2020,WY,LIBERTARIAN,5768,278503,LIBERTARIAN


# Inconsistencies
For each inconsistency (NaN, null, duplicate values, empty strings, etc.) you discover in your datasets, write at least 2 sentences stating the significance, how you identified it, and how you handled it.

1. null values - earthquake data (downloaded)
Once the unused columns were removed, we used the 'dropna' function of pandas to automatically get rid of the rows with missing values. The fields that were missing in some rows, such as 'month', are necessary to match the earthquake data with the occurrence of UFO sightings in the same month.

2. null values - drug crime data (api)
We had to remove Florida from the list of states as it had null values for the overwhelming majority of years between '00 and '19. We identified this problem by plotting the data on a choropleth map using plotly and seeing that it was skewing the rest of our data. Since the data was already compiled into a dictionary with states as the keys, all we had to do was '.pop()' Florida out of it.

3. weak data - ufo sightings (html)
We wanted to make sure that we only had strong reports of UFO sightings to boost the credibility of our findings. To this end, we checked whether the shape was 'unknown' and whether the listed duration contained a concrete number. The shape verification was a simple string comparison, and we verified the duration by checking whether any of its characters were digits.

4. null values - ufo sightings (html) 
For this, we checked even the unused columns for null values to again boost the credibility of our findings. I thought the way we did this was kinda cool: we multiplied the boolean values of each attribute together with our 'weak data' boolean (#3) to check all of them at once. The weak data and null values were apparent by just glancing over the unaltered file.

5. extra rows - state abbreviations (extra no. 1)
As the state abbreviations list would be used to iterate through both the api and the webpages we were pulling info from, we had to remove any extra abbreviations from the list. After finding that the data listed 56 'states' instead of 50, we just used '.remove()' to get rid of the extra 6. 
