In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

# **Read data**

In [2]:
# Read shark attack dataset
shark_attacks = pd.read_csv('/content/drive/MyDrive/OefendataWinc/SharkAttacks.csv', 
                            encoding='unicode_escape')

display(shark_attacks)
display(shark_attacks.columns)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25718,,,,,,,,,,,...,,,,,,,,,,
25719,,,,,,,,,,,...,,,,,,,,,,
25720,,,,,,,,,,,...,,,,,,,,,,
25721,,,,,,,,,,,...,,,,,,,,,,


Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

# **Data preparation**

In [3]:
# Functions for detecting missing values
def print_separator(sep, num, msg):
  print('\n')
  print(sep * num)
  print(f'{msg}')
  print(sep * num)


# Tactic 1: Looking at all the unique values
def look_at_unique_values(column):
  unique_values_cutoff = 50
  unique_values = column.unique()
  num_unique_values = len(unique_values)
  if num_unique_values == len(column):
    print(f'Each value in the column is unique (total: {num_unique_values})')
  elif num_unique_values < len(column):
    print(f'Less than {len(column)} unique values (total: {num_unique_values})')
    # We may get an error when sorting
    try:
      sorted = np.sort(unique_values)
      print('Values are sorted')
      display(list(sorted))
    except:
      print('Could not sort values')
      display(list(unique_values))
    else:
      print(f"More than {unique_values_cutoff} unique values (total: {num_unique_values})")


# Tactic 2: Sorting and looking at the edges
def look_at_edges(df, column_name):
  def show_head_and_tail(values):
      num_items_to_slice = 10
      display(list(values)[:num_items_to_slice])
      display(list(values)[-num_items_to_slice:])

  column = df[column_name]
  unique_values = column.unique()
  try:
      sorted = np.sort(unique_values)
      print('Unique values sorted, head and tail:')
      show_head_and_tail(sorted)
  except TypeError as error:
      print(f'Could not sort values: {error}')
      print("..so let's try filtering NULL values and then sorting")
      non_null_uniques = df.loc[~df[column_name].isnull(), column_name].unique()
      sorted = np.sort(non_null_uniques)
      show_head_and_tail(sorted)


# Tactic 3: Casting to a type
def cast_to_type(column, data_type_column):
  try:
    column.astype(data_type_column)
    print(f'Casting to {data_type_column} was successful')
  except ValueError as error:
    print(f'Could not cast to {data_type_column}: {error}')


# Find non-default missing values in the dataset
def find_non_default_missing_values(df, column_name, data_type_column):
  long_separator_amount = 80
  short_separator_amount = 40

  print_separator('*', long_separator_amount, f'Finding non-default missing values for column \'{column_name}\'')

  print(f'Column \'{column_name}\' has datatype: {df.dtypes[column_name]}')

  column = df[column_name]  

  # A Looking at all the unique values
  print_separator('-', short_separator_amount, 'A: Looking at unique values')
  look_at_unique_values(column)

  # B Sorting and looking at the edges
  print_separator('-', short_separator_amount, 'B: Sorting and looking at the edges')
  look_at_edges(df, column_name)

  # C Casting to a type
  print_separator('-', short_separator_amount, f'C: Casting to type: {data_type_column}')
  cast_to_type(column, data_type_column)

  # D Looking at the frequency
  print_separator('-', short_separator_amount, 'D: Looking at frequency')
  display(column.value_counts(dropna=False))

  print('\n')


# Replace values
def replace_value(df, column_name, missing_old, missing_new):
  df[column_name] = df[column_name].replace({missing_old: missing_new})

In [4]:
# Remove unnecessary columns
shark_attacks.drop(columns=['Case Number', 'Name', 'Time', 'Investigator or Source', 'pdf', 
                            'href formula', 'href', 'Case Number.1', 'Case Number.2', 
                            'original order', 'Unnamed: 22', 'Unnamed: 23'], 
                   inplace=True)

# Rename columns
shark_attacks.rename(columns={'Sex ' : 'Sex_Victim', 'Fatal (Y/N)' : 'Fatal_(Y/N)', 
                              'Species ' : 'Species'}, 
                     inplace=True)

display(shark_attacks)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex_Victim,Age,Injury,Fatal_(Y/N),Species
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,White shark
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m"
...,...,...,...,...,...,...,...,...,...,...,...,...
25718,,,,,,,,,,,,
25719,,,,,,,,,,,,
25720,,,,,,,,,,,,
25721,,,,,,,,,,,,


In [5]:
# Remove rows with only NaN values
shark_attacks.dropna(how='all', inplace=True)

display(shark_attacks)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex_Victim,Age,Injury,Fatal_(Y/N),Species
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,White shark
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,
2,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m"
...,...,...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,,FATAL,Y,
6298,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,,FATAL,Y,
6299,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,,FATAL,Y,
6300,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,,FATAL,Y,


In [6]:
# Remove invalid cases from dataset, because these don't count as a shark attack
shark_attacks = shark_attacks[shark_attacks['Type'] != 'Invalid']

display(shark_attacks)

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex_Victim,Age,Injury,Fatal_(Y/N),Species
0,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,White shark
1,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,
3,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark
4,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m"
5,03-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,M,,"No injury, board bitten",N,
...,...,...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,,FATAL,Y,
6298,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,,FATAL,Y,
6299,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,,FATAL,Y,
6300,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,,FATAL,Y,


In [7]:
print(shark_attacks.dtypes)
print(shark_attacks.columns)

Date            object
Year           float64
Type            object
Country         object
Area            object
Location        object
Activity        object
Sex_Victim      object
Age             object
Injury          object
Fatal_(Y/N)     object
Species         object
dtype: object
Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Sex_Victim', 'Age', 'Injury', 'Fatal_(Y/N)', 'Species'],
      dtype='object')


In [8]:
# Check Date column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Date', 'string')

# All values are OK



********************************************************************************
Finding non-default missing values for column 'Date'
********************************************************************************
Column 'Date' has datatype: object


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 4979)
Values are sorted


['    10-Jan-2009',
 '    15-Jun-1937',
 '    16-Jan-1970',
 '    22-Jul-2013',
 '   21-Sep-1908',
 '  03-Feb-1914',
 '  05-Oct-1985',
 '  10-Jan-1903',
 '  16-Feb-1910',
 '  19-Jul-1889',
 '  21-Jun-1934',
 '  24-Mar-1990',
 '  25-Jun-1982',
 '  28-Jan-1877',
 '  28-Jan-1900',
 '  29-Jul-2013',
 '  29-Oct-2011',
 '  30-Jul-2013',
 '  31-Jul-2013',
 '  Reported 31-Jul-1958',
 ' 01-Dec-1979',
 ' 01-Sep-2013',
 ' 02-Sep-2013',
 ' 04-Sep-2010',
 ' 05-Aug-2013',
 ' 07-Apr-1877',
 ' 07-Sep-2013',
 ' 08-Aug-1890',
 ' 08-Jul-1958',
 ' 11-Jan-1896',
 ' 11-Mar-1877',
 ' 12-Sep-2013',
 ' 13-Aug-2013',
 ' 13-Jan-1999',
 ' 13-Sep-2010',
 ' 14-Aug-2013',
 ' 14-Sep-2013',
 ' 15-Feb-1988',
 ' 18-Aug-2013',
 ' 18-Nov-1948',
 ' 19-Aug-1993',
 ' 19-Feb-2016',
 ' 19-Jul-2004 Reported to have happened  "on the weekend"',
 ' 2-Jul-1997',
 ' 21-Sep-2013',
 ' 22-Jun-1956',
 ' 22-Sep-1879',
 ' 24-Aug-1916',
 ' 25-Aug-2013',
 ' 25-Sep-2013',
 ' 27-Mar-2010',
 ' Jan-1970',
 ' Jul-1898',
 '"Anniversary Day" 22-J

More than 50 unique values (total: 4979)


----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Unique values sorted, head and tail:


['    10-Jan-2009',
 '    15-Jun-1937',
 '    16-Jan-1970',
 '    22-Jul-2013',
 '   21-Sep-1908',
 '  03-Feb-1914',
 '  05-Oct-1985',
 '  10-Jan-1903',
 '  16-Feb-1910',
 '  19-Jul-1889']

['Summer of 1898',
 'Summer of 1903',
 'Summer of 1926',
 'Summer of 1959',
 'Summer of 1981',
 'Summer-2008',
 'Winter 1942',
 'Winter 1969',
 'Woirld War II',
 'World War II']



----------------------------------------
C: Casting to type: string
----------------------------------------
Casting to string was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


1957           9
1942           9
1956           8
1941           7
1958           7
              ..
30-Jun-1995    1
24-Jun-1995    1
23-Jun-1995    1
16-Jun-1995    1
1845-1853      1
Name: Date, Length: 4979, dtype: int64





In [9]:
# Check Year column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Year', 'float64')

replace_value(shark_attacks, 'Year', np.nan, 0.0)
shark_attacks['Year'] = shark_attacks['Year'].astype(int)

replace_value(shark_attacks, 'Year', 0, None)



********************************************************************************
Finding non-default missing values for column 'Year'
********************************************************************************
Column 'Year' has datatype: float64


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 244)
Values are sorted


[0.0,
 5.0,
 77.0,
 500.0,
 1543.0,
 1554.0,
 1555.0,
 1580.0,
 1595.0,
 1617.0,
 1637.0,
 1638.0,
 1642.0,
 1700.0,
 1703.0,
 1721.0,
 1723.0,
 1738.0,
 1742.0,
 1748.0,
 1749.0,
 1751.0,
 1753.0,
 1755.0,
 1758.0,
 1764.0,
 1771.0,
 1776.0,
 1779.0,
 1780.0,
 1783.0,
 1784.0,
 1785.0,
 1786.0,
 1787.0,
 1788.0,
 1791.0,
 1792.0,
 1797.0,
 1800.0,
 1801.0,
 1802.0,
 1803.0,
 1804.0,
 1807.0,
 1808.0,
 1810.0,
 1811.0,
 1812.0,
 1815.0,
 1816.0,
 1817.0,
 1822.0,
 1825.0,
 1826.0,
 1827.0,
 1828.0,
 1829.0,
 1830.0,
 1831.0,
 1832.0,
 1834.0,
 1835.0,
 1836.0,
 1837.0,
 1839.0,
 1840.0,
 1841.0,
 1842.0,
 1844.0,
 1845.0,
 1846.0,
 1847.0,
 1848.0,
 1849.0,
 1850.0,
 1851.0,
 1852.0,
 1853.0,
 1855.0,
 1856.0,
 1857.0,
 1858.0,
 1859.0,
 1860.0,
 1861.0,
 1862.0,
 1863.0,
 1864.0,
 1865.0,
 1866.0,
 1867.0,
 1868.0,
 1869.0,
 1870.0,
 1871.0,
 1872.0,
 1873.0,
 1874.0,
 1875.0,
 1876.0,
 1877.0,
 1878.0,
 1879.0,
 1880.0,
 1881.0,
 1882.0,
 1883.0,
 1884.0,
 1885.0,
 1886.0,
 1887.0,
 

More than 50 unique values (total: 244)


----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Unique values sorted, head and tail:


[0.0, 5.0, 77.0, 500.0, 1543.0, 1554.0, 1555.0, 1580.0, 1595.0, 1617.0]

[2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0, 2018.0, nan]



----------------------------------------
C: Casting to type: float64
----------------------------------------
Casting to float64 was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


2015.0    130
0.0       123
2017.0    122
2016.0    121
2014.0    118
         ... 
1787.0      1
1786.0      1
1785.0      1
1784.0      1
NaN         1
Name: Year, Length: 244, dtype: int64





In [10]:
# Check Type column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Type', 'string')

replace_value(shark_attacks, 'Type', np.nan, None)



********************************************************************************
Finding non-default missing values for column 'Type'
********************************************************************************
Column 'Type' has datatype: object


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 8)
Could not sort values


['Boating',
 'Unprovoked',
 'Provoked',
 'Questionable',
 'Sea Disaster',
 nan,
 'Boat',
 'Boatomg']



----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Could not sort values: '<' not supported between instances of 'float' and 'str'
..so let's try filtering NULL values and then sorting


['Boat',
 'Boating',
 'Boatomg',
 'Provoked',
 'Questionable',
 'Sea Disaster',
 'Unprovoked']

['Boat',
 'Boating',
 'Boatomg',
 'Provoked',
 'Questionable',
 'Sea Disaster',
 'Unprovoked']



----------------------------------------
C: Casting to type: string
----------------------------------------
Casting to string was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


Unprovoked      4595
Provoked         574
Sea Disaster     239
Boating          203
Boat             137
NaN                4
Questionable       2
Boatomg            1
Name: Type, dtype: int64





In [11]:
# Check Country column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Country', 'string')

replace_value(shark_attacks, 'Country', np.nan, None)



********************************************************************************
Finding non-default missing values for column 'Country'
********************************************************************************
Column 'Country' has datatype: object


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 205)
Could not sort values


['USA',
 'AUSTRALIA',
 'MEXICO',
 'BRAZIL',
 'SOUTH AFRICA',
 'THAILAND',
 'COSTA RICA',
 'MALDIVES',
 'BAHAMAS',
 'NEW CALEDONIA',
 'ECUADOR',
 'MALAYSIA',
 'LIBYA',
 nan,
 'CUBA',
 'MAURITIUS',
 'NEW ZEALAND',
 'SPAIN',
 'SAMOA',
 'SOLOMON ISLANDS',
 'JAPAN',
 'EGYPT',
 'ST HELENA, British overseas territory',
 'REUNION',
 'FRENCH POLYNESIA',
 'UNITED KINGDOM',
 'UNITED ARAB EMIRATES',
 'PHILIPPINES',
 'INDONESIA',
 'CHINA',
 'COLUMBIA',
 'Fiji',
 'DOMINICAN REPUBLIC',
 'ARUBA',
 'MOZAMBIQUE',
 'FIJI',
 'PUERTO RICO',
 'ATLANTIC OCEAN',
 'GREECE',
 'ST. MARTIN',
 'FRANCE',
 'PAPUA NEW GUINEA',
 'TRINIDAD & TOBAGO',
 'KIRIBATI',
 'ISRAEL',
 'DIEGO GARCIA',
 'TAIWAN',
 'PALESTINIAN TERRITORIES',
 'GUAM',
 'SEYCHELLES',
 'BELIZE',
 'JAMAICA',
 'NIGERIA',
 'TONGA',
 'SCOTLAND',
 'ITALY',
 'CHILE',
 'KENYA',
 'RUSSIA',
 'TURKS & CAICOS',
 'AZORES',
 'SOUTH KOREA',
 'MALTA',
 'VIETNAM',
 'MADAGASCAR',
 'UNITED ARAB EMIRATES (UAE)',
 'PANAMA',
 'SOMALIA',
 'CROATIA',
 'ENGLAND',
 'NORWAY',




----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Could not sort values: '<' not supported between instances of 'float' and 'str'
..so let's try filtering NULL values and then sorting


[' PHILIPPINES',
 ' TONGA',
 'ADMIRALTY ISLANDS',
 'AFRICA',
 'ALGERIA',
 'AMERICAN SAMOA',
 'ANDAMAN / NICOBAR ISLANDAS',
 'ANDAMAN ISLANDS',
 'ANGOLA',
 'ANTIGUA']

['UNITED KINGDOM',
 'URUGUAY',
 'USA',
 'VANUATU',
 'VENEZUELA',
 'VIETNAM',
 'WEST INDIES',
 'WESTERN SAMOA',
 'YEMEN',
 'YEMEN ']



----------------------------------------
C: Casting to type: string
----------------------------------------
Casting to string was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


USA                       2036
AUSTRALIA                 1219
SOUTH AFRICA               514
PAPUA NEW GUINEA           132
NEW ZEALAND                116
                          ... 
MALDIVE ISLANDS              1
NICARAGUA                    1
NORTH SEA                    1
RED SEA / INDIAN OCEAN       1
CEYLON (SRI LANKA)           1
Name: Country, Length: 205, dtype: int64





In [12]:
# Check Area column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Area', 'string')

replace_value(shark_attacks, 'Area', np.nan, None)



********************************************************************************
Finding non-default missing values for column 'Area'
********************************************************************************
Column 'Area' has datatype: object


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 772)
Could not sort values


['California',
 'Georgia',
 'New South Wales',
 'Colima',
 'Pernambuco',
 'Florida',
 'Queensland',
 'South Carolina',
 'Westerm Australia',
 'Eastern Cape Province',
 'Hua Hin',
 'Cocos Island',
 'Western Australia',
 'Alifu Alifu Atoll',
 'Western Cape Province',
 'Hawaii',
 'New Providence',
 nan,
 'Victoria',
 'KwaZulu-Natal',
 'Galapagos Islands',
 'Fernando de Noronha',
 'Sepang',
 'Holquin Province',
 'Pamplemousses ',
 'South Australia',
 'North Island',
 'Canary Islands',
 ' Upolu Island',
 'Shizuoka Prefecture',
 'Texas',
 'Massachusetts',
 'Red Sea Protectorate',
 'New Providence District',
 '40 miles off Grand Bahama Island',
 'Ascension Island',
 'New Jersey',
 'Majorca',
 'Washington',
 'Tabasco',
 'Ibiza Island',
 'Marquesas',
 'South Devon',
 'New Providence ',
 'Sharjah, ',
 'Baja California Sur',
 'Saint-Leu',
 'South Island',
 'Luzon Island',
 'Great Exuma',
 'Saint-Andre',
 'Bimini',
 'Bali',
 'Tuamotos',
 'Oregon',
 'North Province',
 'New Providence Island',
 'Ali



----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Could not sort values: '<' not supported between instances of 'float' and 'str'
..so let's try filtering NULL values and then sorting


[' Kikori River mouth',
 ' La Libertad',
 ' Lau Province',
 ' Loyalty Islands',
 ' Manila Bay',
 ' New Jersey',
 ' North Carolina',
 ' Nusa Tenggara',
 ' Primorje-Gorski Kotar County',
 ' Split-Dalmatia Count,']

['Western Viscayas',
 'Westmoreland Parish',
 'Woodlark Islands',
 'Worcestershire',
 'Yasawa Islands',
 'Ysabel Island',
 'Zadar County',
 'Zambesi River',
 'Zamboanga del Sur Province',
 'd\x92Étang-Salé']



----------------------------------------
C: Casting to type: string
----------------------------------------
Casting to string was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


Florida                 970
New South Wales         436
NaN                     408
Queensland              286
Hawaii                  263
                       ... 
Santa Catarina State      1
Kagawa Prefecture         1
Brindisi Province         1
Altagracia Province       1
North Carolina            1
Name: Area, Length: 772, dtype: int64





In [13]:
# Check Location column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Location', 'string')

replace_value(shark_attacks, 'Location', np.nan, None)



********************************************************************************
Finding non-default missing values for column 'Location'
********************************************************************************
Column 'Location' has datatype: object


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 3791)
Could not sort values


['Oceanside, San Diego County',
 'St. Simon Island, Glynn County',
 'Arrawarra Headland',
 'La Ticla',
 'Flat Rock, Ballina',
 'Piedade Beach, Recife',
 'Lighhouse Point Park, Ponce Inlet, Volusia County',
 'Cocoa Beach, Brevard  County',
 'Daytona Beach, Volusia County',
 'Cairns Aquarium',
 'Isle of Palms,  Charleston County',
 'Hilton Head Island, Beaufort County',
 'Dugong Bay',
 'Nahoon Beach, East London',
 'Sharpes Beach, Ballina',
 'Sai Noi Beach',
 'Manuelita',
 'Gearys Beach',
 'Lennox Head',
 'Surfers Point, Prevelly',
 'South Point, Gracetown',
 'Madoogali',
 'Robberg Beach, Plettenberg Bay',
 'Shipwreck\x92s Beach, Keoneloa Bay, Kauai',
 'Piedade',
 'Lefthanders, Margaret River Area',
 'Cobblestones, Margaret River Area',
 'Nirvana Beach',
 'Magenta Beach, Noumea',
 'Bimini',
 'St. Francis Bay',
 'Kukio Beach',
 "Waterman's Bay",
 'Lorne',
 'Winkipop',
 'Durban',
 'Little Congwong Beach, La Perouse ',
 'Westernport Bay',
 'Surf Beach, Kiama',
 'Santa Fe Island',
 'Cone Bay



----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Could not sort values: '<' not supported between instances of 'float' and 'str'
..so let's try filtering NULL values and then sorting


['            ',
 ' A pearl  farm in Roebuck Bay',
 ' Ambatolaoka, Nosy Be Island',
 ' Bellingen',
 ' Black Head, south of Taree',
 ' Boa Viagem Beach, Recife',
 ' Boca de la Leña, La Unión',
 ' Botany Bay ',
 ' Bunker Bay',
 ' Chennai (formerly Madras)']

['off Dakar',
 'off Neuvitas',
 'off Paoay, Ilocos Norte Province',
 'off yacht Serenade',
 'the pearling beds',
 'uShaka Aquarium, Durban',
 '½ mile offshore & 9 miles north of Fort Pierce',
 'Île Saint-Paul',
 'Île de Casey',
 'Île de Sable']



----------------------------------------
C: Casting to type: string
----------------------------------------
Casting to string was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


NaN                                    487
New Smyrna Beach, Volusia County       160
Daytona Beach, Volusia County           30
Ponce Inlet, Volusia County             20
Melbourne Beach, Brevard County         17
                                      ... 
Bilene Bay, 180 km north of Maputo       1
Gleneden Beach                           1
Île de Sable                             1
l'Anse-Vata                              1
Below the English fort, Trincomalee      1
Name: Location, Length: 3791, dtype: int64





In [14]:
# Check Activity column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Activity', 'string')

replace_value(shark_attacks, 'Activity', np.nan, None)
replace_value(shark_attacks, 'Activity', ' ', None)
replace_value(shark_attacks, 'Activity', '   ', None)
replace_value(shark_attacks, 'Activity', '.', None)



********************************************************************************
Finding non-default missing values for column 'Activity'
********************************************************************************
Column 'Activity' has datatype: object


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 1419)
Could not sort values


['Paddling',
 'Standing',
 'Surfing',
 'Free diving',
 'Kite surfing',
 'Swimming',
 'Fishing',
 'Walking',
 'Feeding sharks',
 'Boogie boarding',
 'Scuba diving',
 'Paddle-skiing',
 'Body boarding',
 'Windsurfing',
 'Stand-Up Paddleboarding',
 'Wading',
 'Scuba Diving',
 'Kayak fishing for sharks',
 'Snorkeling',
 nan,
 'Spearfishing',
 'Diving',
 'Fishing / Wading',
 '2 boats capsized',
 'Night bathing',
 'Surfing ',
 'Kayaking / Fishing',
 'Kayaking',
 'Body surfing',
 'Swimming, poaching abalone',
 'Canoeing',
 'SUP',
 'Skimboarding',
 'Touching a shark',
 'Attempting to lasso a shark',
 'Paddle boarding',
 'Kakaying',
 'Washing hands',
 'Grabbing shark for a selfie',
 'Kayak fishing',
 'Tagging sharks',
 'Surf skiing ',
 'Surf fishing',
 'Floating',
 'Surfng',
 'SUP Foil boarding',
 'Lobstering',
 'Fishing in Alabama Deep Fishing Rodeo',
 'Fishing for squid',
 'Fishing for sharks',
 'Floating in tube',
 'Teasing a shark',
 'Surf-skiing',
 'Diving for beche-de-mer',
 'Kayak Fishing



----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Could not sort values: '<' not supported between instances of 'float' and 'str'
..so let's try filtering NULL values and then sorting


[' ',
 '   ',
 ' a canoe was pursuing a schooner that had forcibily abducted 5 young girls',
 '"Boat accident"',
 '"Climbing up to ship after repairing the stern in water"',
 '"Crossing the river"',
 '"Flying Tiger" transport plane went down with 5 men onboard',
 '"Riding waves on a board"',
 '"Swimming vigorously"',
 '.']

['male',
 'native boats sunk in storm',
 'preparing to go skin diving',
 'pêcheur de bichiques',
 'ship M.V. Rizal sank during typhoon',
 'ship William Penn grounded & broke apart',
 'ship torpedoed 400 miles off the African coas. Man was clinging to hatch cover',
 'wreck of the State Oil Company ship Permina',
 'yachting accident',
 'yachtsman in a zodiac']



----------------------------------------
C: Casting to type: string
----------------------------------------
Casting to string was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


Surfing                                 931
Swimming                                779
NaN                                     422
Fishing                                 411
Spearfishing                            307
                                       ... 
Sight-seeing                              1
Surf-fishing                              1
Freediving for abalone (at surface)       1
Scuba diving for lobsters                 1
Wreck of  large double sailing canoe      1
Name: Activity, Length: 1419, dtype: int64





In [15]:
# Check Sex column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Sex_Victim', 'string')

replace_value(shark_attacks, 'Sex_Victim', np.nan, None)
replace_value(shark_attacks, 'Sex_Victim', '.', None)
replace_value(shark_attacks, 'Sex_Victim', 'M ', 'M')
replace_value(shark_attacks, 'Sex_Victim', 'lli', None)
replace_value(shark_attacks, 'Sex_Victim', 'N', None)



********************************************************************************
Finding non-default missing values for column 'Sex_Victim'
********************************************************************************
Column 'Sex_Victim' has datatype: object


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 7)
Could not sort values


['F', 'M', nan, 'M ', 'lli', 'N', '.']



----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Could not sort values: '<' not supported between instances of 'float' and 'str'
..so let's try filtering NULL values and then sorting


['.', 'F', 'M', 'M ', 'N', 'lli']

['.', 'F', 'M', 'M ', 'N', 'lli']



----------------------------------------
C: Casting to type: string
----------------------------------------
Casting to string was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


M      4683
F       575
NaN     491
M         2
N         2
lli       1
.         1
Name: Sex_Victim, dtype: int64





In [16]:
# Check Age column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Age', 'string')

replace_value(shark_attacks, 'Age', np.nan, None)
replace_value(shark_attacks, 'Age', ' ', None)
replace_value(shark_attacks, 'Age', '  ', None)
replace_value(shark_attacks, 'Age', 'F', None)
replace_value(shark_attacks, 'Age', 'MAKE LINE GREEN', None)
replace_value(shark_attacks, 'Age', 'X', None)
replace_value(shark_attacks, 'Age', '\xa0 ', None)
replace_value(shark_attacks, 'Age', 'A.M.', None)



********************************************************************************
Finding non-default missing values for column 'Age'
********************************************************************************
Column 'Age' has datatype: object


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 148)
Could not sort values


['57',
 '11',
 nan,
 '18',
 '52',
 '15',
 '12',
 '32',
 '10',
 '34',
 '30',
 '60',
 '33',
 '29',
 '54',
 '41',
 '37',
 '19',
 '25',
 '69',
 '38',
 '55',
 '35',
 '45',
 '40s',
 '28',
 '20',
 '24',
 '26',
 '49',
 '14',
 '22',
 '7',
 '31',
 '17',
 '40',
 '13',
 '42',
 '3',
 '50',
 '46',
 '16',
 '82',
 '48',
 '20s',
 '21',
 '51',
 '39',
 '58',
 'Teen',
 '47',
 '61',
 '65',
 '73',
 '36',
 '66',
 '43',
 '60s',
 '9',
 '72',
 '59',
 '6',
 '64',
 '23',
 '71',
 '44',
 '27',
 '62',
 '68',
 '63',
 '70',
 '18 months',
 '53',
 '30s',
 '50s',
 '8',
 'teen',
 '77',
 '74',
 '56',
 '28 & 26',
 '5',
 '86',
 '18 or 20',
 '12 or 13',
 '46 & 34',
 '28, 23 & 30',
 'Teens',
 '36 & 26',
 '84',
 '\xa0 ',
 ' ',
 '30 or 36',
 '6½',
 '21 & ?',
 '33 or 37',
 'mid-30s',
 ' 30',
 '7      &    31',
 ' 28',
 '20?',
 "60's",
 '32 & 30',
 '87',
 'Elderly',
 '75',
 '74 ',
 '45 ',
 '21 or 26',
 '20 ',
 '>50',
 '18 to 22',
 'adult',
 '9 & 12',
 '9 months',
 '25 to 35',
 '23 & 26',
 '1',
 '(adult)',
 '33 & 37',
 '25 or 28',




----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Could not sort values: '<' not supported between instances of 'float' and 'str'
..so let's try filtering NULL values and then sorting


[' ',
 '  ',
 ' 28',
 ' 30',
 ' 43',
 '"middle-age"',
 '"young"',
 '(adult)',
 '1',
 '10']

['F',
 'MAKE LINE GREEN',
 'Teen',
 'Teens',
 'X',
 'adult',
 'mid-30s',
 'teen',
 'young',
 '\xa0 ']



----------------------------------------
C: Casting to type: string
----------------------------------------
Casting to string was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


NaN         2522
18           143
17           140
15           137
16           136
            ... 
45             1
21 or 26       1
20             1
>50            1
13 or 14       1
Name: Age, Length: 148, dtype: int64





In [17]:
# Check Injury column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Injury', 'string')

replace_value(shark_attacks, 'Injury', np.nan, None)



********************************************************************************
Finding non-default missing values for column 'Injury'
********************************************************************************
Column 'Injury' has datatype: object


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 3334)
Could not sort values


['No injury to occupant, outrigger canoe and paddle damaged',
 'Minor injury to left thigh',
 'Minor injury to lower leg',
 'Lacerations to leg & hand shark PROVOKED INCIDENT',
 'No injury, board bitten',
 'FATAL',
 'Minor injury to foot. PROVOKED INCIDENT',
 'Lower left leg bitten',
 'Minor injury to foot',
 'Minor bite to hand by captive shark. PROVOKED INCIDENT',
 'Injuries to lower right leg and foot',
 'Severe bite to right forearm',
 'Shallow lacerations to finger PROVOKED INCIDENT',
 'Minor injury, marks on board',
 'No injury, surfboard damaged',
 'Minor injury to ankle',
 'No injury, shark bit scuba gear',
 'No injury, knocked off board by shark',
 'No injury',
 'No injury, shark struck his leg',
 '5-inch cut to hand',
 'No injury, shark bit hole in ski',
 'Lacerations to right lower leg',
 'Lacerations to foot and ankle',
 'Multiple severe injuries to arms and leg, leg subsequently surgically amputated',
 'Laceration to right thigh',
 'Lacerations to legs',
 'No Injury. Shark



----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Could not sort values: '<' not supported between instances of 'float' and 'str'
..so let's try filtering NULL values and then sorting


[' 2 of the 5 fishermen were so seriously injured they died of their wounds',
 ' 3 fingers & thigh lacerated, foot crushed',
 ' 55  perished, some were  taken by sharks',
 ' 6 lacerations to left hand',
 ' Ankle lacerated',
 " Du Val's leg was bitten but he survived",
 ' FATAL',
 ' FATAL. Shark bite was minor injury, but he suffered a heart attack afterwards and died 6 hours later',
 ' No injury',
 ' No injury. Shark bumped kayak, flinging her into the  water. ']

['Wrist lacerated PROVOKED INCIDENT',
 'bite to lower leg',
 'forearm abraded',
 'minor injury to foot',
 'non-fatal',
 'remains of one of the crew found in shark',
 'sharks rammed boats, no injury to occupants',
 'small laceration to wrist',
 'unknown',
 '\x93Put hand through hatch, shark nearly bit off thumb\x94']



----------------------------------------
C: Casting to type: string
----------------------------------------
Casting to string was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


FATAL                                                                   780
Survived                                                                 94
Foot bitten                                                              87
Leg bitten                                                               70
No injury                                                                60
                                                                       ... 
Left hand & forearm bitten, board bitten                                  1
Legs & torso injured                                                      1
No Injury, ski bitten                                                     1
No injury to occupant: boat lost                                          1
FATAL. "Shark bit him in half, carrying away the lower extremities"       1
Name: Injury, Length: 3334, dtype: int64





In [18]:
# Check Fatal_(Y/N) column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Fatal_(Y/N)', 'string')

replace_value(shark_attacks, 'Fatal_(Y/N)', np.nan, None)
replace_value(shark_attacks, 'Fatal_(Y/N)', 'y', 'Y')
replace_value(shark_attacks, 'Fatal_(Y/N)', ' N', 'N')
replace_value(shark_attacks, 'Fatal_(Y/N)', 'N ', 'N')
replace_value(shark_attacks, 'Fatal_(Y/N)', '2017', None)
replace_value(shark_attacks, 'Fatal_(Y/N)', 'M', None)
replace_value(shark_attacks, 'Fatal_(Y/N)', 'UNKNOWN', None)



********************************************************************************
Finding non-default missing values for column 'Fatal_(Y/N)'
********************************************************************************
Column 'Fatal_(Y/N)' has datatype: object


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 9)
Could not sort values


['N', 'Y', nan, 'M', 'UNKNOWN', '2017', ' N', 'N ', 'y']



----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Could not sort values: '<' not supported between instances of 'float' and 'str'
..so let's try filtering NULL values and then sorting


[' N', '2017', 'M', 'N', 'N ', 'UNKNOWN', 'Y', 'y']

[' N', '2017', 'M', 'N', 'N ', 'UNKNOWN', 'Y', 'y']



----------------------------------------
C: Casting to type: string
----------------------------------------
Casting to string was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


N          4279
Y          1380
UNKNOWN      70
NaN          15
 N            7
M             1
2017          1
N             1
y             1
Name: Fatal_(Y/N), dtype: int64





In [19]:
# Check Species column for null values and change to appropriate value
find_non_default_missing_values(shark_attacks, 'Species', 'string')

replace_value(shark_attacks, 'Species', np.nan, None)
replace_value(shark_attacks, 'Species', ' ', None)
replace_value(shark_attacks, 'Species', '\xa0 ', None)



********************************************************************************
Finding non-default missing values for column 'Species'
********************************************************************************
Column 'Species' has datatype: object


----------------------------------------
A: Looking at unique values
----------------------------------------
Less than 5755 unique values (total: 1493)
Could not sort values


['White shark',
 nan,
 '2 m shark',
 'Tiger shark, 3m',
 'Tiger shark',
 "Lemon shark, 3'",
 "Bull shark, 6'",
 'Grey reef shark',
 'Tawny nurse shark, 2m',
 'Shark involvement not confirmed',
 'Questionable',
 '3 m shark',
 'White shark, 3.5 m',
 'White shark, 2.5 m',
 "6' shark",
 'Juvenile bull shark',
 'Bull shark',
 "Tiger shark, 12'",
 'Wobbegong shark',
 '3.5 m shark',
 '1.8 m shark',
 'Blacktip shark',
 'Juvenile white shark,  2.7 to 3.2 m',
 'Bull shark, 2 m',
 'Galapagos shark?',
 'Bull shark, 3 m ',
 'Grey reef shark. 2 m',
 'small shark',
 'Wobbegong shark?',
 'Juvenile nurse shark',
 "Nurse shark. 5'",
 'Tiger shark, female',
 'Some drowned but other may have been killed by blue sharks',
 'White shark, 4.6 m',
 'Cookiecutter shark',
 'Wobbegong shark, 1 m',
 'White shark, 4.5 m',
 'Spinner shark, 4 to 5 feet',
 'Tiger shark, 8 to 10 feet',
 "8' shark",
 "5' shark",
 "4' to 5' shark",
 'Porbeagle, 1.5 m',
 'White shark, 3.5m',
 "5' to 6' shark",
 'White shark, 3 to 3.5m ',




----------------------------------------
B: Sorting and looking at the edges
----------------------------------------
Could not sort values: '<' not supported between instances of 'float' and 'str'
..so let's try filtering NULL values and then sorting


[' ',
 ' "gummy" shark (Rhizoprionodon or Loxodon) 1.2 m [4\']',
 " 1.5 m [5'] dusky shark",
 " 6' to 8' shark",
 " Blacktip shark, C. maculipinnis. 1.9 m to 2.1 m [6.5' to 7'] ",
 ' Bull shark, 1.2m ',
 ' Bull shark, 2.5 m',
 " Bull shark, 5'",
 " Bull shark, 8'",
 " Galapagos shark, 6'"]

['small hammerhead shark',
 'small nurse shark',
 'small shark',
 'small sharks',
 "small sharks'",
 'unknown, possibly a white shark',
 'whaler shark',
 'white shark',
 '\x93small brown shark\x94',
 '\xa0 ']



----------------------------------------
C: Casting to type: string
----------------------------------------
Casting to string was successful


----------------------------------------
D: Looking at frequency
----------------------------------------


NaN                                                                                                                              2817
White shark                                                                                                                       161
Tiger shark                                                                                                                        73
Bull shark                                                                                                                         51
6' shark                                                                                                                           40
                                                                                                                                 ... 
White shark, 4.5 m [14'9"]                                                                                                          1
1.8 m to 2.4 m [6' to 8'] "black finned shark"                





# **Find an appropriate value for the shark species**

In [20]:
# read dataset with the scientific and common names of sharks
all_shark_species = pd.read_csv('/content/drive/MyDrive/OefendataWinc/SharkSpecies.csv', encoding='unicode_escape', on_bad_lines='skip')


# Prepare dataset
# Add empty column for the common names
all_shark_species['Shark_common_name'] = ''

# Remove unnecessary rows
remove_rows = ['ORDER', 'Family', 'Genus']

for index, row in all_shark_species.iterrows():
  for item in remove_rows:
    if item in row['Taxonomic listing']:
      all_shark_species.drop([index], axis=0, inplace=True)


# Find common name
for index, row in all_shark_species.iterrows():  
  split_names = row['Taxonomic listing'].split('(')

  # If a shark has a common name, add this name to a new column
  if len(split_names) == 1:
    row['Shark_common_name'] = split_names[0]
  
  # Else use the scientific name
  else:
    split_names[1] = split_names[1].strip(' )')
    row['Shark_common_name'] = split_names[1]
  
  # Replace gray with grey
  row['Shark_common_name'] = row['Shark_common_name'].replace('gray', 'grey')
  row['Shark_common_name'] = row['Shark_common_name'].replace('Gray', 'Grey')


# Transfer the common name to a list with keywords
search_keywords = []
for index, row in all_shark_species.iterrows():  
  split_shark_species = row['Shark_common_name'].split(' ')
  search_keywords.append(split_shark_species)

In [21]:
# Function to find the name of the sharks
def get_shark_name(value, index):
    space_index = value.rfind(' ', 0,index-1)
    if space_index == -1:
        return value[:index+5]
    else:
        return value[space_index+1:index+5]


# Function to clean-up the name of the sharks
def clean_up_shark_species(shark):
  shark = re.sub("\d", "", shark)

  # Replace characters
  char_to_replace = {'"' : '', "'" : '', ' m ' : '', '.' : '', '-lb' : '', 
                     '-gill' : '', 'kg ' : '', '[' : '', ']' : '',
                     '-foot' : '', '>' : '', 'small' : '', 'juvenile' : ''}

  for key, value in char_to_replace.items():
    shark = shark.replace(key, value)

  return shark

In [22]:
# Find the appropriate common name if possible
# Add empty column to the original dataframe for the common name
shark_attacks['Shark'] = ''


# Find a proper common name for every shark if possible
for index, row in shark_attacks.iterrows():
  # If the value for the species is empty, then use None
  if row['Species'] == None:
    row['Shark'] = 'Unknown'


  # If the value for the specisies is not empty, try to find the common name
  elif row['Species'] != None:
    # Get the value and clean it up
    shark_value = row['Species']
    shark_value = clean_up_shark_species(shark_value)

    # Find the index of the word 'shark'
    shark_indexes = [m.start() for m in re.finditer('shark', shark_value)]

    
    # If no 'shark' is detected
    if len(shark_indexes) == 0:
      row['Shark'] = 'Uncertain if it was a shark'

    
    # If column only contains value 'shark'
    elif len(shark_indexes) == 1 and shark_indexes[0] < 2:
      row['Shark'] = 'Shark not identified'

    
    # If 'shark' is detected more than one time
    elif len(shark_indexes) > 1:
      # If 'shark' is the first word
      if shark_indexes[0] == 0:
        row['Shark'] = 'Species uncertain'
      
      # If column contains ' or ', there are multiple options
      elif ' or ' in shark_value:
        row['Shark'] = 'Species uncertain'

    # If 'shark' is detected one time and it is not on index 0
    elif len(shark_indexes) == 1 and shark_indexes[0] != 0:
      # If column contains a question mark
      if '?' in shark_value:
        row['Shark'] = 'Species uncertain'
      
      # Else try to find the common name by comparing the keywords obtained 
      # from the dataframe with the shark species names
      else:
        for keyword_list in search_keywords:
          index = search_keywords.index(keyword_list)
          if all(word in row['Species'] for word in keyword_list):
            row['Shark'] = ' '.join(keyword_list)


# For all the values in the 'Shark' column that are still empty, try to find 
# common name based on the value in the 'Species' column
empty_values = shark_attacks[shark_attacks['Shark'] == ''].index
if len(empty_values) != 0:
  for i in empty_values:
    empty_species = shark_attacks['Species'].loc[i]
    empty_species = clean_up_shark_species(empty_species)

    shark_index = [m.start() for m in re.finditer('shark', empty_species)]
    new_name = get_shark_name(empty_species, shark_index[0])
    new_name = new_name.strip().capitalize()
    shark_attacks['Shark'].loc[i] = new_name


# Detect the rows where the 'Shark' column only contains the value shark and 
#replace this value
for index, row in shark_attacks.iterrows():
  if row['Shark'] == 'Shark':
    row['Shark'] = 'Shark not identified'

# **Add an age category**

In [23]:
# Determine age category (Child, Adult, Child or Adult, of both)
# Add empty column to dataframe
shark_attacks['Age_Category'] = ''

# Change values in 'Age' column to lower case
shark_attacks['Age'] = shark_attacks['Age'].str.lower()
# Remove leading and trailing spaces
shark_attacks['Age'] = shark_attacks['Age'].str.strip()
# Replace unwanted characters
char_to_replace = {'"' : '', "'" : '', 's' : '', '(' : '', ')' : '', 
                    '>' : '', 'mid-' : '', '1/2' : ''}
# Set requirements for child category
child = ['teen', 'young', 'month']


for index, row in shark_attacks.iterrows():
  # If 'Age' is None, then set 'Age_Category' to None
  if row['Age'] == None:
    row['Age_Category'] = 'Unknown'


  # If 'Age' contains an element from child, then set 'Age_Category' to 'Child'
  elif any(value in row['Age'] for value in child):
    row['Age_Category'] = 'Child'

  
  elif row['Age'] != None:
    for key, value in char_to_replace.items():
      row['Age'] = row['Age'].replace(key, value)


    # Split values in 'Age' based on space
    split_ages = row['Age'].split(' ')
    # Get the integers
    ages = [int(age) for age in split_ages if age.isdigit()]


    # If 'Age' contains age of multiple victims
    if '&' in split_ages:
      if all(number < 18 for number in ages):
        #print(f'Multiple ages below 18 {index}')
        row['Age_Category'] = 'Child'

      elif all(number >= 18 for number in ages):
        #print(f'Multiple ages equal to or above 18 {index}')
        row['Age_Category'] = 'Adult'

      else:
        #print(f'Child and adult {index}')
        row['Age_Category'] = 'Child and Adult'


    # If it's not sure what the age of the victim is
    elif 'or' in split_ages:
      if all(number < 18 for number in ages):
        #print(f'All ages below 18 {index}')
        row['Age_Category'] = 'Child'

      elif all(number >= 18 for number in ages):
        #print(f'All ages equal to or above 18 {index}')
        row['Age_Category'] = 'Adult'

      else:
        #print(f'Child or aduls {index}')
        row['Age_Category'] = 'Child or Adult'

    # If 'Age' is just one number
    else:
      for age in ages:
        if age < 18:
          row['Age_Category'] = 'Child'

        else:
          row['Age_Category'] = 'Adult'

# **Question 1: What are the most dangerous types of sharks to humans?**

**Assumptions during data preparation**
<br/>
There was no uniform way that the shark species were recorded in the data set. This made it more difficult the analyze the attacks based on the species that were involved. To create a more uniform way, the following assumptions were made:
- When no shark species was recorded, the case was categorized as 'Unknown'.
- When there was no mentioning of the word ‘shark’ in the description, then it is assumed that it was not certain a shark was involved in the accident. These cases were categorized as: ‘Uncertain if it was a shark’. In some cases, the shark name was not fully mentioned, but was abbreviated by removing the word 'shark'. Because the description didn't contain the word 'shark' it was categorized as if 'Uncertain if it was a shark'.
- When only the word ‘shark’ was used to describe the species, then it is assumed that the shark species was not identified. These cases were categorized as: ‘Shark not identified’.
- When the word ‘shark’ was mentioned multiple times, when the name of the shark was followed by a question mark or when the word ‘or’ was used in the description, then it is assumed that it was uncertain which species was involved. These cases were categorized as: ‘Species uncertain’.
- In all other cases, an attempt was made to find the appropriate common name of the shark species.
<br/><br/>

**Assumptions during analysis**
<br/>
There are multiple ways to interpret what the most dangerous types of sharks are to humans. What I considered the most dangerous were the type of sharks that were involved in attacks that had a fatal outcome. 
<br/><br/>

**Answer**
<br/>
The top five most dangerous types of sharks to humans according to my analysis are:

|Number|Shark species|Number of fatal attacks|
|------|-------------|-----------------------|
|1     |White shark  |896                    |
|2     |Tiger shark  |243                    |
|3     |Bull shark   |168                    |
|4     |Blacktip shark|64                    |
|5     |Whaler shark |61                     |

In [None]:
# 1. What are the most dangerous types of sharks to humans?

shark_attacks.loc[(shark_attacks['Fatal_(Y/N)'] == 'Y')]

shark_attacks['Shark'].value_counts().head(10)

Shark not identified           896
White shark                    617
Tiger shark                    243
Bull shark                     168
Uncertain if it was a shark     76
Blacktip shark                  64
Species uncertain               62
Whaler shark                    61
Nurse shark                     56
Mako shark                      46
Name: Shark, dtype: int64

# **Question 2: Are children more likely to be attacked by sharks?**

**Assumptions during data preparation**
<br/>
A new column was added to the dataframe to assign an age category to every row where the age of the victim was known. This categorization works as follows:
- 'Unknown'
  - When the age of the victim was unknown.
- 'Child'
  - When the age was in months, or when the description ‘teen’ or ‘young’ was used.
  - When the age of the victim was below 18.
  - When multiple victims were involved in the case and all of them were below 18.
  - When the age of the victim was uncertain, but both suggestions were below 18.
- 'Adult'
  - When the age of the victim was 18 or above.
  - When multiple victims were involved in the case and all of them were 18 or above.
  - When the age of the victim was uncertain, but both suggestions were 18 or above.
- 'Child or adult'
  - When the age of the victim was uncertain and one suggestion was below 18 and the other was 18 or above.
- Child and adult'
  - When multiple victims were involved in the case and some of them were below 18 and others 18 or above.

Seven cases didn't get a proper age category assigned. Because this number was so low, these cases were excluded from the analysis.
<br/><br/>

**Answer**
<br/>
Based on the cases where the age of the victim was known, children are not more likely to be attacked by sharks.
<br/>
According to my analysis, there were 2332 cases that involved adults and 883 cases that involved children. In 2 cases it involved a child and an adult, and in 1 case it wasn’t sure if it was a child or an adult.

In [None]:
# 2. Are children more likely to be attacked by sharks?
shark_attacks['Age_Category'].value_counts()

Unknown            2530
Adult              2332
Child               883
                      7
Child and Adult       2
Child or Adult        1
Name: Age_Category, dtype: int64

# **Question 3: Are shark attacks where sharks were provoked more or less dangerous?**

**Assumptions during data analysis**
<br/>
The cases were no type was assigned were excluded from the analysis. This was the case for 4 cases. The cases were the type 'Boat' or 'Boating' was used, were also excluded (341). It was uncertain if these were considered provoked or unprovoked. The amount of cases used for the analysis were 5169. 
To determine whether provoked or unprovoked attacks were more dangerous, the amount of cases that had a fatal outcome were compared.
<br/><br/>

**Answer**
<br/>
There were 574 cases reported that involved a provoked shark attack. A total of 19 cases had a fatal outcome (3.3%). Of the 4595 cases that were considered unprovoked, a total of 1181 cases had a fatal outcome (25.7%). Based on these numbers, it can be concluded that provoked attacks are not more dangerous than unprovoked attacks. These are, in fact, less dangerous.

In [None]:
# 3. Are shark attacks where sharks were provoked more or less dangerous?
print(shark_attacks.loc[(shark_attacks['Type'] == 'Boat') | (shark_attacks['Type'] == 'Boating') | (shark_attacks['Type'] == 'Boatomg')].shape[0])

provoked = shark_attacks.loc[(shark_attacks['Type'] == 'Provoked')].shape[0]
unprovoked = shark_attacks.loc[(shark_attacks['Type'] == 'Unprovoked')].shape[0]

print(f'Total amount of cases for analysis: {provoked + unprovoked}')

provoked_and_fatal = shark_attacks.loc[(shark_attacks['Type'] == 'Provoked') & (shark_attacks['Fatal_(Y/N)'] == 'Y')].shape[0]
unprovoked_and_fatal = shark_attacks.loc[(shark_attacks['Type'] == 'Unprovoked') & (shark_attacks['Fatal_(Y/N)'] == 'Y')].shape[0]

print(f'Amount of provoked attacks that were fatal: {provoked_and_fatal}. This is: {(provoked_and_fatal / provoked) * 100}%')
print(f'Amount of unprovoked attacks that were fatal: {unprovoked_and_fatal}. This is: {(unprovoked_and_fatal / unprovoked) * 100}%')

Total amount of cases for analysis: 5169
Amount of provoked attacks that were fatal: 19. This is: 3.3101045296167246%
Amount of unprovoked attacks that were fatal: 1181. This is: 25.701849836779108%


# **Question 4: Are certain activities more likely to result in a shark attack?**

**Assumptions during analysis**
<br/>
There are 1415 types of activities mentioned in the dataframe. Some are the same, but are formulated differently. No adjustments were made to the activities and were kept as is. The names of the top nine activities were used to search every row in the dataframe if one of those names was present in the description. This way, the top five activities also include the cases that used a different discription for the activity but still contained the name of the specific activity in that description. All activities that included diving (for example free diving and scuba diving) were combined into the category named 'Diving'. Some cases included multiple activities. 
<br/><br/>

**Answer**
<br/>
There are certain activities that are more likely to result in a shark attack than others. The top five activities are:

|Number|Activity     |Number of attacks|
|------|-------------|-----------------|
|1     |Surfing      |1211             |
|2     |Swimming     |1016             |
|3     |Fishing      |699              |
|4     |Diving       |516              |
|5     |Spearfishing |397              |


In [None]:
# 4. Are certain activities more likely to result in a shark attack?
print(shark_attacks['Activity'].value_counts().head(10))

shark_attacks['Activity'] = shark_attacks['Activity'].str.lower()
surfing = 0
swimming = 0
fishing = 0
spearfishing = 0
bathing = 0
wading = 0
diving = 0
standing = 0
snorkeling = 0

for index, row in shark_attacks.iterrows():
  if row['Activity'] != None:
    if 'surf' in row['Activity']:
      surfing += 1
  
    if 'swim' in row['Activity']:
      swimming += 1
  
    if 'spearfishing' in row['Activity']:
      spearfishing += 1
  
    if 'bathing' in row['Activity']:
      bathing += 1
  
    if 'fishing' in row['Activity'] and 'spearfishing' not in row['Activity']:
      fishing += 1
    
    if 'wading' in row['Activity']:
      wading += 1
    
    if 'diving' in row['Activity']:
      diving += 1
    
    if 'standing' in row['Activity']:
      standing += 1

    if 'snorkeling' in row['Activity']:
      snorkeling += 1


print(f'Amount of cases that involved surfing: {surfing}')
print(f'Amount of cases that involved swimming: {swimming}')
print(f'Amount of cases that involved fishing: {fishing}')
print(f'Amount of cases that involved diving: {diving}')
print(f'Amount of cases that involved spearfishing: {spearfishing}')
print(f'Amount of cases that involved bathing: {bathing}')
print(f'Amount of cases that involved wading: {wading}')
print(f'Amount of cases that involved standing: {standing}')

surfing         931
swimming        779
fishing         411
spearfishing    307
bathing         150
wading          140
diving          115
standing         98
snorkeling       87
scuba diving     71
Name: Activity, dtype: int64
spearfishing, dived to pick up a float line                                                                                                             1
pulling shark from the water                                                                                                                            1
standing on sandbar                                                                                                                                     1
sinking of the m/v mindoro during a typhoon                                                                                                             1
swimming after his canoe capsized                                                                                                                       1
p