In [293]:
import pandas as pd
import numpy as np
import re
from difflib import SequenceMatcher

In [294]:
pd.set_option('display.max_rows', 150, 'display.min_rows',100)

In [295]:
df_sharks = pd.read_csv('GSAF5.csv', encoding='latin-1')

In [296]:
df_sharks.shape

(5992, 24)

In [297]:
df_sharks[df_sharks.duplicated(keep=False)] #No true duplicates

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23


In [298]:
df_sharks.rename(columns={'Case Number':'case_number'},inplace=True) 

In [299]:
df_sharks.drop(labels=['Unnamed: 22','Unnamed: 23'], axis = 1, inplace=True)

In [300]:
for column in ['case_number','Case Number.1','Case Number.2']:
    df_sharks[column] = df_sharks.loc[:,column].str.replace(r'[\W]', '.',regex=True)

In [301]:
cases = df_sharks[(df_sharks['case_number'] != df_sharks['Case Number.1']) | (df_sharks['case_number']!=df_sharks['Case Number.2'])]

In [302]:
cases[['case_number','Case Number.1','Case Number.2']] # Not many differences, Case Number.1,2 most likely irrelevant

Unnamed: 0,case_number,Case Number.1,Case Number.2
4,2016.09.15,2016.09.16,2016.09.15
33,2016.07.14.4,2016.07.14.R,2016.07.14.4
97,2016.01.24.b,2015.01.24.b,2016.01.24.b
116,2015.12.23,2015.11.07,2015.12.23
121,2015.10.28.a,2015.10.28,2015.10.28.a
3654,1961.09.02.R,1961.09.06.R,1961.09.02.R
4177,1952.08.05,1952.08.04,1952.08.05
4719,1934.01.08.R,1934.02.08.R,1934.02.08.R
5043,1900.00.00.R,1919.00.00.R,1900.00.00.R
5150,1911.07.31.R,1911.07.31.T,1911.07.31.R


In [303]:
df_sharks.drop(labels=['Case Number.1','Case Number.2'], axis = 1, inplace=True)

In [304]:
df_sharks['Type'].value_counts()

Unprovoked      4386
Provoked         557
Invalid          519
Sea Disaster     220
Boat             200
Boating          110
Name: Type, dtype: int64

In [305]:
df_sharks['Type'].replace({'Invalid':'Unknown','Boat':'Boating'}, inplace=True) # Correct and unify

In [306]:
df_sharks['Type'].value_counts()

Unprovoked      4386
Provoked         557
Unknown          519
Boating          310
Sea Disaster     220
Name: Type, dtype: int64

In [307]:
df_sharks['Country'] = df_sharks['Country'].str.upper().replace(r'^\s', '', regex=True).replace(r'\?', '', regex=True) #Fixed some minor mistakes
df_sharks.loc[5991, 'Country'] = 'SRI LANKA' # Fixed minor mistake

In [308]:
df_sharks['Area'] = df_sharks['Area'].str.replace(r'^\s', '', regex=True).replace(r'\?', '', regex=True) #Fixed some minor mistakes

In [309]:
df_sharks.loc[df_sharks['Country'].isnull(), 'Country'] = 'IW or Unknown' #International Waters or unknown

In [310]:
df_sharks['Location'] = df_sharks['Location'].str.replace(r'^\s', '', regex=True).replace(r'\?', '', regex=True) #Fixed some minor mistakes
df_sharks[df_sharks['Location'].isnull()]

Unnamed: 0,case_number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,original order
21,2016.07.28.R,28-Jul-16,2016,Unprovoked,CHINA,Hong Kong,,Swimming,Justus Franz,M,72,Lacerations to leg,N,,,"Klassick, 7/28/2016",2016.07.28.R-Franz.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5972
42,2016.06.24,24-Jun-16,2016,Unprovoked,COLUMBIA,Isla Provedencia,,Scuba Diving,Arturo Velez,M,59,Severe bite to right hand,N,11h00,"Caribbean reef shark, 4.5'",Dr. A. Velez,2016.06.24-Velez.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5951
73,2016.04.08,08-Apr-16,2016,Unknown,CAPE VERDE,Boa Vista Island,,,a British citizen,M,60,"""Serious""",N,,Shark involvement not confirmed,L.O.Guttke,2016.04.08-CapeVerde.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5920
80,2016.03.26,26-Mar-16,2016,Provoked,BAHAMAS,,,,Henry Kreckman,M,9,Minor injury to chest PROVOKED INCIDENT,N,,"Nurse shark, 2.5-ft","Wisconsin State Journal, 4/2/2016",2016.03.26-Kreckman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5913
83,2016.03.10,10-Mar-16,2016,Unprovoked,FIJI,Vanua Levu,,Diving for beche-de-mer,Maika Tabua,M,45,FATAL,Y,Afternoon,,"Fiji Sun, 3/12/2016",2016.03.10-Tabua.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5910
94,2016.01.29,29-Jan-16,2016,Boating,SOUTH AFRICA,KwaZulu-Natal,,Kayak fishing,Dev De Lange,M,,"No injury, shark capsized kayak",N,,,"Nine News, 2/1/2016",2016.01.29-DeLange.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5899
151,2015.09.00,Sep-15,2015,Unprovoked,FIJI,,,Spearfishing,Viliame Lautiki,M,,Leg bitten,N,,Tiger shark,"Fiji Times, 2/8/2016",2015.09.00-Lautiki.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5841
171,2015.07.06,06-Jul-15,2015,Unknown,FRENCH POLYNESIA,Bora Bora,,Swimming,Joe Termini,M,,Parallel lacerations to torso inconsistent wit...,N,,No shark involvement,"Hollywood Life, 7/6/2015",2015.07.06-Termini.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5821
217,2015.03.29,29-Mar-15,2015,Unknown,ITALY,Sardinia,,Diving,Eugenio Masala,M,43,"FATAL, but shark involvement prior to death un...",Y,,Shark involvement not cofirmed,"A. de Maddalena, GSAF",2015.03.29-Masala.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5775
225,2015.02.15,15-Feb-15,2015,Boating,ATLANTIC OCEAN,,,Transatlantic Rowing,"Avalon, a carbon kevlar monohull: 8 occupants",,,"No injury, shark bit rudder",N,,White shark,"Yorkshire Post, 3/16/2014",2015.02.15-Avalon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5767


In [311]:
df_sharks.loc[df_sharks['Area'].isnull(), 'Area'] = 'Unspecified Area' #Replacing Nulls

In [312]:
df_sharks.loc[df_sharks['Location'].isnull(), 'Location'] = 'Unspecified Location' #Replacing Nulls

In [313]:
df_sharks['Activity'] = df_sharks['Activity'].str.strip().str.lower() # minor cleaning
df_sharks.loc[df_sharks['Activity'].isnull(), 'Activity'] = 'unspecified activity' #Replacing Nulls

df_sharks.loc[df_sharks['Activity'].str.contains('fish'), 'Activity'] = 'fishing'
df_sharks.loc[df_sharks['Activity'].str.contains('divi'), 'Activity'] = 'diving'
df_sharks.loc[df_sharks['Activity'].str.contains('dive'), 'Activity'] = 'diving'
df_sharks.loc[df_sharks['Activity'].str.contains('swim'), 'Activity'] = 'swimming'
df_sharks.loc[df_sharks['Activity'].str.contains('surf'), 'Activity'] = 'surfing'
df_sharks.loc[df_sharks['Activity'].str.contains('unkn'), 'Activity'] = 'unspecified activity'
df_sharks.loc[df_sharks['Activity'].str.contains('boarding'), 'Activity'] = 'boarding'
df_sharks.loc[df_sharks['Activity'].str.contains('capsi'), 'Activity'] = 'boat capsized'
df_sharks.loc[df_sharks['Activity'].str.contains('saili'), 'Activity'] = 'boating'
df_sharks.loc[df_sharks['Activity'].str.contains('bath'), 'Activity'] = 'bathing'
df_sharks.loc[df_sharks['Activity'].str.contains('overboard'), 'Activity'] = 'overboard'
df_sharks.loc[df_sharks['Activity'].str.contains('shark'), 'Activity'] = 'shark interaction'   # A person deliberately interactred with a shark
df_sharks.loc[df_sharks['Activity'].str.contains('boat'), 'Activity'] = 'boat accident'
df_sharks.loc[df_sharks['Activity'].str.contains('wreck'), 'Activity'] = 'boat accident'
df_sharks.loc[df_sharks['Activity'].str.contains('net'), 'Activity'] = 'fishing'
df_sharks.loc[df_sharks['Activity'].str.contains('float'), 'Activity'] = 'floating'

df_sharks['Activity'].value_counts()

fishing                                                                                                                                         1199
surfing                                                                                                                                         1120
swimming                                                                                                                                        1050
unspecified activity                                                                                                                             532
diving                                                                                                                                           524
bathing                                                                                                                                          179
boat accident                                                                                             

In [314]:
# Used
substring_counts={}
names = list(df_sharks['Activity'].value_counts().index)
for i in range(0, len(names)):
    for j in range(i+1,len(names)):
        string1 = names[i]
        string2 = names[j]
        match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
        matching_substring=string1[match.a:match.a+match.size]
        if(matching_substring not in substring_counts):
            substring_counts[matching_substring]=1
        else:
            substring_counts[matching_substring]+=1

print(substring_counts) #{'myKey_': 5, 'myKey_apples': 1, 'o': 1, '': 3}

{'ing': 8478, 'fi': 27, 'hing': 82, 'i': 222, 'sh': 187, '': 838, 'is': 79, 'in': 3143, 'shing': 31, 's': 624, 'shi': 239, 'ish': 8, 'f': 123, 'h': 88, 'hi': 133, 'ng': 140, 'hin': 5, 'r': 306, 'ur': 135, 'su': 90, 'sur': 11, 'u': 180, 'rf': 1, 'm': 173, 'mming': 1, 'im': 6, 'wi': 28, 'sw': 7, 'w': 36, 'ming': 13, 'n': 243, 'imm': 2, 'min': 12, 'ivi': 2, ' ac': 6, 'acti': 2, 'e': 296, 'ti': 153, 'sp': 5, 'ed ': 887, 'if': 61, 'pe': 48, 'un': 72, ' a': 334, 'ci': 24, 'ie': 13, 'cti': 29, 'nsp': 5, 'ns': 10, 'fie': 1, 'ed a': 9, 'd ': 77, 'ec': 11, 'ied ': 2, 'cifi': 2, 'it': 84, 'ac': 68, 'iv': 11, 'ed': 215, 'ty': 2, 'act': 1, 'ifi': 3, 'uns': 1, 'spe': 3, 'ified ': 1, 'd a': 6, 'speci': 2, 'vi': 14, 'd': 161, 'di': 215, 'ving': 5, 'vin': 4, 'at': 298, 'b': 44, 'a': 1011, 'ba': 34, 'th': 163, 'ath': 7, 'bat': 3, 'thing': 1, 't': 221, 'boa': 21, 'o': 207, 'oat': 1, 'de': 294, 'c': 156, 'bo': 36, 't ': 222, 't a': 21, 'ent': 40, 'cide': 4, 'den': 6, 'at ': 16, 'ide': 33, ' ': 64, 'en': 3

In [315]:
{k: v for k, v in sorted(substring_counts.items(), key=lambda item: item[1],reverse=True)}

{'ing ': 11749,
 'ing': 8478,
 'in': 3143,
 ' the ': 2834,
 'a': 1011,
 'ed ': 887,
 '': 838,
 'the ': 814,
 'er': 798,
 ' water': 728,
 'ter': 723,
 ' in ': 713,
 'er ': 634,
 's': 624,
 'an': 558,
 ' in': 524,
 ' s': 508,
 'ding': 502,
 ' wa': 493,
 'e ': 429,
 'n ': 402,
 ' on ': 400,
 'ting ': 383,
 'ar': 342,
 ' a': 334,
 'ra': 334,
 ' o': 323,
 ' of ': 318,
 'ri': 317,
 'the': 315,
 'r': 306,
 'ter ': 305,
 'ing o': 300,
 'at': 298,
 'e': 296,
 'wa': 296,
 'de': 294,
 ' a ': 284,
 'on ': 281,
 'ding ': 245,
 'n': 243,
 'shi': 239,
 'he': 232,
 'i': 222,
 't ': 222,
 't': 221,
 'ing in ': 220,
 'ed': 215,
 'di': 215,
 'water': 215,
 ' with ': 211,
 'o': 207,
 'ad': 194,
 'te': 191,
 'ro': 189,
 'ship ': 188,
 'sh': 187,
 'u': 180,
 'm': 173,
 'ing a': 173,
 ' from ': 171,
 'and': 164,
 'th': 163,
 'd': 161,
 'e s': 158,
 'c': 156,
 'p': 156,
 'ti': 153,
 'a ': 153,
 'li': 151,
 'st': 149,
 ' d': 149,
 'ing t': 147,
 'ng ': 146,
 'to ': 145,
 're': 142,
 'ng': 140,
 'll': 138,
 'ch

In [316]:
df_sharks.loc[df_sharks['Name'].isnull(), 'Name'] = 'Unknown' #Replacing Nulls
df_sharks['Name'] = df_sharks['Name'].str.strip()

In [317]:
indexes_lower = df_sharks['Name'].value_counts()[df_sharks['Name'].value_counts().index.str.islower()].index
list(indexes_lower)
df_sharks.loc[df_sharks['Name'].isin(indexes_lower), 'Name'] = 'Unknown'
df_sharks.loc[df_sharks['Name'].str.contains('Anonymous|Unidentified|Arab boy'), 'Name'] = 'Unknown'

In [318]:
comma_name = df_sharks.loc[df_sharks['Name'].str.contains(r'^[A-Z].*?,\ '), 'Name'].str.extract(r'(^[A-Z].*?),\ ')[0]   #Cleans format: Name Surname, description
df_sharks['Name'].update(comma_name)

In [319]:
occupant = df_sharks.loc[:,'Name'].str.extract(r'ccupan.*?([A-Z].*)')[0] #Cleans format: occupants: Name, (maybe more stuff)
df_sharks['Name'].update(occupant)

In [320]:
parenth = df_sharks.loc[:,'Name'].str.extract(r'([A-Z].*?)\ \((?:fe)?male')[0] #Cleans format: occupants: Name (male/female)
df_sharks['Name'].update(parenth)

In [321]:
df_sharks.loc[df_sharks['Name'].str.contains('\d+\'|boat|yacht'), 'Name'] = 'Boat'

In [322]:
df_sharks.loc[df_sharks['Name'].str.contains('male|diver|occup|^A |boy|girl|^a |^an |sailor|^\d'), 'Name'] = 'Unknown Name' # After the other stuff is clean, we can make several key words unknown

In [323]:
df_sharks.loc[df_sharks['Name'].str.contains(r'boat'), 'Name']

Series([], Name: Name, dtype: object)

In [324]:
df_sharks.rename(columns={'Sex ':'Sex'}, inplace=True)  #Cleaning column name
df_sharks['Sex'].value_counts()

M      4835
F       585
M         2
lli       1
N         1
.         1
Name: Sex, dtype: int64

In [325]:
df_sharks.loc[df_sharks['Sex'].isnull(), 'Sex'] = 'Unknown Sex' #Replacing Nulls
df_sharks.loc[df_sharks['Sex'].str.contains('N'), 'Sex'] = 'M' # Assuming N was intended to be M
df_sharks.loc[df_sharks['Sex'].str.contains('lli|\.'), 'Sex'] = 'Unknown Sex' # Eliminating mistakes

In [326]:
df_sharks['Sex'] = df_sharks['Sex'].str.strip()
df_sharks['Sex'].value_counts() # Clean

M              4838
F               585
Unknown Sex     569
Name: Sex, dtype: int64

In [327]:
df_sharks['Age'] = df_sharks['Age'].astype(str)

In [328]:
df_sharks.loc[df_sharks['Age'].isnull(), 'Age'] = 'Unknown Age' #Replacing Nulls

In [329]:
df_sharks.loc[df_sharks['Age'].str.contains('months'), 'Age'] = '1'
df_sharks.loc[df_sharks['Age'].str.contains('een'), 'Age'] = '15'
df_sharks.loc[df_sharks['Age'].str.contains('^$'), 'Age'] = 'Unknown Age'
df_sharks.loc[df_sharks['Age'].str.contains(r',|&'), 'Age'] = 'Unknown Age'
df_sharks.loc[df_sharks['Age'].str.contains(r'[a-zA-Z]', regex=True), 'Age'] = 'Unknown Age'
df_sharks['Age'] = df_sharks['Age'].str.rstrip('s')
df_sharks['Age'] = df_sharks['Age'].str.rstrip('\'')
df_sharks['Age'] = df_sharks['Age'].str.lstrip('mid-')
df_sharks['Age'] = df_sharks['Age'].str.lstrip('Ca. ')
df_sharks['Age'] = df_sharks['Age'].str.lstrip('>')
df_sharks['Age'] = df_sharks['Age'].str.lstrip('Both ')
df_sharks['Age'] = df_sharks['Age'].str.rstrip('½')
df_sharks['Age'] = df_sharks['Age'].str.strip('?')
df_sharks['Age'] = df_sharks['Age'].str.strip()

In [330]:
or_clean = df_sharks.loc[:,'Age'].str.extract(r'(\d+)\ or')[0] #Cleans format: age or age and keeps the first age
df_sharks['Age'].update(or_clean)

In [331]:
to_clean = df_sharks.loc[:,'Age'].str.extract(r'(\d+)\ to')[0] #Cleans format: age to age and keeps the first age
df_sharks['Age'].update(to_clean)

In [332]:
df_sharks.loc[df_sharks['Injury'].isnull(), 'Injury'] = 'No details' #Replacing Nulls

In [359]:
fatal = df_sharks.loc[:,'Injury'].str.extract(r'(?:Presumed\ )?(?:FATAL),?\ (.*)')[0] #Cleans format: FATAL 
df_sharks['Injury'].update(fatal)
fatal = df_sharks.loc[:,'Injury'].str.extract(r'(.*?),?\ FATAL')[0]
df_sharks['Injury'].update(fatal)
fatal = df_sharks.loc[:,'Injury'].str.extract(r'(FATAL)(\.\ .*)?')[0] 
df_sharks['Injury'].update(fatal)
fatal = df_sharks.loc[:,'Injury'].str.extract(r'(.*?)PROVOKED')[0] 
df_sharks['Injury'].update(fatal)

In [340]:
df_sharks.loc[df_sharks['Injury'].str.contains('^$'), 'Injury'] = 'No details'
df_sharks.loc[df_sharks['Injury'].str.contains(r'No injury|No Injury|no injury|no Injury', regex=True), 'Injury'] = 'No injury'
df_sharks.loc[df_sharks['Injury'].str.contains(r'FATAL|Survived', regex=True), 'Injury'] = 'No details'
df_sharks['Injury'] = df_sharks['Injury'].str.strip()

In [347]:
df_sharks.rename(columns={'Fatal (Y/N)':'Fatal'}, inplace=True)  #Cleaning column name
df_sharks['Fatal'].value_counts()

N          4325
Y          1552
UNKNOWN     115
Name: Fatal, dtype: int64

In [346]:
df_sharks.loc[df_sharks['Fatal'].isnull(), 'Fatal'] = 'UNKNOWN' #Replacing Nulls
df_sharks['Fatal'] = df_sharks['Fatal'].str.strip()
df_sharks.loc[df_sharks['Fatal'].str.contains(r'n', regex=True), 'Fatal'] = 'N'
df_sharks.loc[df_sharks['Fatal'].str.contains(r'F|#VALUE!', regex=True), 'Fatal'] = 'UNKNOWN'

In [379]:
df_sharks['Time'].value_counts()

Unknown Time              3213
Afternoon                  174
11h00                      126
12h00                      110
Morning                    107
15h00                      102
14h00                       95
16h00                       94
16h30                       73
13h00                       72
14h30                       72
17h30                       70
17h00                       68
15h30                       62
18h00                       62
11h30                       61
Night                       58
13h30                       56
10h00                       56
09h00                       49
10h30                       47
Evening                     33
09h30                       33
12h30                       33
Late afternoon              32
07h30                       31
08h00                       30
18h30                       29
19h00                       25
08h30                       25
07h00                       22
17h15                       18
15h45   

In [378]:
df_sharks.loc[df_sharks['Time'].isnull(), 'Time'] = 'Unknown Time' #Replacing Nulls
df_sharks.loc[df_sharks['Time'].str.contains('8:04'), 'Time'] = '20h04'
df_sharks.loc[df_sharks['Time'].str.contains('15j45'), 'Time'] = '15h45'
df_sharks.loc[df_sharks['Time'].str.contains('06j00'), 'Time'] = '06h00'

In [361]:
hours = df_sharks.loc[:,'Time'].str.extract(r'(\d{2}h\d{2})')[0] #Cleans format: FATAL 
df_sharks['Time'].update(hours)

In [380]:
df_sharks.loc[df_sharks['Time'].str.contains('[^h]'), 'Time'].value_counts()

Unknown Time              3213
Afternoon                  174
11h00                      126
12h00                      110
Morning                    107
15h00                      102
14h00                       95
16h00                       94
16h30                       73
13h00                       72
14h30                       72
17h30                       70
17h00                       68
15h30                       62
18h00                       62
11h30                       61
Night                       58
13h30                       56
10h00                       56
09h00                       49
10h30                       47
Evening                     33
09h30                       33
12h30                       33
Late afternoon              32
07h30                       31
08h00                       30
18h30                       29
19h00                       25
08h30                       25
07h00                       22
17h15                       18
15h45   

In [339]:
df_sharks.sample(50)

Unnamed: 0,case_number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,original order
4796,1931.01.14,14-Jan-31,1931,Unprovoked,AUSTRALIA,Torres Strait,Unspecified Location,diving,Albertus,M,Unknown Age,"Thigh, kneecap & lower leg badly lacerated",N,,,"V.M. Coppleson (1958), p.242",1931.01.14-Albertus.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1197
494,2012.12.05,05-Dec-12,2012,Unprovoked,USA,Hawaii,Kauai,surfing,"""Lorrin""",M,60,Lacerations to left foot,N,13h20,10' shark,"Hawaii News Now, 11/5/2012",2012.12.05-Kauai.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5499
1056,2008.04.03,03-Apr-08,2008,Unprovoked,USA,Florida,"South of Ponce de Leon Jetty, New Smyrna Beach...",surfing,Joey Giangrasso,M,18,Right foot & ankle bitten,N,12h00,,"S. Petersohn, GSAF",2008.04.03-Giangrasso.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4937
727,2011.01.31,31-Jan-11,2011,Unprovoked,MEXICO,Quintana Roo,Cancun,swimming,Nicole Moore,F,38,"Leg, forearm & hand severely bitten",N,12h00,6' shark,"El Diario de Yucatan, 2/1/2011",2011.01.31-Moore.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5266
5164,1910.06.25.R,Reported 25-Jun-1910,1910,Unprovoked,MEXICO,Unspecified Area,LaBarra,swimming,H. Gebler,M,Unknown Age,Leg bitten,N,,,"Indiana Evening Gazette, 9/25/1910",1910.06.25.R-Gebler.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,829
5395,1895.06.03.R,Reported 03-Jun-1895,1895,Unprovoked,AUSTRALIA,Western Australia,Barrow Passage,diving,Unknown Name,M,Unknown Age,FATAL,Y,,,"The West Australian, 6/3/1895",1895.06.03.R-JapaneseDiver.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,598
2628,1985.07.00,Mid Jul-1985 or mid Jul-1986,1985,Unprovoked,ITALY,Sicily,Punta Secca,snorkeling,Neil Montoya,M,13,Contusion of left foot,N,16h00,"3 m to 3.6 m [10' to 11'9""] white shark","A. De Maddalena; De Maddalena (2001), N. Monto...",1985.07.00-Montoya.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,3366
471,2013.03.16.b,16-Mar-13,2013,Provoked,SOUTH AFRICA,Western Cape Province,De Mond,fishing,Kobus Koeberg,M,30,Lacerations to left calf and heel from hooked ...,N,09h00,"Raggedtooth shark, 1.5 m",National Sea Rescue Institute,2013.03.16.b-Koeberg.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5522
2868,1979.08.03,03-Aug-79,1979,Unprovoked,THAILAND,Southern Thailand,Unspecified Location,murdered by thai pirates,Unknown,M,Unknown Age,FATAL,Y,,,"The Canberra Times, 8/3/1979",1979.08.03.R-Pirates.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,3125
1914,1999.01.13,13-Jan-1999,1999,Unprovoked,SOUTH AFRICA,Eastern Cape Province,Bonza Beach,paddle skiing,Evan Ridge,M,Unknown Age,No injury,N,,"White shark, 4 m [13']",The Citizen,1999.01.13-Ridge.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,4079


In [353]:
df_sharks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5992 entries, 0 to 5991
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   case_number             5992 non-null   object
 1   Date                    5992 non-null   object
 2   Year                    5992 non-null   int64 
 3   Type                    5992 non-null   object
 4   Country                 5992 non-null   object
 5   Area                    5992 non-null   object
 6   Location                5992 non-null   object
 7   Activity                5992 non-null   object
 8   Name                    5992 non-null   object
 9   Sex                     5992 non-null   object
 10  Age                     5992 non-null   object
 11  Injury                  5992 non-null   object
 12  Fatal                   5992 non-null   object
 13  Time                    5992 non-null   object
 14  Species                 3058 non-null   object
 15  Inve