In [515]:
import pandas as pd
import numpy as np
import re
from difflib import SequenceMatcher
import os

In [595]:
# Notebook-wide pandas display settings: up to 150 rows before truncating,
# 100 rows in the truncated view, and no limit on column width.
for _option_name, _option_value in (('display.max_rows', 150), ('display.min_rows', 100), ("max_colwidth", None)):
    pd.set_option(_option_name, _option_value)

In [517]:
# Load the raw incident table from GSAF5.csv (decoded as latin-1) into df_sharks.
df_sharks = pd.read_csv('GSAF5.csv', encoding='latin-1')

In [518]:
# (rows, columns) of the frame as loaded.
df_sharks.shape

(5992, 24)

In [519]:
# keep=False flags every member of a duplicated group, so any fully identical rows would all be listed.
df_sharks[df_sharks.duplicated(keep=False)] # Empty result: no true duplicates

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23


In [520]:
# Give the case-number column a snake_case label; all other labels are left as they are.
df_sharks = df_sharks.rename(columns={'Case Number': 'case_number'})

In [521]:
# Discard the two trailing "Unnamed" columns.
df_sharks = df_sharks.drop(columns=['Unnamed: 22', 'Unnamed: 23'])

In [522]:
# Normalise the separators in the three case-number columns: every non-word
# character becomes a dot, so the columns can be compared with each other.
case_columns = ['case_number', 'Case Number.1', 'Case Number.2']
df_sharks[case_columns] = df_sharks[case_columns].apply(
    lambda values: values.str.replace(r'[\W]', '.', regex=True)
)

In [523]:
# Rows where at least one of the alternate case-number columns disagrees with case_number.
differs_from_first = df_sharks['case_number'].ne(df_sharks['Case Number.1'])
differs_from_second = df_sharks['case_number'].ne(df_sharks['Case Number.2'])
cases = df_sharks[differs_from_first | differs_from_second]

In [524]:
# Show the three case-number columns side by side for the disagreeing rows.
cases[['case_number','Case Number.1','Case Number.2']] # Not many differences, so Case Number.1 and Case Number.2 are most likely irrelevant

Unnamed: 0,case_number,Case Number.1,Case Number.2
4,2016.09.15,2016.09.16,2016.09.15
33,2016.07.14.4,2016.07.14.R,2016.07.14.4
97,2016.01.24.b,2015.01.24.b,2016.01.24.b
116,2015.12.23,2015.11.07,2015.12.23
121,2015.10.28.a,2015.10.28,2015.10.28.a
3654,1961.09.02.R,1961.09.06.R,1961.09.02.R
4177,1952.08.05,1952.08.04,1952.08.05
4719,1934.01.08.R,1934.02.08.R,1934.02.08.R
5043,1900.00.00.R,1919.00.00.R,1900.00.00.R
5150,1911.07.31.R,1911.07.31.T,1911.07.31.R


In [525]:
# The alternate case-number columns add almost nothing beyond case_number, so drop them.
df_sharks = df_sharks.drop(columns=['Case Number.1', 'Case Number.2'])

In [526]:
# Frequency of each incident type before any cleaning.
df_sharks['Type'].value_counts()

Unprovoked      4386
Provoked         557
Invalid          519
Sea Disaster     220
Boat             200
Boating          110
Name: Type, dtype: int64

In [527]:
# Correct and unify the incident types: 'Invalid' is relabelled 'Unknown' and
# 'Boat' is merged into 'Boating'. The result is assigned back to the column
# instead of calling replace(inplace=True) on the selected Series: that chained
# form acts on a temporary object and is not written back to df_sharks once
# pandas Copy-on-Write is active.
df_sharks['Type'] = df_sharks['Type'].replace({'Invalid': 'Unknown', 'Boat': 'Boating'})

In [528]:
# Frequency of each incident type after relabelling.
df_sharks['Type'].value_counts()

Unprovoked      4386
Provoked         557
Unknown          519
Boating          310
Sea Disaster     220
Name: Type, dtype: int64

In [529]:
# Normalise country names: upper-case, drop the '?' markers and trim whitespace
# on both sides. (The earlier r'^\s' pattern removed only a single leading
# whitespace character and never trailing blanks, so padded spellings of the
# same country could survive as separate values.)
df_sharks['Country'] = (
    df_sharks['Country']
    .str.upper()
    .str.replace('?', '', regex=False)
    .str.strip()
)
df_sharks.loc[5991, 'Country'] = 'SRI LANKA'  # Manual correction of one row, addressed by its index label

In [530]:
# Drop the '?' markers and trim whitespace on both sides of Area. (The earlier
# r'^\s' pattern removed only one leading whitespace character and left
# trailing blanks in place.)
df_sharks['Area'] = df_sharks['Area'].str.replace('?', '', regex=False).str.strip()

In [531]:
# Missing countries get one shared placeholder (International Waters or unknown).
df_sharks['Country'] = df_sharks['Country'].fillna('IW or Unknown')

In [532]:
# Drop the '?' markers and trim whitespace on both sides of Location. (The
# earlier r'^\s' pattern removed only one leading whitespace character and left
# trailing blanks in place.)
df_sharks['Location'] = df_sharks['Location'].str.replace('?', '', regex=False).str.strip()
# Inspect the rows that have no location at all.
df_sharks[df_sharks['Location'].isnull()]

Unnamed: 0,case_number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,original order
21,2016.07.28.R,28-Jul-16,2016,Unprovoked,CHINA,Hong Kong,,Swimming,Justus Franz,M,72,Lacerations to leg,N,,,"Klassick, 7/28/2016",2016.07.28.R-Franz.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5972
42,2016.06.24,24-Jun-16,2016,Unprovoked,COLUMBIA,Isla Provedencia,,Scuba Diving,Arturo Velez,M,59,Severe bite to right hand,N,11h00,"Caribbean reef shark, 4.5'",Dr. A. Velez,2016.06.24-Velez.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5951
73,2016.04.08,08-Apr-16,2016,Unknown,CAPE VERDE,Boa Vista Island,,,a British citizen,M,60,"""Serious""",N,,Shark involvement not confirmed,L.O.Guttke,2016.04.08-CapeVerde.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5920
80,2016.03.26,26-Mar-16,2016,Provoked,BAHAMAS,,,,Henry Kreckman,M,9,Minor injury to chest PROVOKED INCIDENT,N,,"Nurse shark, 2.5-ft","Wisconsin State Journal, 4/2/2016",2016.03.26-Kreckman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5913
83,2016.03.10,10-Mar-16,2016,Unprovoked,FIJI,Vanua Levu,,Diving for beche-de-mer,Maika Tabua,M,45,FATAL,Y,Afternoon,,"Fiji Sun, 3/12/2016",2016.03.10-Tabua.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5910
94,2016.01.29,29-Jan-16,2016,Boating,SOUTH AFRICA,KwaZulu-Natal,,Kayak fishing,Dev De Lange,M,,"No injury, shark capsized kayak",N,,,"Nine News, 2/1/2016",2016.01.29-DeLange.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5899
151,2015.09.00,Sep-15,2015,Unprovoked,FIJI,,,Spearfishing,Viliame Lautiki,M,,Leg bitten,N,,Tiger shark,"Fiji Times, 2/8/2016",2015.09.00-Lautiki.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5841
171,2015.07.06,06-Jul-15,2015,Unknown,FRENCH POLYNESIA,Bora Bora,,Swimming,Joe Termini,M,,Parallel lacerations to torso inconsistent wit...,N,,No shark involvement,"Hollywood Life, 7/6/2015",2015.07.06-Termini.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5821
217,2015.03.29,29-Mar-15,2015,Unknown,ITALY,Sardinia,,Diving,Eugenio Masala,M,43,"FATAL, but shark involvement prior to death un...",Y,,Shark involvement not cofirmed,"A. de Maddalena, GSAF",2015.03.29-Masala.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5775
225,2015.02.15,15-Feb-15,2015,Boating,ATLANTIC OCEAN,,,Transatlantic Rowing,"Avalon, a carbon kevlar monohull: 8 occupants",,,"No injury, shark bit rudder",N,,White shark,"Yorkshire Post, 3/16/2014",2015.02.15-Avalon.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,5767


In [533]:
# Replace missing areas with an explicit placeholder.
df_sharks['Area'] = df_sharks['Area'].fillna('Unspecified Area')

In [534]:
# Replace missing locations with an explicit placeholder.
df_sharks['Location'] = df_sharks['Location'].fillna('Unspecified Location')

In [535]:
# Collapse the free-text activity descriptions into a small set of categories.
activity = df_sharks['Activity'].str.strip().str.lower().fillna('unspecified activity')

# (substring, category) rules, applied strictly in this order. Every rule sees
# the labels written by the rules before it, so a later rule can re-label an
# earlier category.
# NOTE(review): 'boat capsized' and 'boating' both contain 'boat', so the later
# 'boat' rule turns them into 'boat accident' -- confirm that this is intended.
activity_rules = [
    ('fish', 'fishing'),
    ('divi', 'diving'),
    ('dive', 'diving'),
    ('swim', 'swimming'),
    ('surf', 'surfing'),
    ('unkn', 'unspecified activity'),
    ('boarding', 'boarding'),
    ('capsi', 'boat capsized'),
    ('saili', 'boating'),
    ('bath', 'bathing'),
    ('overboard', 'overboard'),
    ('shark', 'shark interaction'),  # A person deliberately interacted with a shark
    ('boat', 'boat accident'),
    ('wreck', 'boat accident'),
    ('net', 'fishing'),
    ('float', 'floating'),
]
for keyword, category in activity_rules:
    activity = activity.mask(activity.str.contains(keyword), category)
df_sharks['Activity'] = activity

df_sharks['Activity'].value_counts()

fishing                                                                                                                                         1199
surfing                                                                                                                                         1120
swimming                                                                                                                                        1050
unspecified activity                                                                                                                             532
diving                                                                                                                                           524
bathing                                                                                                                                          179
boat accident                                                                                             

In [536]:
def count_common_substrings(names):
    """Count the longest common substring of every unordered pair of strings.

    Exploration helper used to spot recurring fragments (such as 'ing' or
    'boat') among the distinct values of a text column. It was previously
    disabled by wrapping the code in a string literal; as a function it can be
    re-run on demand without executing on every pass through the notebook.

    Parameters
    ----------
    names : iterable of str
        The strings to compare, e.g.
        ``df_sharks['Activity'].value_counts().index``.

    Returns
    -------
    dict
        Maps each longest matching block (as reported by
        ``SequenceMatcher.find_longest_match``) to the number of pairs that
        produced it. Pairs with nothing in common are counted under ``''``.
    """
    names = list(names)
    substring_counts = {}
    for position, first in enumerate(names):
        for second in names[position + 1:]:
            match = SequenceMatcher(None, first, second).find_longest_match(
                0, len(first), 0, len(second)
            )
            shared = first[match.a:match.a + match.size]
            substring_counts[shared] = substring_counts.get(shared, 0) + 1
    return substring_counts

{'ing': 8478, 'fi': 27, 'hing': 82, 'i': 222, 'sh': 187, '': 838, 'is': 79, 'in': 3143, 'shing': 31, 's': 624, 'shi': 239, 'ish': 8, 'f': 123, 'h': 88, 'hi': 133, 'ng': 140, 'hin': 5, 'r': 306, 'ur': 135, 'su': 90, 'sur': 11, 'u': 180, 'rf': 1, 'm': 173, 'mming': 1, 'im': 6, 'wi': 28, 'sw': 7, 'w': 36, 'ming': 13, 'n': 243, 'imm': 2, 'min': 12, 'ivi': 2, ' ac': 6, 'acti': 2, 'e': 296, 'ti': 153, 'sp': 5, 'ed ': 887, 'if': 61, 'pe': 48, 'un': 72, ' a': 334, 'ci': 24, 'ie': 13, 'cti': 29, 'nsp': 5, 'ns': 10, 'fie': 1, 'ed a': 9, 'd ': 77, 'ec': 11, 'ied ': 2, 'cifi': 2, 'it': 84, 'ac': 68, 'iv': 11, 'ed': 215, 'ty': 2, 'act': 1, 'ifi': 3, 'uns': 1, 'spe': 3, 'ified ': 1, 'd a': 6, 'speci': 2, 'vi': 14, 'd': 161, 'di': 215, 'ving': 5, 'vin': 4, 'at': 298, 'b': 44, 'a': 1011, 'ba': 34, 'th': 163, 'ath': 7, 'bat': 3, 'thing': 1, 't': 221, 'boa': 21, 'o': 207, 'oat': 1, 'de': 294, 'c': 156, 'bo': 36, 't ': 222, 't a': 21, 'ent': 40, 'cide': 4, 'den': 6, 'at ': 16, 'ide': 33, ' ': 64, 'en': 3

In [537]:
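# Disabled: lists the substring counts from the previous cell sorted by frequency, highest first.
# The output shown below was produced by an earlier run of this line.
#{k: v for k, v in sorted(substring_counts.items(), key=lambda item: item[1],reverse=True)}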

{'ing ': 11749,
 'ing': 8478,
 'in': 3143,
 ' the ': 2834,
 'a': 1011,
 'ed ': 887,
 '': 838,
 'the ': 814,
 'er': 798,
 ' water': 728,
 'ter': 723,
 ' in ': 713,
 'er ': 634,
 's': 624,
 'an': 558,
 ' in': 524,
 ' s': 508,
 'ding': 502,
 ' wa': 493,
 'e ': 429,
 'n ': 402,
 ' on ': 400,
 'ting ': 383,
 'ar': 342,
 ' a': 334,
 'ra': 334,
 ' o': 323,
 ' of ': 318,
 'ri': 317,
 'the': 315,
 'r': 306,
 'ter ': 305,
 'ing o': 300,
 'at': 298,
 'e': 296,
 'wa': 296,
 'de': 294,
 ' a ': 284,
 'on ': 281,
 'ding ': 245,
 'n': 243,
 'shi': 239,
 'he': 232,
 'i': 222,
 't ': 222,
 't': 221,
 'ing in ': 220,
 'ed': 215,
 'di': 215,
 'water': 215,
 ' with ': 211,
 'o': 207,
 'ad': 194,
 'te': 191,
 'ro': 189,
 'ship ': 188,
 'sh': 187,
 'u': 180,
 'm': 173,
 'ing a': 173,
 ' from ': 171,
 'and': 164,
 'th': 163,
 'd': 161,
 'e s': 158,
 'c': 156,
 'p': 156,
 'ti': 153,
 'a ': 153,
 'li': 151,
 'st': 149,
 ' d': 149,
 'ing t': 147,
 'ng ': 146,
 'to ': 145,
 're': 142,
 'ng': 140,
 'll': 138,
 'ch

In [538]:
# Missing names become 'Unknown'; surrounding whitespace is trimmed from the rest.
df_sharks['Name'] = df_sharks['Name'].fillna('Unknown').str.strip()

In [539]:
# Distinct Name values written entirely in lower case are treated as unknown.
name_counts = df_sharks['Name'].value_counts()
indexes_lower = name_counts.index[name_counts.index.str.islower()]
all_lowercase = df_sharks['Name'].isin(indexes_lower)
df_sharks['Name'] = df_sharks['Name'].mask(all_lowercase, 'Unknown')

# So are entries that explicitly say the person was not identified.
not_identified = df_sharks['Name'].str.contains('Anonymous|Unidentified|Arab boy')
df_sharks['Name'] = df_sharks['Name'].mask(not_identified, 'Unknown')

In [540]:
# Cleans the format "Name Surname, description": keep only the text before the
# first ", ". Rows that do not match extract to NaN and keep their current value.
# The column is assigned back rather than calling df_sharks['Name'].update(...):
# that chained call modifies a temporary object and leaves df_sharks unchanged
# once pandas Copy-on-Write is active.
comma_name = df_sharks['Name'].str.extract(r'(^[A-Z].*?),\ ')[0]
df_sharks['Name'] = comma_name.fillna(df_sharks['Name'])

In [541]:
# Cleans the format "occupants: Name, (maybe more stuff)": keep everything from
# the first capital letter after "ccupan". Rows without the pattern extract to
# NaN and keep their current value. Assigned back instead of the chained
# df_sharks['Name'].update(...), which is not written to df_sharks under pandas
# Copy-on-Write.
occupant = df_sharks['Name'].str.extract(r'ccupan.*?([A-Z].*)')[0]
df_sharks['Name'] = occupant.fillna(df_sharks['Name'])

In [542]:
# Cleans the format "Name (male)" / "Name (female)": keep the text before the
# parenthesis. Rows without the pattern extract to NaN and keep their current
# value. Assigned back instead of the chained df_sharks['Name'].update(...),
# which is not written to df_sharks under pandas Copy-on-Write.
parenth = df_sharks['Name'].str.extract(r'([A-Z].*?)\ \((?:fe)?male')[0]
df_sharks['Name'] = parenth.fillna(df_sharks['Name'])

In [543]:
# Entries mentioning a length in feet (digits followed by '), "boat" or "yacht"
# are labelled 'Boat'. The pattern is a raw string: "\d" inside a normal string
# literal is an invalid escape sequence and triggers a SyntaxWarning on Python 3.12+.
df_sharks.loc[df_sharks['Name'].str.contains(r"\d+'|boat|yacht"), 'Name'] = 'Boat'

In [544]:
# After the other formats are cleaned, any entry that still contains one of
# these keywords (or starts with a digit) is treated as unknown. The pattern is
# a raw string: "\d" inside a normal string literal is an invalid escape
# sequence and triggers a SyntaxWarning on Python 3.12+.
# NOTE(review): this writes 'Unknown Name' while the earlier Name cells write
# 'Unknown' -- confirm which placeholder is intended.
df_sharks.loc[df_sharks['Name'].str.contains(r'male|diver|occup|^A |boy|girl|^a |^an |sailor|^\d'), 'Name'] = 'Unknown Name'

In [545]:
# Sanity check: list any Name that still contains lower-case "boat".
df_sharks.loc[df_sharks['Name'].str.contains(r'boat'), 'Name']

Series([], Name: Name, dtype: object)

In [546]:
# The raw header carries a trailing space ('Sex '); rename it, then look at the values.
df_sharks = df_sharks.rename(columns={'Sex ': 'Sex'})
df_sharks['Sex'].value_counts()

M      4835
F       585
M         2
lli       1
N         1
.         1
Name: Sex, dtype: int64

In [547]:
df_sharks.loc[df_sharks['Sex'].isnull(), 'Sex'] = 'Unknown Sex' # Replacing nulls
df_sharks.loc[df_sharks['Sex'].str.contains('N'), 'Sex'] = 'M' # Assuming N was intended to be M
# Raw string: "\." inside a normal string literal is an invalid escape sequence
# and triggers a SyntaxWarning on Python 3.12+.
df_sharks.loc[df_sharks['Sex'].str.contains(r'lli|\.'), 'Sex'] = 'Unknown Sex' # Eliminating mistakes ('lli' and '.')

In [548]:
# Trim surrounding whitespace so padded values merge with their unpadded spelling.
df_sharks['Sex'] = df_sharks['Sex'].str.strip()
df_sharks['Sex'].value_counts() # Clean

M              4838
F               585
Unknown Sex     569
Name: Sex, dtype: int64

In [549]:
# Cast Age to str so the .str accessors used in the following cells apply to every row.
# NOTE(review): on an object column astype(str) presumably turns missing values into the literal text 'nan' -- confirm for the pandas version in use.
df_sharks['Age'] = df_sharks['Age'].astype(str)

In [550]:
# Replacing nulls. Age was cast with astype(str) in the previous cell, which can
# leave missing values as the literal text 'nan' that isnull() does not detect,
# so both representations of "missing" are handled here.
age_missing = df_sharks['Age'].isnull() | df_sharks['Age'].eq('nan')
df_sharks.loc[age_missing, 'Age'] = 'Unknown Age'

In [551]:
# Reduce the free-text Age values to a single whole number where one can be read
# off, and to 'Unknown Age' otherwise.
#
# The catch-all for values containing letters now runs LAST. It used to run
# before the prefix/suffix clean-up, so values such as "mid-30s", "Ca. 33",
# "Both 11" or "21 or 26" were already wiped to 'Unknown Age' before they could
# be parsed. Prefixes are removed with anchored patterns instead of
# str.lstrip(), which strips a *set of characters* rather than a prefix.
ages = df_sharks['Age'].fillna('Unknown Age').astype(str).str.strip()

ages = ages.mask(ages.str.contains('months'), '1')           # ages given in months count as 1
ages = ages.mask(ages.str.contains('een'), '15')             # values containing "een" are mapped to 15
ages = ages.mask(ages.str.contains(r',|&'), 'Unknown Age')   # lists of several ages cannot be reduced to one

ages = ages.str.replace(r'^(?:mid-|Ca\.|>|Both)\s*', '', regex=True)    # leading qualifiers
ages = ages.str.replace(r'^(\d+)\s+(?:or|to)\b.*$', r'\1', regex=True)  # "A or B" / "A to B": keep the first age
ages = ages.str.replace(r"(?:'s|s|'|½)$", '', regex=True)               # "20s", "60's", "6½"
ages = ages.str.strip('?').str.strip()

# Whatever still contains letters (or ended up empty) cannot be parsed.
unparsed = ages.str.contains(r'[a-zA-Z]') | ages.eq('')
df_sharks['Age'] = ages.mask(unparsed, 'Unknown Age')

In [552]:
# Cleans the format "age or age" and keeps the first age. Rows without the
# pattern extract to NaN and keep their current value. Assigned back instead of
# the chained df_sharks['Age'].update(...), which is not written to df_sharks
# under pandas Copy-on-Write.
or_clean = df_sharks['Age'].str.extract(r'(\d+)\ or')[0]
df_sharks['Age'] = or_clean.fillna(df_sharks['Age'])

In [553]:
# Cleans the format "age to age" and keeps the first age. Rows without the
# pattern extract to NaN and keep their current value. Assigned back instead of
# the chained df_sharks['Age'].update(...), which is not written to df_sharks
# under pandas Copy-on-Write.
to_clean = df_sharks['Age'].str.extract(r'(\d+)\ to')[0]
df_sharks['Age'] = to_clean.fillna(df_sharks['Age'])

In [554]:
# Replace missing injury descriptions with an explicit placeholder.
df_sharks['Injury'] = df_sharks['Injury'].fillna('No details')

In [555]:
# Strip the FATAL / PROVOKED markers out of the injury description.
# The patterns are applied in order; for each one, rows that match are replaced
# by the first capture group and rows that do not match keep their current text.
# The column is rebuilt and assigned back instead of calling
# df_sharks['Injury'].update(...), which acts on a temporary object and is not
# written to df_sharks under pandas Copy-on-Write.
injury_patterns = (
    r'(?:Presumed\ )?(?:FATAL),?\ (.*)',  # keep the text that follows "FATAL, " / "FATAL "
    r'(.*?),?\ FATAL',                    # keep the text that precedes " FATAL"
    r'(FATAL)(\.\ .*)?',                  # anything still containing FATAL collapses to "FATAL"
    r'(.*?)PROVOKED',                     # keep the text that precedes "PROVOKED"
)
injury = df_sharks['Injury']
for injury_pattern in injury_patterns:
    fatal = injury.str.extract(injury_pattern)[0]
    injury = fatal.fillna(injury)
df_sharks['Injury'] = injury

In [556]:
# Normalise the remaining injury text, one rule after the other.
injury_text = df_sharks['Injury']
# Empty strings carry no information.
injury_text = injury_text.mask(injury_text.str.contains('^$'), 'No details')
# Any capitalisation of "no injury" collapses to a single label.
injury_text = injury_text.mask(
    injury_text.str.contains(r'No injury|No Injury|no injury|no Injury', regex=True),
    'No injury',
)
# Outcome-only entries say nothing about the injury itself.
injury_text = injury_text.mask(
    injury_text.str.contains(r'FATAL|Survived', regex=True),
    'No details',
)
df_sharks['Injury'] = injury_text.str.strip()

In [557]:
# Shorten the column label, then look at the raw values.
df_sharks = df_sharks.rename(columns={'Fatal (Y/N)': 'Fatal'})
df_sharks['Fatal'].value_counts()

N          4315
Y          1552
UNKNOWN      94
 N            8
F             1
N             1
#VALUE!       1
n             1
Name: Fatal, dtype: int64

In [558]:
# Reduce Fatal to the values that remain after cleaning (Y, N, UNKNOWN).
fatal_flag = df_sharks['Fatal'].fillna('UNKNOWN').str.strip()
# A lower-case 'n' is the same answer as 'N'.
fatal_flag = fatal_flag.mask(fatal_flag.str.contains(r'n', regex=True), 'N')
# 'F' and the spreadsheet error marker cannot be interpreted.
fatal_flag = fatal_flag.mask(fatal_flag.str.contains(r'F|#VALUE!', regex=True), 'UNKNOWN')
df_sharks['Fatal'] = fatal_flag

In [559]:
# Missing times get a placeholder, then the (pattern, replacement) rules below
# are applied in order: three individual typo corrections, followed by a
# catch-all that discards any value containing a lower-case letter other than
# 'h' (the hour separator used by the "HHhMM" format).
time_values = df_sharks['Time'].fillna('Unknown Time')
time_rules = (
    ('8:04', '20h04'),
    ('15j45', '15h45'),
    ('06j00', '06h00'),
    ('[abcdefgijklmnopqrstuvwxyz]', 'Unknown Time'),
)
for time_pattern, time_replacement in time_rules:
    time_values = time_values.mask(time_values.str.contains(time_pattern), time_replacement)
df_sharks['Time'] = time_values

In [560]:
# Extracts the first "HHhMM" time found inside a longer string and keeps only
# that. Rows without such a time extract to NaN and keep their current value.
# Assigned back instead of the chained df_sharks['Time'].update(...), which is
# not written to df_sharks under pandas Copy-on-Write.
hours = df_sharks['Time'].str.extract(r'(\d{2}h\d{2})')[0]
df_sharks['Time'] = hours.fillna(df_sharks['Time'])

In [572]:
# The raw header carries a trailing space ('Species '); rename it, then look at the values.
df_sharks = df_sharks.rename(columns={'Species ': 'Species'})
df_sharks['Species'].value_counts()

unknown species                      4093
white shark                           615
tiger shark                           259
bull shark                            166
nurse shark                            94
reef shark                             66
blacktip shark                         65
whaler shark                           65
mako shark                             54
raggedtooth shark                      44
hammerhead shark                       44
spinner shark                          43
wobbegong shark                        42
blue shark                             41
lemon shark                            34
zambesi shark                          32
whitetip shark                         21
sand shark                             19
sandtiger shark                        19
dusky shark                            16
sevengill shark                        10
carpet shark                            8
porbeagle shark                         7
copper shark                      

In [571]:
df_sharks.loc[df_sharks['Species'].isnull(), 'Species'] = 'Unknown Species' # Replacing nulls
df_sharks['Species'] = df_sharks['Species'].str.strip().str.lower()

# Looks for "____ shark": the single word directly in front of "shark".
# Rows without the pattern extract to NaN and keep their current text. The
# column is assigned back instead of the chained df_sharks['Species'].update(...),
# which is not written to df_sharks under pandas Copy-on-Write.
# NOTE(review): \w+ stops at a hyphen, so a hyphenated name keeps only the part
# after the last hyphen -- confirm that is acceptable.
species = df_sharks['Species'].str.extract(r'\ ?(\w+\"?\ shark)')[0]
df_sharks['Species'] = species.fillna(df_sharks['Species'])

# Keyword rules, applied in order; each one sees the labels written by the previous ones.
df_sharks.loc[df_sharks['Species'].str.contains(r'cat|canic', regex=True), 'Species'] = 'catshark'
df_sharks.loc[df_sharks['Species'].str.contains(r'gange', regex=True), 'Species'] = 'gangeticus'
df_sharks.loc[df_sharks['Species'].str.contains(r'iden|vici|confirm|^$', regex=True), 'Species'] = 'unknown species'
df_sharks.loc[df_sharks['Species'].str.contains(r'black', regex=True), 'Species'] = 'blacktip shark'
# Sizes, weights, counts and other descriptions that do not name a species.
df_sharks.loc[df_sharks['Species'].str.contains(r'\d|m\ shark|lb\ shark|no\ shark|kg\ shark|of\ shark|colored\ shark|a\ shark|juvenile|two|from|small|involv|as\ shark|several|larg|hull|little|later|skin|bite|girth|for\ shark|sting|unknown', regex=True), 'Species'] = 'unknown species'
# Drop any remaining double quotes.
df_sharks['Species'] = df_sharks.loc[:,'Species'].str.replace(r'\"', '',regex=True)

In [578]:
# Shorten the column label, then look at the most frequent sources.
df_sharks = df_sharks.rename(columns={'Investigator or Source': 'Source'})
df_sharks['Source'].value_counts()

C. Moore, GSAF                                                                                                              94
S. Petersohn, GSAF                                                                                                          82
C. Creswell, GSAF                                                                                                           81
R. Collier                                                                                                                  54
T. Peake, GSAF                                                                                                              48
M. Levine, GSAF                                                                                                             45
R. Collier, GSAF                                                                                                            35
A. Gifford, GSAF                                                                                               

In [610]:
# Trim surrounding whitespace from the pdf file names, then count how often each one occurs.
df_sharks['pdf'] = df_sharks['pdf'].str.strip()
df_sharks['pdf'].value_counts()

1907.10.16.R-HongKong.pdf                       2
1916.12.08.a-b-German.pdf                       2
1929.03.04.a-b.Roads-Aldridge.pdf               2
1898.00.00.R-Syria.pdf                          2
1923.00.00.a-NJ fisherman.pdf                   2
1931.09.21.a-b-Holaday-Barrows.pdf              2
1921.11.27.a-b-Jack.pdf                         2
1906.09.27.R.a&b-Munich-Swede.pdf               2
1934.12.23.a-b-Inman.pdf                        2
1935.06.05.R-SolomonIslands.pdf                 2
1916.07.12.a-b-Stillwell-Fisher.pdf             2
1957.00.00.g-Bolster.pdf                        1
1957.00.00.h-boat-Portuondo.pdf                 1
1957.00.00.f-Kline.pdf                          1
1957.00.00.i-baby                               1
1957.00.00.j-NV-Nauth.pdf                       1
1957.00.00.k-Fernando.pdf                       1
2016.09.18.c-NSB.pdf                            1
1957.01.05-Williams.pdf                         1
1957.02.00-Straughan.pdf                        1


In [601]:
# Give the link column a snake_case label.
df_sharks = df_sharks.rename(columns={'href formula': 'href_formula'})
# Row 3019 is the only one whose formula link is NaN while href holds a usable
# value, so the value is copied over (the row is addressed by its index label).
df_sharks.loc[3019, 'href_formula'] = df_sharks.loc[3019, 'href']

In [598]:
# Rows where the two link columns disagree. The column is referenced by its
# renamed label 'href_formula': the preceding cell renames 'href formula', so
# the old label raises a KeyError when the notebook is run top to bottom.
diff_href = df_sharks[df_sharks['href'] != df_sharks['href_formula']]
diff_href[['href_formula','href']]

Unnamed: 0,href formula,href
20,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.29-Spain.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.29-Spain.pdf
27,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.23.a-Cutbirth.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.23-Cutbirth.pdf
61,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.05.21.a-Girl.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2016.05.21.a-Girl.pdf
107,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.12.21.a-Brazil.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.12.21.a-Brazil.pdf
114,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.11.15.a-Engelman.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.11.15.a-Engelman.pdf
134,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.10.05.a-Slaughter.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.10.05-Slaughter.pdf
180,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.06.27.a-Swanepoel.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.06.27-Swanepoel.pdf
193,http://sharkattackfile.net/spreadsheets/pdf_directory/Court Case pending,
232,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.01.24-Murray.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.01.24-Murray.pdf
262,http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.19-Tara.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.19-Tara.pdf


In [603]:
# href is almost identical to href_formula, so only href_formula is kept.
df_sharks = df_sharks.drop(columns=['href'])

In [615]:
# Spot-check the links of 100 randomly drawn rows (no seed is set, so the rows differ between runs).
df_sharks.sample(100)['href_formula'].value_counts()

http://sharkattackfile.net/spreadsheets/pdf_directory/2002.11.17-Glance.pdf                          1
http://sharkattackfile.net/spreadsheets/pdf_directory/2006.09.03.a-Duncan.pdf                        1
http://sharkattackfile.net/spreadsheets/pdf_directory/1906.09.27.R.a&b-Munich-Swede.pdf              1
http://sharkattackfile.net/spreadsheets/pdf_directory/1938.06.08-Edwards.pdf                         1
http://sharkattackfile.net/spreadsheets/pdf_directory/1966.07.31-Russell.pdf                         1
http://sharkattackfile.net/spreadsheets/pdf_directory/2001.08.27-Goettel.pdf                         1
http://sharkattackfile.net/spreadsheets/pdf_directory/1888.10.23.R-Croatia. Pdf                      1
http://sharkattackfile.net/spreadsheets/pdf_directory/1959.10.00-Etuate.pdf                          1
http://sharkattackfile.net/spreadsheets/pdf_directory/1959.04.09.a-Fisherman.pdf                     1
http://sharkattackfile.net/spreadsheets/pdf_directory/1968.00.00.b-NV-Key

In [616]:
# List the links that contain the capitalised spelling 'Pdf'.
df_sharks.loc[df_sharks['href_formula'].str.contains('Pdf'), 'href_formula'].value_counts()

http://sharkattackfile.net/spreadsheets/pdf_directory/1888.10.23.R-Croatia. Pdf    1
Name: href_formula, dtype: int64

In [602]:
# Spot-check 50 randomly drawn rows of the cleaned frame (no seed is set, so the rows differ between runs).
df_sharks.sample(50)

Unnamed: 0,case_number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href_formula,href,original order
4372,1946.05.09,09-May-46,1946,Unprovoked,TANZANIA,Dar-es-Salaam,Dar-es-Salaam harbor,swimming,W. Svendson,M,Unknown Age,Leg bitten,N,Unknown Time,unknown species,"Star, 5/20/1946",1946.05.09-Svendson.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1946.05.09-Svendson.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1946.05.09-Svendson.pdf,1621
3859,1959.07.03.a...b,03-Jul-59,1959,Sea Disaster,CARIBBEAN SEA,PANAMA,"Off Cristobal, 200 miles northeast of the entrance to the Panama Canal",columbian petrol barge rio atrato burned and sank,Teresea Britton (on raft) & a man (on floating debris),Unknown Sex,27,"X 2, 8 others missing. Survivors fought off sharks & sharks seen biting 2 of the dead. The 39 survivors were rescued by the German freighter Essen",Y,Unknown Time,unknown species,"Cambridge Daily News, 7/4/1959; V. M. Coppleson (1962), p. 259",1959.07.03-Rio-Atranto.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1959.07.03-Rio-Atranto.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1959.07.03-Rio-Atranto.pdf,2134
148,2015.09.04,04-Sep-15,2015,Unprovoked,AUSTRALIA,New South Wales,Hallidays Point,surfing,David Quinliven,M,62,Inuries to lower left leg & ankle,N,11h30,white shark,"The Sydney Morning Herald, 9/4/2015",2015.09.04-Quinliven.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.09.04-Quinliven.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.09.04-Quinliven.pdf,5844
90,2016.02.10.R,Reported 10-Feb-2016,2016,Unknown,CAYMAN ISLANDS,Grand Cayman,Stingray City Bar,feeding stingrays?,Richard Branson,M,65,Minor injury to wrist from Southern stingray,N,Unknown Time,unknown species,R. Branson,2016.02.10.R-Branson-stingray.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.02.10.R-Branson-stingray.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.02.10.R-Branson-stingray.pdf,5903
1838,2000.03.26,26-Mar-00,2000,Unprovoked,USA,Florida,"Juno Beach, Palm Beach County",boarding,Heather Van Olst,F,14,Right knee lacerated,N,11h15,unknown species,"Stuart News, 3/28/2000; Jupiter Couier, 3/29/2000",2000.03.26-VanOlst.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2000.03.26-VanOlst.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2000.03.26-VanOlst.pdf,4155
3877,1959.03.29,29-Mar-59,1959,Unprovoked,USA,Florida,"Vaca Cut Channel & bridge under Marathon, Monroe County",swimming,James McKee,M,13,"Bumped, then knee bitten",N,14h30,unknown species,"Florida Keys Keynoter, 4/2/1959; Dr. H. S. Denniger; Note: V.M. Coppleson (1962) lists date of May 1959, p249",1959.03.29-McKee.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1959.03.29-McKee.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1959.03.29-McKee.pdf,2116
592,2012.03.01,01-Mar-12,2012,Provoked,CHILE,Antofagasta Province,Antofagasta,fishing,Paye León Salomón,M,Unknown Age,Hand injured,N,Unknown Time,unknown species,"Soychile.cl, 3/1/2012",2012.03.01-Salomon.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2012.03.01-Salomon.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2012.03.01-Salomon.pdf,5401
362,2014.02.07,07-Feb-14,2014,Unprovoked,NEW ZEALAND,South Island,Porpoise Bay,surfing,Darren Mills,M,28,Lacerations to leg,N,20h30,gill shark,"New Zealand Herald, 2/7/2014",2014.02.07-PorpoiseBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2014.02.07-PorpoiseBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/w014.01.25-Grant.pdf,5631
1521,2003.09.13.b,13-Sep-03,2003,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",boarding,Aaron Edelson,M,18,Left calf avulsion,N,17h11,unknown species,"S. Petersohn, GSAF; News Journal Online; Orlando Sentinel, 9/15/2003, p.B2; R. Weiss, Daytona Beach News Journal, 9/15/2001, p.3C",2003.09.13.b-Edelson.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2003.09.13.b-Edelson.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2003.09.13.b-Edelson.pdf,4472
2604,1986.02.01,01-Feb-86,1986,Unprovoked,AUSTRALIA,Victoria,Jan Juc Beach,surfing,David Adams,M,29,No injury,N,06h00,whaler shark,"Sun, 2/3/1986, p.4",1986.02.01-Adams.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1986.02.01-Adams.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1986.02.01-Adams.pdf,3390


In [604]:
# Final check of the column list, non-null counts and dtypes.
df_sharks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5992 entries, 0 to 5991
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   case_number     5992 non-null   object
 1   Date            5992 non-null   object
 2   Year            5992 non-null   int64 
 3   Type            5992 non-null   object
 4   Country         5992 non-null   object
 5   Area            5992 non-null   object
 6   Location        5992 non-null   object
 7   Activity        5992 non-null   object
 8   Name            5992 non-null   object
 9   Sex             5992 non-null   object
 10  Age             5992 non-null   object
 11  Injury          5992 non-null   object
 12  Fatal           5992 non-null   object
 13  Time            5992 non-null   object
 14  Species         5992 non-null   object
 15  Source          5992 non-null   object
 16  pdf             5992 non-null   object
 17  href_formula    5992 non-null   object
 18  original

### Output

In [600]:
# Write the cleaned table to Output_File.csv in the current working directory.
# The path is built with os.path.join so it is valid on every platform: the
# previous hard-coded '\\' separator only forms a directory boundary on Windows.
# The directory is unchanged, because os.path.abspath('__file__') was applied to
# a literal string and therefore also resolved against the working directory.
df_sharks.to_csv(os.path.join(os.getcwd(), 'Output_File.csv'))