In [1]:
import pandas as pd
import numpy as np
import re
from difflib import SequenceMatcher
import os

In [2]:
# Notebook display settings, one option per call:
# show up to 150 rows, fall back to 100 rows when truncating, never clip cell text.
pd.set_option('display.max_rows', 150)
pd.set_option('display.min_rows', 100)
pd.set_option("max_colwidth", None)

In [3]:
# Load the raw incident log from the working directory. The file is read as
# latin-1 (explicit encoding) rather than pandas' default UTF-8.
df_sharks = pd.read_csv('GSAF5.csv', encoding='latin-1')

In [4]:
# (rows, columns) of the freshly loaded frame.
df_sharks.shape

(5992, 24)

In [5]:
# Rows that are an exact copy of some other row (all occurrences flagged).
# An empty result means there are no true duplicates.
duplicate_mask = df_sharks.duplicated(keep=False)
df_sharks.loc[duplicate_mask]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23


In [6]:
# Give the primary case identifier a snake_case column name.
df_sharks = df_sharks.rename(columns={'Case Number': 'case_number'})

In [7]:
# Discard the two trailing unnamed columns.
df_sharks = df_sharks.drop(columns=['Unnamed: 22', 'Unnamed: 23'])

In [8]:
# Replace every non-word character in the three case-number columns with '.',
# so differing separators do not show up as mismatches when they are compared.
case_columns = ['case_number', 'Case Number.1', 'Case Number.2']
df_sharks[case_columns] = df_sharks[case_columns].apply(
    lambda ids: ids.str.replace(r'[\W]', '.', regex=True)
)

In [9]:
# Rows whose primary case number disagrees with at least one of the two backup columns.
differs_from_first = df_sharks['case_number'].ne(df_sharks['Case Number.1'])
differs_from_second = df_sharks['case_number'].ne(df_sharks['Case Number.2'])
cases = df_sharks.loc[differs_from_first | differs_from_second]

In [10]:
# Not many differences, Case Number.1,2 most likely irrelevant
cases.loc[:, ['case_number', 'Case Number.1', 'Case Number.2']]

Unnamed: 0,case_number,Case Number.1,Case Number.2
4,2016.09.15,2016.09.16,2016.09.15
33,2016.07.14.4,2016.07.14.R,2016.07.14.4
97,2016.01.24.b,2015.01.24.b,2016.01.24.b
116,2015.12.23,2015.11.07,2015.12.23
121,2015.10.28.a,2015.10.28,2015.10.28.a
3654,1961.09.02.R,1961.09.06.R,1961.09.02.R
4177,1952.08.05,1952.08.04,1952.08.05
4719,1934.01.08.R,1934.02.08.R,1934.02.08.R
5043,1900.00.00.R,1919.00.00.R,1900.00.00.R
5150,1911.07.31.R,1911.07.31.T,1911.07.31.R


In [11]:
# The backup case-number columns add nothing beyond case_number; remove them.
df_sharks = df_sharks.drop(columns=['Case Number.1', 'Case Number.2'])

In [12]:
# Frequency of each incident Type before any normalisation.
df_sharks['Type'].value_counts()

Unprovoked      4386
Provoked         557
Invalid          519
Sea Disaster     220
Boat             200
Boating          110
Name: Type, dtype: int64

In [13]:
# Correct and unify the Type labels: 'Invalid' becomes 'Unknown' and the
# 'Boat' spelling is merged into 'Boating'.
# The result is assigned back to the frame rather than calling
# replace(..., inplace=True) on the df_sharks['Type'] accessor: with pandas
# Copy-on-Write that accessor is a temporary object, so an in-place call on it
# would leave df_sharks unchanged.
df_sharks['Type'] = df_sharks['Type'].replace({'Invalid': 'Unknown', 'Boat': 'Boating'})

In [14]:
# Re-check the Type frequencies after the labels were unified.
df_sharks['Type'].value_counts()

Unprovoked      4386
Provoked         557
Unknown          519
Boating          310
Sea Disaster     220
Name: Type, dtype: int64

In [15]:
# Fixed some minor mistakes: upper-case the country, drop one leading
# whitespace character, and remove question marks.
country = df_sharks['Country'].str.upper()
country = country.replace(r'^\s', '', regex=True)
country = country.replace(r'\?', '', regex=True)
df_sharks['Country'] = country

# Fixed minor mistake: manual correction of the row labelled 5991.
df_sharks.at[5991, 'Country'] = 'SRI LANKA'

In [16]:
# Fixed some minor mistakes: drop one leading whitespace character, then remove question marks.
area = df_sharks['Area'].str.replace(r'^\s', '', regex=True)
df_sharks['Area'] = area.replace(r'\?', '', regex=True)

In [17]:
# Missing country: International Waters or unknown.
df_sharks['Country'] = df_sharks['Country'].fillna('IW or Unknown')

In [18]:
# Fixed some minor mistakes: drop one leading whitespace character, then remove question marks.
location = df_sharks['Location'].str.replace(r'^\s', '', regex=True)
df_sharks['Location'] = location.replace(r'\?', '', regex=True)

# Inspect the rows that still have no location.
df_sharks.loc[df_sharks['Location'].isnull()]

Unnamed: 0,case_number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,original order
21,2016.07.28.R,28-Jul-16,2016,Unprovoked,CHINA,Hong Kong,,Swimming,Justus Franz,M,72,Lacerations to leg,N,,,"Klassick, 7/28/2016",2016.07.28.R-Franz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.28.R-Franz.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.28.R-Franz.pdf,5972
42,2016.06.24,24-Jun-16,2016,Unprovoked,COLUMBIA,Isla Provedencia,,Scuba Diving,Arturo Velez,M,59,Severe bite to right hand,N,11h00,"Caribbean reef shark, 4.5'",Dr. A. Velez,2016.06.24-Velez.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.06.24-Velez.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.06.24-Velez.pdf,5951
73,2016.04.08,08-Apr-16,2016,Unknown,CAPE VERDE,Boa Vista Island,,,a British citizen,M,60,"""Serious""",N,,Shark involvement not confirmed,L.O.Guttke,2016.04.08-CapeVerde.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.04.08-CapeVerde.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.04.08-CapeVerde.pdf,5920
80,2016.03.26,26-Mar-16,2016,Provoked,BAHAMAS,,,,Henry Kreckman,M,9,Minor injury to chest PROVOKED INCIDENT,N,,"Nurse shark, 2.5-ft","Wisconsin State Journal, 4/2/2016",2016.03.26-Kreckman.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.03.26-Kreckman.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.03.26-Kreckman.pdf,5913
83,2016.03.10,10-Mar-16,2016,Unprovoked,FIJI,Vanua Levu,,Diving for beche-de-mer,Maika Tabua,M,45,FATAL,Y,Afternoon,,"Fiji Sun, 3/12/2016",2016.03.10-Tabua.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.03.10-Tabua.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.03.10-Tabua.pdf,5910
94,2016.01.29,29-Jan-16,2016,Boating,SOUTH AFRICA,KwaZulu-Natal,,Kayak fishing,Dev De Lange,M,,"No injury, shark capsized kayak",N,,,"Nine News, 2/1/2016",2016.01.29-DeLange.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.01.29-DeLange.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.01.29-DeLange.pdf,5899
151,2015.09.00,Sep-15,2015,Unprovoked,FIJI,,,Spearfishing,Viliame Lautiki,M,,Leg bitten,N,,Tiger shark,"Fiji Times, 2/8/2016",2015.09.00-Lautiki.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.09.00-Lautiki.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.09.00-Lautiki.pdf,5841
171,2015.07.06,06-Jul-15,2015,Unknown,FRENCH POLYNESIA,Bora Bora,,Swimming,Joe Termini,M,,Parallel lacerations to torso inconsistent with shark bite,N,,No shark involvement,"Hollywood Life, 7/6/2015",2015.07.06-Termini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.07.06-Termini.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.07.06-Termini.pdf,5821
217,2015.03.29,29-Mar-15,2015,Unknown,ITALY,Sardinia,,Diving,Eugenio Masala,M,43,"FATAL, but shark involvement prior to death unconfirmed",Y,,Shark involvement not cofirmed,"A. de Maddalena, GSAF",2015.03.29-Masala.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.03.29-Masala.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.03.29-Masala.pdf,5775
225,2015.02.15,15-Feb-15,2015,Boating,ATLANTIC OCEAN,,,Transatlantic Rowing,"Avalon, a carbon kevlar monohull: 8 occupants",,,"No injury, shark bit rudder",N,,White shark,"Yorkshire Post, 3/16/2014",2015.02.15-Avalon.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.02.15-Avalon.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.02.15-Avalon.pdf,5767


In [19]:
# Replacing Nulls
df_sharks['Area'] = df_sharks['Area'].fillna('Unspecified Area')

In [20]:
# Replacing Nulls
df_sharks['Location'] = df_sharks['Location'].fillna('Unspecified Location')

In [21]:
# Minor cleaning: trim, lower-case, and give missing activities a placeholder.
activity = df_sharks['Activity'].str.strip().str.lower().fillna('unspecified activity')

# Keyword -> category rules, applied strictly in this order. Each rule sees the
# labels written by the rules before it, so a later keyword can re-label an
# earlier category (for example 'boat' also matches the 'boat capsized' and
# 'boating' labels assigned above it).
ACTIVITY_RULES = [
    ('fish', 'fishing'),
    ('divi', 'diving'),
    ('dive', 'diving'),
    ('swim', 'swimming'),
    ('surf', 'surfing'),
    ('unkn', 'unspecified activity'),
    ('boarding', 'boarding'),
    ('capsi', 'boat capsized'),
    ('saili', 'boating'),
    ('bath', 'bathing'),
    ('overboard', 'overboard'),
    ('shark', 'shark interaction'),  # A person deliberately interacted with a shark
    ('boat', 'boat accident'),
    ('wreck', 'boat accident'),
    ('net', 'fishing'),
    ('float', 'floating'),
]
for keyword, category in ACTIVITY_RULES:
    activity = activity.mask(activity.str.contains(keyword), category)

df_sharks['Activity'] = activity

# df_sharks['Activity'].value_counts()

fishing                                                                                                                                         1199
surfing                                                                                                                                         1120
swimming                                                                                                                                        1050
unspecified activity                                                                                                                             532
diving                                                                                                                                           524
bathing                                                                                                                                          179
boat accident                                                                                             

In [22]:
# Exploratory helper kept for reference only: it is wrapped in a string literal,
# so none of it executes (the cell merely echoes the string).
# The wrapped code takes every pair of distinct Activity labels, finds their
# longest common substring with difflib.SequenceMatcher, and counts how often
# each substring occurs across all pairs.
# NOTE(review): presumably this was used to pick the keywords for the Activity
# rules - confirm before deleting.
'''
# Used
substring_counts={}
names = list(df_sharks['Activity'].value_counts().index)
for i in range(0, len(names)):
    for j in range(i+1,len(names)):
        string1 = names[i]
        string2 = names[j]
        match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
        matching_substring=string1[match.a:match.a+match.size]
        if(matching_substring not in substring_counts):
            substring_counts[matching_substring]=1
        else:
            substring_counts[matching_substring]+=1

print(substring_counts) #{'myKey_': 5, 'myKey_apples': 1, 'o': 1, '': 3}
'''

"\n# Used\nsubstring_counts={}\nnames = list(df_sharks['Activity'].value_counts().index)\nfor i in range(0, len(names)):\n    for j in range(i+1,len(names)):\n        string1 = names[i]\n        string2 = names[j]\n        match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))\n        matching_substring=string1[match.a:match.a+match.size]\n        if(matching_substring not in substring_counts):\n            substring_counts[matching_substring]=1\n        else:\n            substring_counts[matching_substring]+=1\n\nprint(substring_counts) #{'myKey_': 5, 'myKey_apples': 1, 'o': 1, '': 3}\n"

In [23]:
#{k: v for k, v in sorted(substring_counts.items(), key=lambda item: item[1],reverse=True)}

In [24]:
# Replacing Nulls, then trimming surrounding whitespace from every name.
df_sharks['Name'] = df_sharks['Name'].fillna('Unknown').str.strip()

In [25]:
# Distinct Name values written entirely in lower case are treated as
# descriptions rather than real names.
unique_names = df_sharks['Name'].value_counts().index
indexes_lower = unique_names[unique_names.str.islower()]

is_lowercase_description = df_sharks['Name'].isin(indexes_lower)
df_sharks['Name'] = df_sharks['Name'].mask(is_lowercase_description, 'Unknown')

# Explicit anonymity markers are blanked the same way.
is_anonymous = df_sharks['Name'].str.contains('Anonymous|Unidentified|Arab boy')
df_sharks['Name'] = df_sharks['Name'].mask(is_anonymous, 'Unknown')

In [26]:
# Cleans format "Name Surname, description": keep only the part before the first ", ".
comma_name = df_sharks.loc[df_sharks['Name'].str.contains(r'^[A-Z].*?,\ '), 'Name'].str.extract(r'(^[A-Z].*?),\ ')[0]
# Merge the extracted names back by explicit assignment. Calling .update() on the
# df_sharks['Name'] accessor mutates a temporary object under pandas
# Copy-on-Write and would leave the frame untouched. Rows without an extracted
# value keep their current name.
df_sharks['Name'] = comma_name.reindex(df_sharks.index).fillna(df_sharks['Name'])

In [27]:
# Cleans format "occupants: Name, (maybe more stuff)": keep the text from the
# first capital letter that follows "ccupan...".
occupant = df_sharks['Name'].str.extract(r'ccupan.*?([A-Z].*)')[0]
# Explicit assignment instead of df_sharks['Name'].update(...), which is lost
# under pandas Copy-on-Write; unmatched rows keep their current name.
df_sharks['Name'] = occupant.fillna(df_sharks['Name'])

In [28]:
# Cleans format "Name (male/female)": keep the text before the parenthesised sex.
parenth = df_sharks['Name'].str.extract(r'([A-Z].*?)\ \((?:fe)?male')[0]
# Explicit assignment instead of df_sharks['Name'].update(...), which is lost
# under pandas Copy-on-Write; unmatched rows keep their current name.
df_sharks['Name'] = parenth.fillna(df_sharks['Name'])

In [29]:
# Names that describe a vessel (a length in feet such as 20', or the words
# boat / yacht) are collapsed to 'Boat'.
# Raw string: the previous non-raw literal relied on the invalid escape
# sequence '\d', which newer Python versions warn about. The compiled pattern
# is unchanged.
df_sharks.loc[df_sharks['Name'].str.contains(r"\d+'|boat|yacht"), 'Name'] = 'Boat'

In [30]:
# After the other stuff is clean, several key words mark a generic description
# rather than a real name, so those rows are made unknown.
# - Raw string: the previous literal relied on the invalid escape sequence '\d'.
# - The placeholder is 'Unknown', the same value the earlier Name-cleaning cells
#   write, so the column carries a single marker for missing names.
df_sharks.loc[
    df_sharks['Name'].str.contains(r'male|diver|occup|^A |boy|girl|^a |^an |sailor|^\d'),
    'Name',
] = 'Unknown'

In [31]:
# Sanity check: list any names that still contain the lower-case word 'boat'.
df_sharks.loc[df_sharks['Name'].str.contains(r'boat'), 'Name']

Series([], Name: Name, dtype: object)

In [32]:
# Cleaning column name: the raw header carries a trailing space.
df_sharks = df_sharks.rename(columns={'Sex ': 'Sex'})
df_sharks['Sex'].value_counts()

M      4835
F       585
M         2
lli       1
N         1
.         1
Name: Sex, dtype: int64

In [33]:
# Replacing Nulls
df_sharks.loc[df_sharks['Sex'].isnull(), 'Sex'] = 'Unknown Sex'
# Assuming N was intended to be M
df_sharks.loc[df_sharks['Sex'].str.contains('N'), 'Sex'] = 'M'
# Eliminating mistakes ('lli' and a lone '.').
# Raw string: the previous literal relied on the invalid escape sequence '\.';
# the compiled pattern is unchanged.
df_sharks.loc[df_sharks['Sex'].str.contains(r'lli|\.'), 'Sex'] = 'Unknown Sex'

In [34]:
# Trim stray whitespace so padded variants of a value collapse together, then re-count.
df_sharks = df_sharks.assign(Sex=df_sharks['Sex'].str.strip())
df_sharks['Sex'].value_counts()  # Clean

M              4838
F               585
Unknown Sex     569
Name: Sex, dtype: int64

In [35]:
# Treat Age as text so the string-based cleaning that follows can run on every row.
df_sharks = df_sharks.astype({'Age': str})

In [36]:
# Replacing Nulls. The column has already been cast with astype(str), which can
# turn a missing value into the literal text 'nan'. A plain isnull() test then
# never fires, so both the real null and the 'nan' text are matched.
age_is_missing = df_sharks['Age'].isnull() | df_sharks['Age'].eq('nan')
df_sharks.loc[age_is_missing, 'Age'] = 'Unknown Age'

In [37]:
# Normalise free-text ages. Order matters: decorations such as "20s", "mid-30s",
# "Ca. 33", ">50" and "Both 11" are peeled off BEFORE the catch-all "contains a
# letter" rule. Running that rule first discards every such value and leaves
# the stripping steps with nothing to do.
age = df_sharks['Age'].fillna('Unknown Age').astype(str).str.strip()

# Coarse textual ages: anything given in months counts as 1, "teen(s)" as 15.
age = age.mask(age.str.contains('months'), '1')
age = age.mask(age.str.contains('een'), '15')
# Several people listed in one cell ("23 & 20", "21, 34, 24") have no single age.
age = age.mask(age.str.contains(r',|&'), 'Unknown Age')

# Remove qualifiers as whole prefixes/suffixes. str.lstrip('mid-') would strip
# any of the characters m, i, d, - rather than the prefix "mid-".
age = (
    age.str.replace(r'^(?:mid-|Ca\. ?|>|Both )+', '', regex=True)
       .str.replace(r"'?s$", '', regex=True)   # "20s" / "20's" -> "20"
       .str.rstrip('½')
       .str.strip('?')
       .str.strip()
)

# "18 or 20" / "25 to 35" are left intact: the 'or' / 'to' extraction steps
# later in the notebook reduce them to their first number.
is_alternative = age.str.contains(r'^\d+ (?:or|to) ')
has_letters = age.str.contains(r'[a-zA-Z]')
# Whatever still contains letters, or became empty after stripping, is unusable.
age = age.mask((has_letters & ~is_alternative) | age.eq(''), 'Unknown Age')

df_sharks['Age'] = age

In [38]:
# Cleans format "age or age" and keeps the first age.
or_clean = df_sharks['Age'].str.extract(r'(\d+)\ or')[0]
# Explicit assignment instead of df_sharks['Age'].update(...), which is lost
# under pandas Copy-on-Write; rows without a match keep their current value.
df_sharks['Age'] = or_clean.fillna(df_sharks['Age'])

In [39]:
# Cleans format "age to age" and keeps the first age.
to_clean = df_sharks['Age'].str.extract(r'(\d+)\ to')[0]
# Explicit assignment instead of df_sharks['Age'].update(...), which is lost
# under pandas Copy-on-Write; rows without a match keep their current value.
df_sharks['Age'] = to_clean.fillna(df_sharks['Age'])

In [40]:
# Replacing Nulls
df_sharks['Injury'] = df_sharks['Injury'].fillna('No details')

In [41]:
# Remove outcome / provocation markers from the Injury text. Fatality is held
# in its own column, so only the injury description is kept here.
# The patterns are applied in this order, each one on the result of the previous:
#   1. "[Presumed ]FATAL[,] <text>" -> "<text>"
#   2. "<text>[,] FATAL..."         -> "<text>"
#   3. anything still containing FATAL -> "FATAL"
#   4. "<text>PROVOKED..."          -> "<text>"
injury = df_sharks['Injury']
for marker_pattern in (
    r'(?:Presumed\ )?(?:FATAL),?\ (.*)',
    r'(.*?),?\ FATAL',
    r'(FATAL)(\.\ .*)?',
    r'(.*?)PROVOKED',
):
    fatal = injury.str.extract(marker_pattern)[0]
    # Rows that do not match the pattern keep their current text.
    injury = fatal.fillna(injury)

# Explicit assignment instead of df_sharks['Injury'].update(...), which is lost
# under pandas Copy-on-Write.
df_sharks['Injury'] = injury

In [42]:
# Collapse uninformative injury texts, one rule after another.
injury = df_sharks['Injury']
# Empty text left behind by the marker removal.
injury = injury.mask(injury.str.contains('^$'), 'No details')
# Any capitalisation of "no injury" becomes the single label 'No injury'.
injury = injury.mask(
    injury.str.contains(r'No injury|No Injury|no injury|no Injury', regex=True),
    'No injury',
)
# A bare outcome word says nothing about the injury itself.
injury = injury.mask(injury.str.contains(r'FATAL|Survived', regex=True), 'No details')
df_sharks['Injury'] = injury.str.strip()

In [43]:
# Cleaning column name
df_sharks = df_sharks.rename(columns={'Fatal (Y/N)': 'Fatal'})
df_sharks['Fatal'].value_counts()

N          4315
Y          1552
UNKNOWN      94
 N            8
F             1
N             1
#VALUE!       1
n             1
Name: Fatal, dtype: int64

In [44]:
# Replacing Nulls, then trimming padded values.
fatal_flag = df_sharks['Fatal'].fillna('UNKNOWN').str.strip()
# A lower-case 'n' is a mistyped 'N'.
fatal_flag = fatal_flag.mask(fatal_flag.str.contains(r'n', regex=True), 'N')
# 'F' and the spreadsheet error '#VALUE!' carry no usable information.
fatal_flag = fatal_flag.mask(fatal_flag.str.contains(r'F|#VALUE!', regex=True), 'UNKNOWN')
df_sharks['Fatal'] = fatal_flag

In [45]:
# Replacing Nulls
time_of_day = df_sharks['Time'].fillna('Unknown Time')

# Ordered corrections: three specific typos first, then any value that contains
# a lower-case letter other than 'h' is discarded as unusable.
TIME_RULES = (
    ('8:04', '20h04'),
    ('15j45', '15h45'),
    ('06j00', '06h00'),
    ('[abcdefgijklmnopqrstuvwxyz]', 'Unknown Time'),
)
for marker, corrected in TIME_RULES:
    time_of_day = time_of_day.mask(time_of_day.str.contains(marker), corrected)

df_sharks['Time'] = time_of_day

In [46]:
# Extracts the first "HHhMM" token from longer strings and keeps only that.
hours = df_sharks['Time'].str.extract(r'(\d{2}h\d{2})')[0]
# Explicit assignment instead of df_sharks['Time'].update(...), which is lost
# under pandas Copy-on-Write; rows without a match keep their current value.
df_sharks['Time'] = hours.fillna(df_sharks['Time'])

In [47]:
# Cleaning column name: the raw header carries a trailing space.
df_sharks = df_sharks.rename(columns={'Species ': 'Species'})
# df_sharks['Species'].value_counts()

White shark                                                                                                                      161
Shark involvement not confirmed                                                                                                   80
Tiger shark                                                                                                                       68
Bull shark                                                                                                                        52
6' shark                                                                                                                          40
4' shark                                                                                                                          39
1.8 m [6'] shark                                                                                                                  35
1.5 m [5'] shark                                                     

In [48]:
# Replacing Nulls, then normalising case and surrounding whitespace.
df_sharks.loc[df_sharks['Species'].isnull(), 'Species'] = 'Unknown Species'
df_sharks['Species'] = df_sharks['Species'].str.strip().str.lower()

# Looks for "____ shark" and keeps only that two-word phrase.
species = df_sharks['Species'].str.extract(r'\ ?(\w+\"?\ shark)')[0]
# Explicit assignment instead of df_sharks['Species'].update(...), which is lost
# under pandas Copy-on-Write; rows without a match keep their current text.
df_sharks['Species'] = species.fillna(df_sharks['Species'])

# Keyword rules, applied in order (later rules see the labels set by earlier ones).
df_sharks.loc[df_sharks['Species'].str.contains(r'cat|canic', regex=True), 'Species'] = 'catshark'
df_sharks.loc[df_sharks['Species'].str.contains(r'gange', regex=True), 'Species'] = 'gangeticus'
df_sharks.loc[df_sharks['Species'].str.contains(r'iden|vici|confirm|^$', regex=True), 'Species'] = 'unknown species'
df_sharks.loc[df_sharks['Species'].str.contains(r'black', regex=True), 'Species'] = 'blacktip shark'
# Sizes, weights, counts and other descriptions that name no species.
df_sharks.loc[df_sharks['Species'].str.contains(r'\d|m\ shark|lb\ shark|no\ shark|kg\ shark|of\ shark|colored\ shark|a\ shark|juvenile|two|from|small|involv|as\ shark|several|larg|hull|little|later|skin|bite|girth|for\ shark|sting|unknown', regex=True), 'Species'] = 'unknown species'
# Drop stray double quotes left over from the original text.
df_sharks['Species'] = df_sharks.loc[:,'Species'].str.replace(r'\"', '',regex=True)

In [49]:
# Cleaning column name
df_sharks = df_sharks.rename(columns={'Investigator or Source': 'Source'})
#df_sharks['Source'].value_counts()

C. Moore, GSAF                                                                                                              94
S. Petersohn, GSAF                                                                                                          82
C. Creswell, GSAF                                                                                                           81
R. Collier                                                                                                                  54
T. Peake, GSAF                                                                                                              48
M. Levine, GSAF                                                                                                             45
R. Collier, GSAF                                                                                                            35
A. Gifford, GSAF                                                                                               

In [50]:
# Trim surrounding whitespace from the pdf file names.
df_sharks = df_sharks.assign(pdf=df_sharks['pdf'].str.strip())
#df_sharks['pdf'].value_counts()

1907.10.16.R-HongKong.pdf                       2
1916.12.08.a-b-German.pdf                       2
1929.03.04.a-b.Roads-Aldridge.pdf               2
1898.00.00.R-Syria.pdf                          2
1923.00.00.a-NJ fisherman.pdf                   2
1931.09.21.a-b-Holaday-Barrows.pdf              2
1921.11.27.a-b-Jack.pdf                         2
1906.09.27.R.a&b-Munich-Swede.pdf               2
1934.12.23.a-b-Inman.pdf                        2
1935.06.05.R-SolomonIslands.pdf                 2
1916.07.12.a-b-Stillwell-Fisher.pdf             2
1957.00.00.g-Bolster.pdf                        1
1957.00.00.h-boat-Portuondo.pdf                 1
1957.00.00.f-Kline.pdf                          1
1957.00.00.i-baby                               1
1957.00.00.j-NV-Nauth.pdf                       1
1957.00.00.k-Fernando.pdf                       1
2016.09.18.c-NSB.pdf                            1
1957.01.05-Williams.pdf                         1
1957.02.00-Straughan.pdf                        1


In [51]:
# Snake-case the column name.
df_sharks = df_sharks.rename(columns={'href formula': 'href_formula'})
# Only NaN in formula that is fine on href: copy the link across for row 3019.
df_sharks.at[3019, 'href_formula'] = df_sharks.at[3019, 'href']

In [53]:
# Rows where the two link columns disagree, shown side by side.
links_differ = df_sharks['href'].ne(df_sharks['href_formula'])
diff_href = df_sharks.loc[links_differ]
diff_href.loc[:, ['href_formula', 'href']]

Unnamed: 0,href_formula,href
20,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.29-Spain.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.29-Spain.pdf
27,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.23.a-Cutbirth.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.07.23-Cutbirth.pdf
61,http://sharkattackfile.net/spreadsheets/pdf_directory/2016.05.21.a-Girl.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2016.05.21.a-Girl.pdf
107,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.12.21.a-Brazil.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.12.21.a-Brazil.pdf
114,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.11.15.a-Engelman.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.11.15.a-Engelman.pdf
134,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.10.05.a-Slaughter.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.10.05-Slaughter.pdf
180,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.06.27.a-Swanepoel.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.06.27-Swanepoel.pdf
193,http://sharkattackfile.net/spreadsheets/pdf_directory/Court Case pending,
232,http://sharkattackfile.net/spreadsheets/pdf_directory/2015.01.24-Murray.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2015.01.24-Murray.pdf
262,http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.19-Tara.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/http://sharkattackfile.net/spreadsheets/pdf_directory/2014.10.19-Tara.pdf


In [55]:
# Almost identical to href_formula
df_sharks = df_sharks.drop(columns=['href'])

In [76]:
# Qualifier words and phrases that precede approximate dates.
# The longer phrase 'No date, Before' is listed ahead of 'No date' and 'Before'
# so the alternation tries it first.
stop = [
    'No date, Before',
    'Reported',
    'Circa',
    'Before',
    'Early',
    'Letter dated',
    'No date',
    'before',
    'Before',
    'Summer',
    'Mid',
]
# Whole-word alternation over every qualifier, in list order.
pat = r'\b(?:' + '|'.join(stop) + r')\b'

In [77]:
# Remove the qualifier words matched by `pat` from the Date text, then trim:
# deleting a leading qualifier ("Reported 08-Jan-16") otherwise leaves the date
# with stray surrounding whitespace.
# Bracket access is used because attribute-style assignment only works for a
# column that already exists and whose name is a valid identifier.
df_sharks['Date'] = df_sharks['Date'].str.replace(pat, '', regex=True).str.strip()

In [80]:
# Spot-check 50 cleaned rows. The fixed random_state makes the same rows appear
# on every run, so the displayed sample is reproducible after a kernel restart.
df_sharks.sample(50, random_state=42)

Unnamed: 0,case_number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Time,Species,Source,pdf,href_formula,original order
1225,2006.07.31.R,31-Jul-06,2006,Provoked,USA,Kentucky,"Newport Aquarium, Newport",shark interaction,Unknown,Unknown Sex,Unknown Age,"Minor injuries, similar to paper cuts from the captive sharks",N,Unknown Time,catshark,"Cincinatti News, 7/31/2006",2006.07.31.R-NewportAquarium.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2006.07.31.R-NewportAquarium.pdf,4768
1391,2005.01.19,19-Jan-05,2005,Provoked,AUSTRALIA,Victoria,Port Phillip Bay,fishing,Julian McLaughlin,M,Unknown Age,No injury,N,Unknown Time,unknown species,Herald Sun News,2005.01.19-McLaughlin.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2005.01.19-McLaughlin.pdf,4602
842,2009.12.16,16-Dec-09,2009,Unprovoked,NEW ZEALAND,South Island,Clark Island,swimming,Maurice Bede Philips,M,24,No details,Y,Unknown Time,white shark,"R.D. Weeks, GSAF",2009.12.16-Philips.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2009.12.16-Philips.pdf,5151
1193,2006.11.25,25-Nov-06,2006,Sea Disaster,PHILIPPINES,Surigao del Norte,"Off Bilisan Point, Hinatuarn Island",sea disaster,Sinking of the m.v.Leonida,Unknown Sex,Unknown Age,15 perished but shark involvement prior to death was not confirmed,Y,14h20,unknown species,"Manila Bulletin Online, 11/27/2006",2006.11.25-Leonida.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2006.11.25-Leonida.pdf,4800
1541,2003.06.24.c,24-Jun-03,2003,Unprovoked,BRAZIL,Pernambuco,"Piedade, Recife",unspecified activity,Moses Nunes de Albuquerque Junior,M,Unknown Age,No details,Y,Unknown Time,unknown species,"JCOnline, 6/23/2012",2003.06.24.c-MosesAlbuquerque.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2003.06.24.c-MosesAlbuquerque.pdf,4452
603,2012.01.13,13-Jan-12,2012,Unprovoked,USA,Oregon,"Lincoln City, Lincoln County",surfing,Steve Harnack,M,53,No injury,N,Unknown Time,white shark,R. Collier,2012.01.13-Harnack.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2012.01.13-Harnack.pdf,5390
1340,2005.07.27,27-Jul-05,2005,Unprovoked,USA,Florida,"Off Zelda Boulevard, Daytona Beach, Volusia Countyy",wading,Nicole Carlos,F,13,Laceration on the back of left hand & toothmarks on wrist,N,18h00,unknown species,"D. Salamone, GSAF",2005.07.27-Carlos.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2005.07.27-Carlos.pdf,4653
2323,1992.06.28.a,28-Jun-92,1992,Unprovoked,REUNION,Saint-Paul,Cap de la Marianne,surfing,Theirry Mercredi,M,16,No details,Y,14h30,bull shark,G. Van Grevelynghe,1992.06.28.a-Mercredi.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1992.06.28.a-Mercredi.pdf,3671
1926,1998.11.14,14-Nov-98,1998,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",surfing,Larry Foor,M,14,Right foot bitten,N,08h45,unknown species,"S. Petersohn, GSAF; D. Catron, Orlando Sentinel, 11/19/1998, p.D1; Daytona Beach News Journal, 11/15/1998, p.4C",1998.11.14-LarryFoor.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1998.11.14-LarryFoor.pdf,4067
4747,1932.10.31,31-Oct-32,1932,Unprovoked,AUSTRALIA,New South Wales,"Redhead Beach, Newcastle",swimming,Reginald Ogilvie,M,24,"Torso bitten with pneumothorax, slight lacerations on left hand",N,11h00,nurse shark,"V.M. Coppleson.N19. (1933); V.M. Coppleson (1958), pp.80 & 232",1932.10.31-Ogilvie.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1932.10.31-Ogilvie.pdf,1246


In [None]:
# Summary of the cleaned frame (columns, non-null counts, dtypes) before export.
df_sharks.info()

### Output

In [81]:
# Write the cleaned frame to Output_File.csv in the current working directory.
# The old expression os.path.abspath('__file__') used a string literal, not the
# __file__ variable, so it resolved to the working directory as well; the target
# folder is therefore unchanged.
# os.path.join supplies the platform's path separator. The hard-coded '\\' only
# worked on Windows and produced a file literally named
# "<dir>\Output_File.csv" on other systems.
output_path = os.path.join(os.getcwd(), 'Output_File.csv')
df_sharks.to_csv(output_path)