In [3]:
# First, let's import all the libraries we are going to use.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re


In [80]:
# Read the shark-attack CSV (ISO-8859-1 encoded) and discard the rows in which
# every single field is empty.
df = pd.read_csv(
    '/mnt/c/Users/marcn/Documents/Ironhack/sharky/Data/attacks.csv',
    encoding="ISO-8859-1",
).dropna(axis=0, how='all')

# Normalise the column labels so they are easier to type: trim surrounding
# whitespace, turn inner spaces into underscores and lower-case everything.
new_dict = {
    column_label: column_label.strip().replace(" ", "_").lower()
    for column_label in df.columns
}
df = df.rename(columns=new_dict)

In [81]:
# With the DataFrame defined, explore it a little before cleaning anything.
# Print the column labels to see which ones may carry information about the
# year in which each attack happened.
a = [*df.columns]
print(a)

['case_number', 'date', 'year', 'type', 'country', 'area', 'location', 'activity', 'name', 'sex', 'age', 'injury', 'fatal_(y/n)', 'time', 'species', 'investigator_or_source', 'pdf', 'href_formula', 'href', 'case_number.1', 'case_number.2', 'original_order', 'unnamed:_22', 'unnamed:_23']


In [82]:
# Count the missing values in each column. The last two ("unnamed") columns
# have by far the highest null counts, so they are clear candidates to drop.
df.isnull().sum(axis=0)

case_number                  1
date                      2401
year                      2403
type                      2405
country                   2451
area                      2856
location                  2941
activity                  2945
name                      2611
sex                       2966
age                       5232
injury                    2429
fatal_(y/n)               2940
time                      5755
species                   5239
investigator_or_source    2418
pdf                       2401
href_formula              2402
href                      2401
case_number.1             2401
case_number.2             2401
original_order            2394
unnamed:_22               8702
unnamed:_23               8701
dtype: int64

In [83]:
# Drop the two trailing "unnamed" columns, which have the highest null counts
# of all columns and carry no useful data.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it once the columns are already gone is a
# no-op rather than a KeyError.
df = df.drop(columns=['unnamed:_22', 'unnamed:_23'], errors='ignore')

In [84]:
# Some important observations made while browsing samples of the data:
#   * Many cases are not dated; they are only labelled as "before" some year.
#     That data cannot be used because it is not precise.
#   * Some cases are labelled with a range between two years. The range was
#     never wider than 20 years in the observed samples, so this data is kept:
#     even though it is not very precise, it is bounded.
#
# Display three rows in which case_number, date and year are all present.
# random_state pins the draw so a fresh "Restart & Run All" shows the same
# rows; change or remove it to browse a different sample.
df[df[['case_number', 'date', 'year']].notnull().all(axis=1)].sample(3, random_state=0)


Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
410,2015.06.19,19-Jun-2015,2015.0,Unprovoked,PUERTO RICO,,Off Cabo Rojo,Spearfishing,Benjamin Rios,M,...,N,Morning,,"Yahoo News, 6/19/2015",2015.06.19-Rios.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2015.06.19,2015.06.19,5893.0
481,2014.10.30,29-Oct-2014,2014.0,Provoked,AUSTRALIA,New South Wales,Wallabi Point,Surfing,Ryan Hunt,M,...,N,18h00,,"The Sydney Morning Herald, 10/30/2014",2014.10.29-Hunt.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2014.10.30,2014.10.30,5822.0
4008,1960.03.28,28-Mar-1960,1960.0,Unprovoked,GUAM,,,"Spearfishing, carrying fish on belt",Enrique Matao,M,...,N,12h00,1.2 m [4'] shark,"V.M. Coppleson (1962), p.254; H.D. Baldridge, ...",1960.03.28-Metao.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1960.03.28,1960.03.28,2295.0


In [87]:
# Inspect the last five rows whose year is missing.
df[df['year'].isnull()].tail(5)


Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,...,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number.1,case_number.2,original_order
8698,0,,,,,,,,,,...,,,,,,,,,,
8699,0,,,,,,,,,,...,,,,,,,,,,
8700,0,,,,,,,,,,...,,,,,,,,,,
8701,0,,,,,,,,,,...,,,,,,,,,,
25722,xx,,,,,,,,,,...,,,,,,,,,,


### Cleaning by date
The feature I want to focus on is the year of the attack, so I am going to try to assign a year to each attack where possible. First, I observed that 125 of the attacks are assigned the year '0.0', but in most of those cases the date column holds more information about the date. Attacks whose date contains the string 'Before' are not going to count for the study, and the ones whose date is a range are going to be assigned the year in the middle of the range. Also, I am only going to take into account the attacks that happened A.D., and drop everything B.C.

The order of cleaning will go as follows:
1. First, I am going to locate all the rows that have a NaN or a 0.0 in the year column, and work with those.
2. After that, I am going to drop all the rows whose date has 'B.C' in it.
3. Next, the dates of the form '\d{4}-\d{4}' are going to be replaced with the year in the middle of the range.
4. Finally, for the remaining rows with a NaN or a 0.0 in their year column, I am going to try to find a substring matching '\d{4}|\d{3}' in them, and replace the year value with the matched value.