# 100 Years of Global Shark Attack 

In [472]:
# Librerías necesarias
import pandas as pd
import numpy as np
import requests
from geopy.geocoders import Nominatim
from datetime import datetime
from meteostat import Point, Daily

Utilizo la Api de public.opendatasoft Shark Attack, en la que hay un registro de todos los ataques mundiales hechos por tiburones.

Investigando en la documentación he descubierto que puedo sacar de una soloa vez hasta 100 registros, y que con el parametro offset, puedo poner el indice desde donde quiero empezar esos 100 registros.

Como veo tambien en la documentación hay 6890 registros, decido sacar todos los registros haciendo un bucle for con un range de 0 hasta 6891 y contando de 100 en 100, así obtengo todos los registros y los meto en el dataframe "df_shark"

In [219]:
lista_sharks = []
for i in range(0,6891,100):
    respuesta = requests.get(f"https://public.opendatasoft.com/api/explore/v2.1/catalog/datasets/global-shark-attack/records?limit=100&offset={i}")
    lista_sharks += respuesta.json()['results']

In [480]:
df_sharks = pd.DataFrame(lista_sharks)
df_sharks.shape

(6890, 21)

In [481]:
df_sharks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6890 entries, 0 to 6889
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   date                    6587 non-null   object
 1   year                    6758 non-null   object
 2   type                    6871 non-null   object
 3   country                 6839 non-null   object
 4   area                    6409 non-null   object
 5   location                6325 non-null   object
 6   activity                6304 non-null   object
 7   name                    6670 non-null   object
 8   sex                     6318 non-null   object
 9   age                     3908 non-null   object
 10  injury                  6853 non-null   object
 11  fatal_y_n               6890 non-null   object
 12  time                    3372 non-null   object
 13  species                 3772 non-null   object
 14  investigator_or_source  6869 non-null   object
 15  pdf 

Por ahora veo que las últimas 7 columnas no son necesarias para mi investigación a si que las eliminamos

In [482]:
df_sharks = df_sharks.iloc[:, :14]


## Columna a columna

Ya que quiro quedarme solo con los ataques hechos desde el 1925 hasta la actualidad o hasta el último ataque reportado, compruebo que la columna "year" no es una columna numerica, pero no puedo ya que tiene 132 registros None. 

Le echo un vistazo a esos registros y veo que no tienen nigún tipo de dato ni de fecha, "year" o "date", y como representa una porción tan pequeña en comparación con el dataframe al completo decido desacerme de ellas.

In [483]:
df_sharks[df_sharks['year'].isnull()]

Unnamed: 0,date,year,type,country,area,location,activity,name,sex,age,injury,fatal_y_n,time,species
248,,,Unprovoked,USA,Florida,"Mosquito Inlet (Ponce Inlet), Volusia County",Canoeing,male,M,,FATAL,Y,,
249,,,Sea Disaster,PAPUA NEW GUINEA,Madang Province,Off Lae,"Aircraft ditched in the sea, swimming ashore",male,M,,Shark bumped him,N,,
250,,,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Fishing,a native fisherman,M,,"FATAL, body not recovered but shark was caught...",Y,,
502,,,Unprovoked,AUSTRALIA,Torres Strait,,Diving for trochus,male,M,,Calf removed,N,,0.9 m [3'] shark
503,,,Unprovoked,AUSTRALIA,,,Fishing,boy,M,,"FATAL, knocked overboard by tail of shark & ca...",Y,,Blue pointer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6885,,,Sea Disaster,GREECE,Off Thessaly,,Shipwrecked Persian Fleet,males,M,,Herodotus tells of sharks attacking men in the...,Y,,
6886,,,Unprovoked,INDONESIA,Riau Province,"Natuna Islands, between Sumatra & Kalimantan i...",Swimming near anchored ship,a ship's engineer,M,,"FATAL, leg severed",Y,,
6887,,,Unprovoked,IRAQ,Basrah,Shatt-el Arab River near a small boat stand,Swimming,male,M,13 or 14,"FATAL, left leg bitten with severe blood loss",Y,Afternoon,Bull shark
6888,,,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Crew swimming alongside their anchored ship,male,M,,FATAL,Y,,


In [484]:
df_sharks.dropna(subset='year', inplace=True)

In [485]:
df_sharks['year'] = df_sharks['year'].astype(int) 
df_sharks.shape

(6758, 14)

Ahora con la columna "year" cambiada a numerica -int- nos quedamos solo con los ataques que fueron después del año 1924

In [486]:
df_sharks = df_sharks[df_sharks['year'] > 1924]
df_sharks.shape

(5750, 14)

Vamos a tratar la columna "date", para continuar teniendo el maximo de datos posibles vamos a rellenar los 109 datos faltantes con una fecha estandar compuesta por el año de la columna "year" y "-01-01".

Despues la pasaré a **datetime** por si acaso necesitaramos más adelante este tipo de dato.

In [487]:
df_sharks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5750 entries, 0 to 6850
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date       5644 non-null   object
 1   year       5750 non-null   int64 
 2   type       5735 non-null   object
 3   country    5726 non-null   object
 4   area       5432 non-null   object
 5   location   5379 non-null   object
 6   activity   5318 non-null   object
 7   name       5593 non-null   object
 8   sex        5259 non-null   object
 9   age        3724 non-null   object
 10  injury     5726 non-null   object
 11  fatal_y_n  5750 non-null   object
 12  time       3189 non-null   object
 13  species    3463 non-null   object
dtypes: int64(1), object(13)
memory usage: 673.8+ KB


In [488]:
mask = df_sharks['date'].isna()
df_sharks.loc[mask, 'date'] = df_sharks.loc[mask, 'year'].astype(str) + '-01-01'

In [489]:
df_sharks.loc[3184,'date'] = f'{df_sharks.loc[3184, 'year']}-01-01'
df_sharks.loc[3360,'date'] = f'{df_sharks.loc[3360, 'year']}-01-01'
df_sharks.loc[4439,'date'] = f'{df_sharks.loc[4439, 'year']}-01-01'

In [490]:
df_sharks['date'] = pd.to_datetime(df_sharks['date'], errors='coerce')
df_sharks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5750 entries, 0 to 6850
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       5750 non-null   datetime64[ns]
 1   year       5750 non-null   int64         
 2   type       5735 non-null   object        
 3   country    5726 non-null   object        
 4   area       5432 non-null   object        
 5   location   5379 non-null   object        
 6   activity   5318 non-null   object        
 7   name       5593 non-null   object        
 8   sex        5259 non-null   object        
 9   age        3724 non-null   object        
 10  injury     5726 non-null   object        
 11  fatal_y_n  5750 non-null   object        
 12  time       3189 non-null   object        
 13  species    3463 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(12)
memory usage: 802.9+ KB


En la columna 'type' me encuentro con 15 registros nulos, 1 **Unverified**, 1 "?", 1 **Unconfirmed** y 1 **Under investigation**.
Creo que lo mejor va a ser imputarle a todos la etiqueta de **Unverified**.

In [492]:
df_sharks.fillna({'type':'Unverified'}, inplace=True)
df_sharks.loc[df_sharks['type'].isin(['?', 'Unconfirmed', 'Under investigation']),'type'] = 'Unverified'

In [493]:
df_sharks['type'].value_counts()

type
Unprovoked      4228
Provoked         566
Invalid          432
Watercraft       311
Sea Disaster     170
Unverified        19
Questionable      17
Boat               7
Name: count, dtype: int64

In [494]:
df_sharks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5750 entries, 0 to 6850
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       5750 non-null   datetime64[ns]
 1   year       5750 non-null   int64         
 2   type       5750 non-null   object        
 3   country    5726 non-null   object        
 4   area       5432 non-null   object        
 5   location   5379 non-null   object        
 6   activity   5318 non-null   object        
 7   name       5593 non-null   object        
 8   sex        5259 non-null   object        
 9   age        3724 non-null   object        
 10  injury     5726 non-null   object        
 11  fatal_y_n  5750 non-null   object        
 12  time       3189 non-null   object        
 13  species    3463 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(12)
memory usage: 802.9+ KB


Columna country, parece que tiene 24 nulos, investigaré un poco esos registros y vere si puedo imputarle algún Pais o tengo que eliminar esos registros.

Al investigar un poco vemos que la gran mayoría de los registros faltantes son debido a que el ataque ha sido en aguas internacionales o en el mar entre dos paises.

Imputaré uno de los dos paises a los que tengan dos, basandonos en los registros mayoritarios que si lo tengan.
Imputaré un país cercano al agua internacional, basandonos en los registros mayoritarios que si lo tengan.
Eliminaré los registros que no arrojen ninguna pista sobre el pais del ataque.

In [495]:
for i, j in df_sharks['country'].value_counts().items():
    print(f'{j} -> {i}')

2265 -> USA
1182 -> AUSTRALIA
545 -> SOUTH AFRICA
131 -> PAPUA NEW GUINEA
122 -> BAHAMAS
118 -> BRAZIL
100 -> NEW ZEALAND
79 -> MEXICO
60 -> NEW CALEDONIA
56 -> REUNION
54 -> ITALY
53 -> FIJI
44 -> PHILIPPINES
40 -> MOZAMBIQUE
37 -> EGYPT
33 -> FRENCH POLYNESIA
32 -> JAPAN
28 -> SPAIN
25 -> IRAN
25 -> CUBA
24 -> HONG KONG
24 -> SOLOMON ISLANDS
22 -> PANAMA
17 -> CROATIA
16 -> TONGA
16 -> COSTA RICA
16 -> JAMAICA
15 -> PACIFIC OCEAN
14 -> GREECE
14 -> BERMUDA
13 -> INDONESIA
12 -> ENGLAND
12 -> THAILAND
11 -> UNITED KINGDOM
11 -> VIETNAM
11 -> VANUATU
11 -> ATLANTIC OCEAN
11 -> ECUADOR
11 -> INDIA
11 -> TURKEY
11 -> SENEGAL
11 -> MARSHALL ISLANDS
10 -> COLUMBIA
9 -> VENEZUELA
9 -> TAIWAN
9 -> NEW GUINEA
8 -> TANZANIA
8 -> SOUTH ATLANTIC OCEAN
8 -> ISRAEL
8 -> KENYA
8 -> CHILE
8 -> SOUTH KOREA
8 -> SRI LANKA
8 -> MAURITIUS
7 -> CANADA
7 -> SCOTLAND
7 -> CARIBBEAN SEA
7 -> DOMINICAN REPUBLIC
7 -> SEYCHELLES
7 -> IRAQ
6 -> CHINA
6 -> SAMOA
6 -> SOMALIA
6 -> NORTH PACIFIC OCEAN
6 -> NEW BRI

In [542]:
df_sharks.loc[5836, 'country'] = 'USA'

In [540]:
df_sharks.drop(5099, axis = 0, inplace=True)

In [543]:
df_sharks[df_sharks['country'].isnull()]

Unnamed: 0,date,year,type,country,area,location,activity,name,sex,age,injury,fatal_y_n,time,species


Una vez resuelto el tema de los datos vacíos ahora voy a pasar todos los strings a mayusculas ya que la gran mayoría están en mayusculas excepto algunos campos, y eso hace que al agrupar por pais algunos paises se queden fuera de sus grupos.

In [544]:
df_sharks['country'] = df_sharks['country'].apply(lambda x: x.upper())

In [545]:
df_sharks.sample(6)

Unnamed: 0,date,year,type,country,area,location,activity,name,sex,age,injury,fatal_y_n,time,species
360,1998-11-05,1998,Unprovoked,USA,Oregon,Winchester Bay,Surfing,Dale Inskeep,M,32.0,No injury,N,,"White shark, 5 m to 6 m [16.5' to 20']"
1027,2012-09-16,2012,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Surfing,male,M,52.0,Foot bitten,N,15h00,
2493,2014-12-03,2014,Provoked,SPAIN,Granada,Off Motril,Fishing for blue sharks,male,M,,Glancing bite to wrist from netted shark PROVO...,N,07h00,
6740,1996-10-03,1996,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Surfing,Aric Hollingsworth,M,21.0,4 laceration on left forearm,N,08h30,1.2 m [4'] shark
4793,1983-05-24,1983,Unprovoked,USA,Florida,"Riviera Beach, Palm Beach County",Surfing,Dave Coulter,M,15.0,"Knocked off board by shark, no injury",N,,
4489,2007-09-20,2007,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Tyler Robertson,M,23.0,Small lacerations to bottom of right big toe,N,18h20,3' shark


En la columna de 'area' hay 308 registros None, a los que imputaremos el valor de la columna 'location' si existe y si no el valor de la columna 'country'

In [546]:
df_sharks.loc[df_sharks['area'].isna(), 'area'] = df_sharks.loc[df_sharks['area'].isna(), 'location']

In [547]:
df_sharks.loc[df_sharks['area'].isna(), 'area'] = df_sharks.loc[df_sharks['area'].isna(), 'country']

In [548]:
df_sharks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5738 entries, 0 to 6850
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       5738 non-null   datetime64[ns]
 1   year       5738 non-null   int64         
 2   type       5738 non-null   object        
 3   country    5738 non-null   object        
 4   area       5738 non-null   object        
 5   location   5379 non-null   object        
 6   activity   5310 non-null   object        
 7   name       5581 non-null   object        
 8   sex        5247 non-null   object        
 9   age        3719 non-null   object        
 10  injury     5714 non-null   object        
 11  fatal_y_n  5738 non-null   object        
 12  time       3187 non-null   object        
 13  species    3458 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(12)
memory usage: 801.5+ KB


Aunuque no creo que vaya a aprovechar mucho las columnas 'area', 'location' y 'activity' pero las vamos a tratar y rellenar sus datos faltantes.

En el caso de 'location' voy a imputarle a los valores faltantes el valor de 'area'.

In [549]:
df_sharks.loc[df_sharks['location'].isna(), 'location'] = df_sharks.loc[df_sharks['location'].isna(), 'area']

En la columna activity, habia 1238 tipos diferentes de actividades, para reducir esa clasificación las voy a juntar con una función creada por mi, pasando a tener 37, uniendo en 'No details' las que no contengan ninguna información y en 'Random Activity' las que no tienen menos de 5 representaciones en el dataset y no se pueden agrupar en ningún otro tipo. Todo esto guardado en una nueva columna llamada 'activity2'

In [550]:
len(df_sharks['activity'].value_counts())

1236

In [556]:
len(df_sharks['activity2'].value_counts())

37

In [552]:
def categorizar_activity(x):
    for j in ['Surfing', 'Swimming', 'Fishing', 'Spearfishing', 'Wading' 'Snorkeling', 'Diving', 'Standing', 'Scuba diving', 'Body boarding', 
    'Body surfing', 'Boogie boarding', 'Kayaking', 'Bathing', 'Treading water', 'Free diving', 'Surf skiing', 'Windsurfing', 'Pearl diving', 
    'Walking', 'Floating', 'Playing', 'Surf-skiing', 'Rowing', 'Canoeing', 'Paddle boarding', 'Snorkeling', 'Feeding', 'Paddling', 'Filming',
    'Kite Boarding', 'Wading', 'Washing', 'Paddleboarding', 'Escaping from Alacatraz', 'Fell', 'Boat', 'Murder', 'Sculling', 'Sailing', 'Collecting', 'No details',
    'Sea disaster', 'Sitting on surfboard', 'Jumping']:
        
        if x == None:
            return 'No details'
        elif j.lower() in x.lower():
            return j
        

In [553]:
df_sharks['activity2'] = df_sharks['activity'].apply(categorizar_activity)

In [554]:
df_sharks['activity2']

0        Surfing
1       Swimming
2        Fishing
3        Surfing
4       Swimming
          ...   
6846        Fell
6847        None
6848     Surfing
6849        Fell
6850        None
Name: activity2, Length: 5738, dtype: object

In [555]:
for i, j in df_sharks['activity2'].value_counts().items():
    print(i, j)

Surfing 1267
Fishing 1132
Swimming 945
Diving 467
No details 429
Wading 163
Snorkeling 132
Standing 129
Body boarding 70
Boogie boarding 61
Boat 49
Bathing 45
Kayaking 42
Floating 38
Treading water 37
Playing 31
Surf skiing 24
Fell 23
Walking 22
Rowing 21
Sea disaster 19
Feeding 17
Paddling 17
Canoeing 13
Surf-skiing 13
Paddleboarding 12
Paddle boarding 12
Collecting 11
Sitting on surfboard 9
Washing 7
Jumping 6
Murder 6
Filming 6
Sailing 6
Kite Boarding 4
Escaping from Alacatraz 3
Sculling 3


In [557]:
df_sharks['activity2'] = df_sharks['activity2'].fillna('Random Activity')

In [558]:
df_sharks.loc[df_sharks['activity2'] == 'Paddleboarding', 'activity2'] = 'Paddle boarding'

In [559]:
print(len(df_sharks[df_sharks['activity2'].isna()]))
for i, j in df_sharks.loc[df_sharks['activity2'].isna(), ['year', 'activity']].value_counts().items():
    print(i, j)

0


La columna 'name' tiene 157 nulos que rellenaremos con 'Unknow'

In [561]:
df_sharks.fillna({'name':'Unknow'}, inplace=True)

La columna 'sex' tiene 491 registros vacios, voy a investigar un poco pero seguramente le imputare de manera aleatoria el porcentaje de hombres y mujeres que tiene el resto del dataset. 

In [564]:
df_sharks.loc[(df_sharks['sex'].isna()) & df_sharks['name'].str.contains('boat')]['sex'] = 'Boat'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [565]:
total_size = df_sharks[df_sharks['sex'].isna()].shape[0]
porcen_man = df_sharks['sex'].value_counts(normalize=True)['M']
porcen_woman =  df_sharks['sex'].value_counts(normalize=True)['F']
man_size = round(total_size * porcen_man)
woman_size = round(total_size * porcen_woman)

In [566]:
df_sharks.loc[df_sharks['sex'].isna(), 'sex'].sample(total_size).iloc[:man_size + 1] = 'M'
df_sharks.loc[df_sharks['sex'].isna(), 'sex'] = 'F'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [567]:
df_sharks.iloc[132]

date                   1984-12-03 00:00:00
year                                  1984
type                            Watercraft
country                              ITALY
area                        Tyrrhenian Sea
location     Marciana Marina, Isola d'Elba
activity                              Boat
name                                Unknow
sex                                      F
age                                   None
injury                           No injury
fatal_y_n                                N
time                                  None
species                        White shark
activity2                             Boat
Name: 132, dtype: object

In [568]:
df_sharks['sex'].value_counts(normalize=True)


sex
M        0.791565
F        0.207912
N        0.000174
M x 2    0.000174
lli      0.000174
Name: proportion, dtype: float64

Las columnas 'age' y 'species' tienen un porcentaje muy alto de missings pero las imputaré 'Unknow' para saber de las que si tenemos el dato, que caracteristicas y tipos tenemos.

In [569]:
df_sharks.fillna({'age':'Unknow'}, inplace=True)
df_sharks.fillna({'species':'Unknow'}, inplace=True)

La columna 'injury' es una columna que no nos va a dar mucha información ya que tiene una casoistica muy variada, asi que voy a proceder a borrarla.

In [570]:
df_sharks.drop(columns='injury',inplace=True, axis=1)

Y la columna 'time' no creo que nos haga falta a si que me desaré de ella.

In [571]:
df_sharks.drop(columns='time',inplace=True, axis=1)

In [572]:
df_sharks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5738 entries, 0 to 6850
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       5738 non-null   datetime64[ns]
 1   year       5738 non-null   int64         
 2   type       5738 non-null   object        
 3   country    5738 non-null   object        
 4   area       5738 non-null   object        
 5   location   5738 non-null   object        
 6   activity   5310 non-null   object        
 7   name       5738 non-null   object        
 8   sex        5738 non-null   object        
 9   age        5738 non-null   object        
 10  fatal_y_n  5738 non-null   object        
 11  species    5738 non-null   object        
 12  activity2  5738 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(11)
memory usage: 756.6+ KB


In [573]:
df_sharks.to_csv('shark_attack_clean.csv')

In [None]:
from geopy.geocoders import Nominatim
from datetime import datetime
from meteostat import Point, Daily

geolocator = Nominatim(user_agent="geoapi")
location = geolocator.geocode("Yallingup, Busselton, AUSTRALIA")
location.latitude, location.longitude

(-33.6395258, 115.0259595)

In [None]:


geolocator = Nominatim(user_agent="geoapi")
location = geolocator.geocode("Elliston, Australia")

# Set time period
start = datetime(2018, 4, 1)
end = datetime(2018, 4, 1)

In [None]:
def buscar_datos_meteo_cercanos(lat, lon, start, end, pasos=10, offset=0.05):
    """
    Busca datos meteorológicos moviendo ligeramente las coordenadas hasta encontrar datos válidos.
    
    :param lat: Latitud inicial
    :param lon: Longitud inicial
    :param start: Fecha de inicio (datetime)
    :param end: Fecha de fin (datetime)
    :param pasos: Número de pasos en cada dirección
    :param offset: Tamaño de cada desplazamiento (en grados)
    :return: (dataframe de meteostat, coordenadas usadas) o (None, None) si falla
    """
    for i in range(-pasos, pasos + 1):
        for j in range(-pasos, pasos + 1):
            lat_nueva = lat + i * offset
            lon_nueva = lon + j * offset
            punto = Point(lat_nueva, lon_nueva)

            try:
                data = Daily(punto, start, end).fetch()
                if not data.empty:
                    print(f"Datos encontrados en ({lat_nueva:.4f}, {lon_nueva:.4f})")
                    return data, (lat_nueva, lon_nueva)
            except Exception as e:
                continue  # Ignorar errores silenciosamente

    print("No se encontraron datos meteorológicos cercanos.")
    return None, None

In [None]:
x = buscar_datos_meteo_cercanos(location.latitude, location.longitude, start, end)



Datos encontrados en (-33.9402, 134.8849)


In [None]:
x[0].loc[:,'tmax'].iloc[0]

np.float64(25.6)