In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
df = pd.read_csv('../attacks.csv')
df.head()
# numero de columnas y filas del dataframe original
df.shape

(5992, 24)

In [3]:
# numero total de valores nulos
df.isnull().sum().sum()

23109

In [4]:
# numero de valores nulos por columnas
null_cols = df.isnull().sum()
null_cols[null_cols > 0]

Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
href formula                 1
href                         3
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

In [5]:
# Ver el numero de valores no nulos por columna
not_null_cols = df.notnull()
not_null_cols.sum()

Case Number               5992
Date                      5992
Year                      5992
Type                      5992
Country                   5949
Area                      5590
Location                  5496
Activity                  5465
Name                      5792
Sex                       5425
Age                       3311
Injury                    5965
Fatal (Y/N)               5973
Time                      2779
Species                   3058
Investigator or Source    5977
pdf                       5992
href formula              5991
href                      5989
Case Number.1             5992
Case Number.2             5992
original order            5992
Unnamed: 22                  1
Unnamed: 23                  2
dtype: int64

In [6]:
# Renombrar columnas
df = df.rename(columns={"Unnamed: 22":"Unnamed1"})
df = df.rename(columns={"Unnamed: 23":"Unnamed2"})
# Ver cuantos valores no nulos hay en las columnas Unnamed1 y Unnamed2
unnamed_not_null_1 = df[df.Unnamed1.notnull()]
unnamed_not_null_2 = df[df.Unnamed2.notnull()]
unnamed_not_null_1.count()

Case Number               1
Date                      1
Year                      1
Type                      1
Country                   1
Area                      1
Location                  1
Activity                  1
Name                      1
Sex                       1
Age                       1
Injury                    1
Fatal (Y/N)               1
Time                      1
Species                   0
Investigator or Source    1
pdf                       1
href formula              1
href                      1
Case Number.1             1
Case Number.2             1
original order            1
Unnamed1                  1
Unnamed2                  0
dtype: int64

In [7]:
# Elimina Unnamed1 y Unnamed 2 por irrelevantes basado en el hecho de que solo tenian 1 o 2 valores no nulos
df = df.drop("Unnamed1", axis=1)
df = df.drop("Unnamed2", axis=1)

In [8]:
df = df.drop(["href", "href formula", "pdf", "Date", "Sex ", "Investigator or Source", "Case Number", "Case Number.1", "Case Number.2", "Time", "Location", "original order", "Injury", "Name", "Age", "Area", "Year", "Country", ], axis=1)

In [9]:
not_null_cols = df.notnull()
not_null_cols.sum()

Type           5992
Activity       5465
Fatal (Y/N)    5973
Species        3058
dtype: int64

In [10]:
df = df.rename(columns={"Fatal (Y/N)":"Fatal"})

In [11]:
df = df.rename(columns={"Species ":"Species"})

In [12]:
df["Fatal"] = df["Fatal"].astype(str)
fatal_col_nulls = df["Fatal"].isnull().sum()
fatal_col_not_nulls = df["Fatal"].notnull().sum()

print("Fatal total null values:", fatal_col_nulls)
print("Fatal total not null values:", fatal_col_not_nulls)

Fatal total null values: 0
Fatal total not null values: 5992


In [13]:
fatal_set = set(df["Fatal"])
print(fatal_set)

{'Y', 'n', 'UNKNOWN', ' N', 'N', 'N ', '#VALUE!', 'F', 'nan'}


In [14]:
df["Species"].value_counts()
species_set = set(df["Species"])
species_set
df["Species"] = df["Species"].astype(str)

In [15]:
df = df[df.Species != "Shark involvement not confirmed"]
df = df[df.Species != "nan"]
df = df[df.Species != "Lesser spotted dogfish, Scyliorhinus canicula, less than 80 cm in length"]
df = df[df.Species != "NaN"]
df["Species"].value_counts()

White shark                                                             161
Tiger shark                                                              68
Bull shark                                                               52
6' shark                                                                 40
4' shark                                                                 39
                                                                       ... 
1.2 m [4'] hammerhead shark                                               1
3.7 m [12'] sharks                                                        1
2.4 m [8'] white shark, species identity confirmed by tooth fragment      1
2.7 m [9'] shark with black-tipped pectoral fins                          1
White shark, 3.8 m [12.5']                                                1
Name: Species, Length: 1536, dtype: int64

In [16]:
df["Fatal"].value_counts()

N          2473
Y           471
UNKNOWN      21
nan           7
 N            5
Name: Fatal, dtype: int64

In [17]:
df["Fatal"] = df["Fatal"].str.replace(" N","N")
df["Fatal"] = df["Fatal"].str.replace("N ","N")
df["Fatal"] = df["Fatal"].str.replace("n","N")

In [18]:
df = df[df.Fatal != "UNKNOWN"]
df = df[df.Fatal != "nan"]
df = df[df.Fatal != "F"]
df = df[df.Fatal != "#VALUE!"]
df = df[df.Fatal != "NaN"]

In [19]:
df["Fatal"].value_counts()

N    2478
Y     471
Name: Fatal, dtype: int64

In [20]:
df["Fatal"].describe()

count     2949
unique       2
top          N
freq      2478
Name: Fatal, dtype: object

In [21]:
df

Unnamed: 0,Type,Activity,Fatal,Species
4,Unprovoked,Surfing,N,2 m shark
6,Unprovoked,Wading,N,3' to 4' shark
7,Unprovoked,Swimming,N,"Tiger shark, 10?"
12,Unprovoked,Spearfishing,N,White shark
13,Unprovoked,Surfing,N,"Bull shark, 6'"
...,...,...,...,...
5966,Unprovoked,Standing,Y,12' tiger shark
5983,Unprovoked,Fishing,Y,Blue pointer
5984,Unprovoked,Fishing,Y,Blue pointer
5985,Unprovoked,Fishing,Y,Blue pointers


In [22]:
df["Type"].value_counts()

Unprovoked      2151
Provoked         395
Boat             161
Invalid          160
Boating           52
Sea Disaster      30
Name: Type, dtype: int64

In [23]:
df = df[df.Type != "Sea Disaster"]
df = df[df.Type != "Invalid"]
df = df[df.Type != "Boat"]
df = df[df.Type != "Boating"]
df["Type"].value_counts()

Unprovoked    2151
Provoked       395
Name: Type, dtype: int64

In [24]:
df["Species"].describe()

count            2546
unique           1323
top       White shark
freq              119
Name: Species, dtype: object

In [25]:
df["Species"].value_counts()

White shark                                              119
Tiger shark                                               59
Bull shark                                                50
4' shark                                                  39
6' shark                                                  39
                                                        ... 
White shark, 5.5 m to 6 m [18' to 20']                     1
Whitetip reef shark                                        1
5' to 7' shark                                             1
Bronze whaler shark, 3 m [10'], 200-lb                     1
Said to involve a 6 m to 7 m [20' to 23'] white shark      1
Name: Species, Length: 1323, dtype: int64

In [26]:
df["Species"] = df["Species"].str.replace("4.2 m white shark", "White shark")

In [27]:
species_set = set(df["Species"])
species_set

{'Thought to involve a Zambesi shark',
 '2 m hammerhead',
 '3 m, 600-kg shark',
 'Gray reef shark',
 "Whaler shark, 4 m [13'] ",
 "Sevengill shark, 14', was caught in the vicinity",
 "12' shark",
 "White shark, 3 m [10']rk",
 'Dusky shark, 3m',
 '4.3 m shark',
 'Hammerhead shark',
 '200 to 300 kg shark',
 '4.5 m & 5 m white shark ',
 "White shark, 3.5 m [11.5'], species identity confirmed by tooth fragment",
 '4.9 m white shark',
 "Tiger shark, 12' ?",
 "4.4 m [14'] shark",
 '1.8 to 2 m C. albimarginatus',
 "2.1 m to 2.4 m [7' to 8'] shark",
 "1.2 m [4'] bull shark",
 "Grey nurse shark, 2.6 m [8.5']  ",
 "7' shark",
 "White shark, 2.4 m [8']",
 'Wobbegong, 2m',
 'Wobbegong shark ',
 "1.8 m [6'] dogfish",
 "Bull shark, 5'",
 "Nurse shark, 1.8 m [6'] ",
 "2.7 m  [9'] shark",
 "1.5 m [5'] shark, probable bull shark",
 "White shark, 13'",
 "1.8 m [6'] shark, species identity questionable",
 "1.2 m to 1.5 m [4' to 5'] shark",
 "1.5 to 2 m [5' to 6.75'] shark",
 'Bull shark, 2m ',
 'Tiger sh

In [28]:
df.loc[df['Species'].str.startswith('Bull')] = 'Bull shark'
df.loc[df['Species'].str.startswith('Tiger')] = 'Tiger shark'
df.loc[df['Species'].str.startswith('White shark')] = 'White shark'
df.loc[df['Species'].str.startswith('Lemon')] = 'Lemon shark'
df.loc[df['Species'].str.startswith('Zambesi')] = 'Zambesi shark'
df.loc[df['Species'].str.startswith('Blue')] = 'Blue shark'
df.loc[df['Species'].str.startswith('\"sand\"')] = 'Sand shark'
df.loc[df['Species'].str.startswith(' Bull')] = 'Bull shark'
df.loc[df['Species'].str.startswith('Bronze')] = 'Bronze whaler shark'
df.loc[df['Species'].str.startswith('Hammerhead')] = 'Hammerhead shark'
df.loc[df['Species'].str.startswith('Raggedtooth')] = 'Raggedtooth shark'
df.loc[df['Species'].str.startswith(' Sandbar')] = 'Sandbar shark'
df.loc[df['Species'].str.startswith(' Sevengill')] = 'Sevengill shark'
df.loc[df['Species'].str.startswith(' Raggedtooth')] = 'Raggedtooth shark'
df.loc[df['Species'].str.startswith(' Tiger')] = 'Tiger shark'
df.loc[df['Species'].str.startswith(' White')] = 'White shark'
df.loc[df['Species'].str.startswith(' Blacktip')] = 'Blacktip shark'
df.loc[df['Species'].str.startswith(' Galapagos')] = 'Galapagos shark'
df.loc[df['Species'].str.startswith(' White')] = 'White shark'
df.loc[df['Species'].str.startswith('Carpet')] = 'Carpet shark'
df.loc[df['Species'].str.startswith('Caribbean')] = 'Caribbean reef shark'
df.loc[df['Species'].str.startswith('Copper')] = 'Copper shark'
df.loc[df['Species'].str.startswith('Dusky')] = 'Dusky shark'
df.loc[df['Species'].str.startswith('Grey reef')] = 'Grey reef shark'
df.loc[df['Species'].str.startswith('Grey nurse')] = 'Grey nurse shark'
df.loc[df['Species'].str.startswith('Oceanic')] = 'Oceanic whitetip shark'
df.loc[df['Species'].str.startswith('Wobbegong')] = 'Wobbegong whitetip shark'
df.loc[df['Species'].str.startswith('Zambesi')] = 'Zambesi shark'
df.loc[df['Species'].str.startswith('Unknown')] = 'Unidentified species'
df.loc[df['Species'].str.startswith('Whitetip')] = 'Whitetip reef shark'
df.loc[df['Species'].str.startswith('Whaler')] = 'Whaler shark'
df.loc[df['Species'].str.startswith('reef')] = 'Reef shark'
df.loc[df['Species'].str.startswith('Thought')] = 'Unidentified species'
df.loc[df['Species'].str.startswith('Spinner')] = 'Spinner shark'
df.loc[df['Species'].str.startswith('Tawny')] = 'Tawny shark'
df.loc[df['Species'].str.startswith('Mako')] = 'Mako shark'
df.loc[df['Species'].str.startswith('Nurse')] = 'Nurse shark'
df.loc[df['Species'].str.startswith('Reef')] = 'Reef shark'
df.loc[df['Species'].str.startswith('Sand')] = 'Sand shark'
df.loc[df['Species'].str.startswith('Unidentified')] = 'Unidentified species'
df.loc[df['Species'].str.startswith('Possibly')] = 'Unidentified species'
df.loc[df['Species'].str.startswith('Said')] = 'Unidentified species'
df.loc[df['Species'].str.startswith('Sevengill')] = 'Sevengill shark'
df.loc[df['Species'].str.startswith('Silky')] = 'Silky shark'
df.loc[df['Species'].str.startswith('Species')] = 'Unidentified species'
df.loc[df['Species'].str.startswith('Spurdog')] = 'Spurdog shark'
df.loc[df['Species'].str.startswith('Seven-gill')] = 'Sevengill shark'
df.loc[df['Species'].str.startswith('Reported')] = 'Unidentified species'
df.loc[df['Species'].str.startswith('Remains')] = 'Unidentified species'
df.loc[df['Species'].str.startswith('Shortfin')] = 'Mako shark'
df.loc[df['Species'].str.startswith('Zambezi')] = 'Zambesi shark'
df.loc[df['Species'].str.startswith('said')] = 'Unidentified species'
df.loc[df['Species'].str.startswith('possibly')] = 'Unidentified species'
df.loc[df['Species'].str.startswith('Though')] = 'Unidentified species'
df.loc[df['Species'].str.startswith('dogfish')] = 'Dogfish shark'
df.loc[df['Species'].str.startswith('bull')] = 'Bull shark'
df.loc[df['Species'].str.startswith('nurse')] = 'Nurse shark'
df.loc[df['Species'].str.startswith('sandtiger')] = 'Sandtiger shark'
df.loc[df['Species'].str.startswith('sand')] = 'Sand shark'
df.loc[df['Species'].str.startswith('Undefined')] = 'Unidentified species'


In [29]:
df['Species'].value_counts()

White shark                          332
Tiger shark                          198
Unidentified species                 128
Bull shark                           119
Bronze whaler shark                   55
                                    ... 
2 white shark: 13' & 9"8"              1
1.8 m [6'] grey reef shark             1
1.2 m [4'] shark (spinner shark?)      1
2.5' shark                             1
7' to 8' shark                         1
Name: Species, Length: 659, dtype: int64

In [30]:
df["Species"] = df["Species"].str.replace("1 m hammerhead shark", "Hammerhead shark")
df["Species"] = df["Species"].str.replace("1.5 m to 1.8 m [5' to 6']  blacktip shark", "Blacktip shark")
df["Species"] = df["Species"].str.replace("1.5 m to 1.8 m [5' to 6'] Zambesi shark", "Zambesi shark")
df["Species"] = df["Species"].str.replace("1.5 m to 1.8 m [5' to 6'] sandbar shark", "Sandbar shark")
df["Species"] = df["Species"].str.replace("\"a black-tipped shark\"", "Blacktip shark")
df["Species"] = df["Species"].str.replace("\"a blue shark\"", "Blue shark")
df["Species"] = df["Species"].str.replace("\"a dog shark\"", "Dog shark")
df["Species"] = df["Species"].str.replace("\"a little shark\"", "Unidentified species")
df["Species"] = df["Species"].str.replace("\"a large shark\"", "Unidentified species")
df["Species"] = df["Species"].str.replace("\"a small shark\"", "Unidentified species")
df["Species"] = df["Species"].str.replace("\"a young shark\"", "Unidentified species")
df["Species"] = df["Species"].str.replace("\"reef shark\"", "Reef shark")
df["Species"] = df["Species"].str.replace("\"gummy\" shark (Rhizoprionodon or Loxodon) 1.2 m [4\']", "Unidentified species")
df["Species"] = df["Species"].str.replace("\"black tipped\" shark", "Black-tipped shark")
df["Species"] = df['Species'].str.replace('\"gray shark\"', 'Grey shark')
df["Species"] = df['Species'].str.replace('\"grey shark\"', 'Grey shark')
df["Species"] = df['Species'].str.replace('  "gummy" shark (Rhizoprionodon or Loxodon) 1.2 m [4\']', 'Unidentified species')
df["Species"] = df['Species'].str.replace('Juvenile dusky or blacktip shark', 'Unidentified species')
df["Species"] = df['Species'].str.replace('Wobbegong shark?', 'Unidentified species')
df["Species"] = df['Species'].str.replace('Zambezi shark (tooth fragments recovered)',  'Zambesi shark')
df["Species"] = df['Species'].str.replace('Whtietip reef shark',  'Whitetip reef shark')

df["Species"] = df['Species'].str.replace('Undefined shark',  'Unidentified species')
df["Species"] = df['Species'].str.replace('Unidentified shark',  'Unidentified species')
df["Species"] = df['Species'].str.replace('Undefined species',  'Unidentified species')

df["Species"] = df['Species'].str.replace('a sand shark',  'Sand shark')
df["Species"] = df['Species'].str.replace('a small reef shark',  'Reef shark')
df["Species"] = df['Species'].str.replace('Provoked',  'a')
df["Species"] = df['Species'].str.replace('Unprovoked',  'a')


In [31]:
df["Species"] = df[df.Species != 'Tooth fragments of �whaler� shark were recovered, a bull shark, according to Edwards']
df["Species"] = df[df.Species != 'Provoked']
df["Species"] = df[df.Species != 'Unprovoked']
df["Species"] = df[df.Species != 'nan']

In [32]:
species_set2 = set(df["Species"])
species_set2

{'Blacktip shark',
 'Blue shark',
 'Bronze whaler shark',
 'Bull shark',
 'Caribbean reef shark',
 'Carpet shark',
 'Copper shark',
 'Dogfish shark',
 'Dusky shark',
 'Galapagos shark',
 'Grey nurse shark',
 'Grey reef shark',
 'Hammerhead shark',
 'Lemon shark',
 'Mako shark',
 'Nurse shark',
 'Oceanic whitetip shark',
 'Provoked',
 'Raggedtooth shark',
 'Reef shark',
 'Sand shark',
 'Sandtiger shark',
 'Sevengill shark',
 'Silky shark',
 'Spinner shark',
 'Spurdog shark',
 'Tawny shark',
 'Tiger shark',
 'Unidentified species',
 'Unprovoked',
 'Whaler shark',
 'White shark',
 'Whitetip reef shark',
 'Wobbegong whitetip shark',
 'Zambesi shark'}

In [None]:
Activity_set = set(df["Activity"])
Activity_set
df = df.drop("Activity", axis=1)

In [None]:
df["Type"].describe()

In [None]:
df["Fatal"].describe()

In [None]:
df

In [None]:
df.describe()

In [None]:
unprov_not_fatal = df[(df["Type"] == "Unprovoked") & (df["Fatal"] == "N")]

In [None]:
unprov_not_fatal.describe()

In [None]:
unprov_fatal = df[(df["Type"] == "Unprovoked") & (df["Fatal"] == "Y")]
unprov_fatal.describe()

In [None]:
prov_not_fatal = df[(df["Type"] == "Provoked") & (df["Fatal"] == "N")]
prov_not_fatal.describe()

In [None]:
prov_fatal = df[(df["Type"] == "Provoked") & (df["Fatal"] == "Y")]
prov_fatal.describe()

In [None]:
import matplotlib.pyplot as plt

# Data to plot
labels = 'Unprovoked not fatal', 'Unprovoked fatal', 'Provoked not fatal', 'Provoked fatal'
sizes = [1764, 381, 388, 6]
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
explode = (0, 0, 0, 0.1)  # explode 1st slice

# Plot
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
autopct='%1.1f%%', shadow=False, startangle=140)

plt.axis('equal')
plt.show()

In [None]:
# Basado en la data filtrada por categorias (ataques provocados o no provocados y ataque fatal o no fatal)
# podemos observar que de 2539 ataques 2145 fueron no provocados (84,5%) y respectivamente 
# 394 ataques fueron provocados (15.5%).

# De los ataques no provocados 1764 no fueron fatales (69.5%) y 381 fueron fatales (0.2%).
# De los ataques provocados 388 no fueron fatales (15.3%) y 6 fueron fatales (0.2%).

# Lo cual me basta para concluir que el tiburon 