In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
df = pd.read_csv('../attacks.csv')
df.head()
# numero de columnas y filas del dataframe original
df.shape

(5992, 24)

In [3]:
# numero total de valores nulos
df.isnull().sum().sum()

23109

In [4]:
# numero de valores nulos por columnas
null_cols = df.isnull().sum()
null_cols[null_cols > 0]

Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
href formula                 1
href                         3
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

In [5]:
# Ver el numero de valores no nulos por columna
not_null_cols = df.notnull()
not_null_cols.sum()

Case Number               5992
Date                      5992
Year                      5992
Type                      5992
Country                   5949
Area                      5590
Location                  5496
Activity                  5465
Name                      5792
Sex                       5425
Age                       3311
Injury                    5965
Fatal (Y/N)               5973
Time                      2779
Species                   3058
Investigator or Source    5977
pdf                       5992
href formula              5991
href                      5989
Case Number.1             5992
Case Number.2             5992
original order            5992
Unnamed: 22                  1
Unnamed: 23                  2
dtype: int64

In [6]:
# Renombrar columnas
df = df.rename(columns={"Unnamed: 22":"Unnamed1"})
df = df.rename(columns={"Unnamed: 23":"Unnamed2"})
# Ver cuantos valores no nulos hay en las columnas Unnamed1 y Unnamed2
unnamed_not_null_1 = df[df.Unnamed1.notnull()]
unnamed_not_null_2 = df[df.Unnamed2.notnull()]
unnamed_not_null_1.count()

Case Number               1
Date                      1
Year                      1
Type                      1
Country                   1
Area                      1
Location                  1
Activity                  1
Name                      1
Sex                       1
Age                       1
Injury                    1
Fatal (Y/N)               1
Time                      1
Species                   0
Investigator or Source    1
pdf                       1
href formula              1
href                      1
Case Number.1             1
Case Number.2             1
original order            1
Unnamed1                  1
Unnamed2                  0
dtype: int64

In [7]:
# Elimina Unnamed1 y Unnamed 2 por irrelevantes basado en el hecho de que solo tenian 1 o 2 valores no nulos
df = df.drop("Unnamed1", axis=1)
df = df.drop("Unnamed2", axis=1)

In [8]:
df = df.drop(["href", "href formula", "pdf", "Date", "Sex ", "Investigator or Source", "Case Number", "Case Number.1", "Case Number.2", "Time", "Location", "original order", "Injury", "Name", "Age", "Area", "Year", "Country", ], axis=1)

In [9]:
not_null_cols = df.notnull()
not_null_cols.sum()

Type           5992
Activity       5465
Fatal (Y/N)    5973
Species        3058
dtype: int64

In [10]:
df = df.rename(columns={"Fatal (Y/N)":"Fatal"})

In [11]:
df = df.rename(columns={"Species ":"Species"})

In [12]:
df["Fatal"] = df["Fatal"].astype(str)
fatal_col_nulls = df["Fatal"].isnull().sum()
fatal_col_not_nulls = df["Fatal"].notnull().sum()

print("Fatal total null values:", fatal_col_nulls)
print("Fatal total not null values:", fatal_col_not_nulls)

Fatal total null values: 0
Fatal total not null values: 5992


In [13]:
fatal_set = set(df["Fatal"])
print(fatal_set)

{'nan', 'N', 'F', ' N', 'N ', 'UNKNOWN', '#VALUE!', 'Y', 'n'}


In [14]:
df["Species"].value_counts()
species_set = set(df["Species"])
species_set
df["Species"] = df["Species"].astype(str)

In [15]:
df = df[df.Species != "Shark involvement not confirmed"]
df = df[df.Species != "nan"]
df = df[df.Species != "Lesser spotted dogfish, Scyliorhinus canicula, less than 80 cm in length"]
df = df[df.Species != "NaN"]
df["Species"].value_counts()

White shark                        161
Tiger shark                         68
Bull shark                          52
6' shark                            40
4' shark                            39
                                  ... 
18" to 36" shark                     1
3 m to 3.7 m [10' to 12'] shark      1
300- to 400-lb Zambesi shark         1
small sharks'                        1
20 kg shark                          1
Name: Species, Length: 1536, dtype: int64

In [16]:
df["Fatal"].value_counts()

N          2473
Y           471
UNKNOWN      21
nan           7
 N            5
Name: Fatal, dtype: int64

In [17]:
df["Fatal"] = df["Fatal"].str.replace(" N","N")
df["Fatal"] = df["Fatal"].str.replace("N ","N")
df["Fatal"] = df["Fatal"].str.replace("n","N")

In [18]:
df = df[df.Fatal != "UNKNOWN"]
df = df[df.Fatal != "nan"]
df = df[df.Fatal != "F"]
df = df[df.Fatal != "#VALUE!"]
df = df[df.Fatal != "NaN"]

In [19]:
df["Fatal"].value_counts()

N    2478
Y     471
Name: Fatal, dtype: int64

In [20]:
df["Fatal"].describe()

count     2949
unique       2
top          N
freq      2478
Name: Fatal, dtype: object

In [21]:
df

Unnamed: 0,Type,Activity,Fatal,Species
4,Unprovoked,Surfing,N,2 m shark
6,Unprovoked,Wading,N,3' to 4' shark
7,Unprovoked,Swimming,N,"Tiger shark, 10?"
12,Unprovoked,Spearfishing,N,White shark
13,Unprovoked,Surfing,N,"Bull shark, 6'"
...,...,...,...,...
5966,Unprovoked,Standing,Y,12' tiger shark
5983,Unprovoked,Fishing,Y,Blue pointer
5984,Unprovoked,Fishing,Y,Blue pointer
5985,Unprovoked,Fishing,Y,Blue pointers


In [22]:
df["Type"].value_counts()

Unprovoked      2151
Provoked         395
Boat             161
Invalid          160
Boating           52
Sea Disaster      30
Name: Type, dtype: int64

In [23]:
df = df[df.Type != "Sea Disaster"]
df = df[df.Type != "Invalid"]
df = df[df.Type != "Boat"]
df = df[df.Type != "Boating"]
df["Type"].value_counts()

Unprovoked    2151
Provoked       395
Name: Type, dtype: int64

In [24]:
df["Species"].describe()

count            2546
unique           1323
top       White shark
freq              119
Name: Species, dtype: object

In [25]:
df["Species"].value_counts()

White shark                     119
Tiger shark                      59
Bull shark                       50
4' shark                         39
6' shark                         39
                               ... 
"a blue shark"                    1
"sandshark"                       1
White shark or bronze whaler      1
Wobbegong, 2m                     1
20 kg shark                       1
Name: Species, Length: 1323, dtype: int64

In [26]:
df["Species"] = df["Species"].str.replace("4.2 m white shark", "White shark")

In [27]:
species_set = set(df["Species"])
species_set

{"Said to involve a 6 m to 7 m [20' to 23'] white shark",
 '"reef shark"',
 'White shark, 3m to 4m',
 'Bronze whaler shark, a  juvenile ',
 "Raggedtooth shark, 2 m [6.75'] ",
 "Raggedtooth shark, 1.5 m [5'] ",
 'Blue sharks',
 "Blacktip shark, 2 m [6.75'] ",
 "1.2 m  [4'] shark",
 'Grey reef shark ',
 'White shark, 4.2 m [13\'9"] ',
 "2.7 m [9'] white shark",
 'White shark, identified by tooth pattern',
 "Porbeagle shark, 3 m [10']rk",
 "a small shark'",
 '2 m to 2.5 m shark',
 '200-lb shark',
 "1.2 m [4'] blacktip or sandbar shark",
 "Blue shark, 3 m [10']",
 'Possibly juvenile tiger shark',
 'Copper shark, 50-kg [110-b] ',
 "3 m [10'] gaffed shark",
 "White shark, 2.7 m [9'], 280-lb ",
 "White shark, 2.4 m [8'] ",
 ' Bull shark, 1.2m ',
 'Bull shark, 1m ',
 "12' sandtiger shark",
 "Bull shark, 6' to 7'",
 "6 m [20'] shark",
 'small sharks',
 '4.5 m [14\'9"] white shark',
 '3.5 to 4 m shark',
 "1.5 m [5'] hammerhead shark",
 'Bull shark',
 '"A long thin brown-colored shark"',
 'Bull s

In [40]:
df.loc[df['Species'].str.startswith('Bull')] = 'Bull shark'
df.loc[df['Species'].str.startswith('Tiger')] = 'Tiger shark'
df.loc[df['Species'].str.startswith('White shark')] = 'White shark'
df.loc[df['Species'].str.startswith('Lemon')] = 'Lemon shark'
df.loc[df['Species'].str.startswith('Zambesi')] = 'Zambesi shark'
df.loc[df['Species'].str.startswith('Blue')] = 'Blue shark'
df.loc[df['Species'].str.startswith('\"sand\"')] = 'Sand shark'
df.loc[df['Species'].str.startswith(' Bull')] = 'Bull shark'
df.loc[df['Species'].str.startswith('Bronze')] = 'Bronze whaler shark'
df.loc[df['Species'].str.startswith('Hammerhead')] = 'Hammerhead shark'
df.loc[df['Species'].str.startswith('Raggedtooth')] = 'Raggedtooth shark'

In [41]:
df['Species'].value_counts()

White shark                               331
Tiger shark                               196
Bull shark                                116
Bronze whaler shark                        55
Raggedtooth shark                          41
                                         ... 
small sharks'                               1
300- to 400-lb Zambesi shark                1
3 m to 3.7 m [10' to 12'] shark             1
18" to 36" shark                            1
3.6 m  white shark  (or bronze whaler)      1
Name: Species, Length: 908, dtype: int64

In [42]:
df["Species"] = df["Species"].str.replace("1 m hammerhead shark", "Hammerhead shark")
df["Species"] = df["Species"].str.replace("1.5 m to 1.8 m [5' to 6']  blacktip shark", "Blacktip shark")
df["Species"] = df["Species"].str.replace("1.5 m to 1.8 m [5' to 6'] Zambesi shark", "Zambesi shark")
df["Species"] = df["Species"].str.replace("1.5 m to 1.8 m [5' to 6'] sandbar shark", "Sandbar shark")
df["Species"] = df["Species"].str.replace("\"a black-tipped shark\"", "Black-tipped shark")
df["Species"] = df["Species"].str.replace("\"a blue shark\"", "Blue shark")
df["Species"] = df["Species"].str.replace("\"a dog shark\"", "Dog shark")
df["Species"] = df["Species"].str.replace("\"a little shark\"", "Undefined species")
df["Species"] = df["Species"].str.replace("\"a large shark\"", "Undefined species")
df["Species"] = df["Species"].str.replace("\"a small shark\"", "Undefined species")
df["Species"] = df["Species"].str.replace("\"a young shark\"", "Undefined species")
df["Species"] = df["Species"].str.replace("\"reef shark\"", "Reef shark")
df["Species"] = df["Species"].str.replace("\"gummy\" shark (Rhizoprionodon or Loxodon) 1.2 m [4\']", "Undefined shark")
df["Species"] = df["Species"].str.replace("\"black tipped\" shark", "Black-tipped shark")
df["Species"] = df['Species'].str.replace('\"gray shark\"', 'Gray shark')
df["Species"] = df['Species'].str.replace('\"grey shark\"', 'Gray shark')

In [43]:
species_set2 = set(df["Species"])
species_set2

{' ',
 ' "gummy" shark (Rhizoprionodon or Loxodon) 1.2 m [4\']',
 " 1.5 m [5'] dusky shark",
 " Blacktip shark, C. maculipinnis. 1.9 m to 2.1 m [6.5' to 7'] ",
 " Galapagos shark, 6'",
 " Grey nurse shark,  3 m [10'] ",
 ' Lemon shark, 3.5 m',
 ' Raggedtooth shark, 2m',
 " Sandbar shark, 3' to 4'",
 " Sevengill  shark, 1.2 m [4'] ",
 ' Tawney nurse shark, 1m',
 ' Tiger shark, 2.8m',
 " Tiger shark, 3'",
 " White shark, 7' ",
 ' Wobbegong shark, 1.6 to 1.8m ',
 ' a small shark',
 ' reef shark, 1.8m',
 " white shark, 15' ",
 '"A 2\' (0.6 m) brown shark"',
 '"A long thin brown-colored shark"',
 '"A pack of sharks"',
 '"A small shark"',
 '"Attacked by a number of sharks"',
 '"Blue nose shark"',
 '"Blue nose sharks"',
 '"Blue whaler" (Galeolamna)',
 '"Dog shark"',
 '"Shark caught later"',
 '"Shark had a very large girth"',
 '"The fish was harpooned, dried, and presented to the sailor, who went round Europe exhibiting it  It was said to be 20 feet long.',
 '"grey-colored shark"',
 '"juvenile

In [None]:
Activity_set = set(df["Activity"])
Activity_set
df = df.drop("Activity", axis=1)

In [None]:
df["Type"].describe()

In [None]:
df["Fatal"].describe()

In [None]:
df

In [None]:
df.describe()

In [None]:
unprov_not_fatal = df[(df["Type"] == "Unprovoked") & (df["Fatal"] == "N")]

In [None]:
unprov_not_fatal.describe()

In [None]:
unprov_fatal = df[(df["Type"] == "Unprovoked") & (df["Fatal"] == "Y")]
unprov_fatal.describe()

In [None]:
prov_not_fatal = df[(df["Type"] == "Provoked") & (df["Fatal"] == "N")]
prov_not_fatal.describe()

In [None]:
prov_fatal = df[(df["Type"] == "Provoked") & (df["Fatal"] == "Y")]
prov_fatal.describe()

In [None]:
import matplotlib.pyplot as plt

# Data to plot
labels = 'Unprovoked not fatal', 'Unprovoked fatal', 'Provoked not fatal', 'Provoked fatal'
sizes = [1764, 381, 388, 6]
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
explode = (0, 0, 0, 0.1)  # explode 1st slice

# Plot
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
autopct='%1.1f%%', shadow=False, startangle=140)

plt.axis('equal')
plt.show()

In [None]:
# Basado en la data filtrada por categorias (ataques provocados o no provocados y ataque fatal o no fatal)
# podemos observar que de 2539 ataques 2145 fueron no provocados (84,5%) y respectivamente 
# 394 ataques fueron provocados (15.5%).

# De los ataques no provocados 1764 no fueron fatales (69.5%) y 381 fueron fatales (0.2%).
# De los ataques provocados 388 no fueron fatales (15.3%) y 6 fueron fatales (0.2%).

# Lo cual me basta para concluir que el tiburon 