# Shark Attack Project - Team 1

In [1]:
import pandas as pd
import os

In [2]:
url = "https://www.sharkattackfile.net/spreadsheets/GSAF5.xls"
original_df = pd.read_excel(url)

In [3]:
# create working copy of original dataframe
df = original_df.copy()

In [4]:
# get first impression of dataset
df.head(3)

Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,...,Species,Source,pdf,href formula,href,Case Number,Case Number.1,original order,Unnamed: 21,Unnamed: 22
0,15 Mar 2024,2024.0,Unprovoked,AUSTRALIA,Queensland,Bargara Beach,Swimming,Brooklyn Sauer,F,13.0,...,Tiger shark,"Yahoo News, 3/15/2024",,,,,,,,
1,04 Mar 2024,2024.0,Unprovoked,USA,Hawaii,"Old Man's, Waikiki",Surfing,Matthew White,M,,...,Tiger shark 8',"Surfer, 3/6/2024F",,,,,,,,
2,02 Mar-2024,2024.0,Unprovoked,USA,Hawaii,"Rainbows, Oahu",Swimming,,F,11.0,...,3' to 4' shark,"Hawaii News Now, 3/4/2024",,,,,,,,


In [5]:
df.shape

(6969, 23)

In [6]:
# clean column names
df.columns

Index(['Date', 'Year', 'Type', 'Country', 'State', 'Location', 'Activity',
       'Name', 'Sex', 'Age', 'Injury', 'Unnamed: 11', 'Time', 'Species ',
       'Source', 'pdf', 'href formula', 'href', 'Case Number', 'Case Number.1',
       'original order', 'Unnamed: 21', 'Unnamed: 22'],
      dtype='object')

In [7]:
# remove whitespice behind column name for species
df = df.rename(columns = {"Species ": "Species"})

In [8]:
# choose certain variables that we are of interest for our problem statement
df = df[["Date", "Year", "Country", "Location", "Injury", "Species", "Activity"]]

In [9]:
# drop all rows where all columns are NaN
df = df.dropna(how='all')
df.shape

(6944, 7)

In [10]:
# drop all rows where 'Year' is NaN
df = df.dropna(subset = ['Year'])
df.head()


Unnamed: 0,Date,Year,Country,Location,Injury,Species,Activity
0,15 Mar 2024,2024.0,AUSTRALIA,Bargara Beach,"Minor injuries to back, abdomen and legs",Tiger shark,Swimming
1,04 Mar 2024,2024.0,USA,"Old Man's, Waikiki","No injury, shark bit surfboard",Tiger shark 8',Surfing
2,02 Mar-2024,2024.0,USA,"Rainbows, Oahu",Lacerations to left foot,3' to 4' shark,Swimming
3,25 Feb-2024,2024.0,AUSTRALIA,"Sandlnd Island, Jurian Bay",Leg bitten,Tiger shark,
4,14 Feb-2024,2024.0,INDIA,"Vaitarna River, Palghar District",Calf of lower left leg injured,"Bull shark, 7'",Fishing


In [11]:
# correct the way the Years are written (for example 2023.0 to 2023) [from string to integer]
df['Year'] = df['Year'].astype(int)
df.head(3)

Unnamed: 0,Date,Year,Country,Location,Injury,Species,Activity
0,15 Mar 2024,2024,AUSTRALIA,Bargara Beach,"Minor injuries to back, abdomen and legs",Tiger shark,Swimming
1,04 Mar 2024,2024,USA,"Old Man's, Waikiki","No injury, shark bit surfboard",Tiger shark 8',Surfing
2,02 Mar-2024,2024,USA,"Rainbows, Oahu",Lacerations to left foot,3' to 4' shark,Swimming


In [12]:
# only consider last 10 years
df = df[(df['Year'] >= 2015)]

In [13]:
# check how many incidents in each year
df['Year'].value_counts()

Year
2015    143
2017    139
2016    131
2018    124
2019    114
2021    109
2023    108
2020    101
2022     98
2024     10
Name: count, dtype: int64

In [14]:
# check which countries have high incident number
df.Country.value_counts()[0:5]

Country
USA             508
AUSTRALIA       248
BAHAMAS          43
SOUTH AFRICA     43
BRAZIL           21
Name: count, dtype: int64

In [15]:
# check if hotspot exists
df['Location'].value_counts()[0:5]

Location
New Smyrna Beach, Volusia County    48
Cocoa Beach, Brevard  County        13
Ponce Inlet, Volusia County         11
Jacksonville Beach, Duval County     7
Melbourne Beach, Brevard County      6
Name: count, dtype: int64

In [16]:
# explore injury types
df['Injury'].value_counts()[0:10]

Injury
FATAL                       77
Foot bitten                 20
Leg bitten                  12
Lacerations to left foot     9
Minor injury to foot         9
No injury                    8
No injury, board bitten      8
Hand bitten                  7
Lacerations to foot          7
Minor injuries               7
Name: count, dtype: int64

In [17]:
df['Injury'].nunique()

752

In [18]:
# classify injury types
injury_classification = ["No Injury", "Minor", "Fatal", "Lacerations", "Bitten", "Injured", "Punctures"]
df['Injury'] = df['Injury'].apply(lambda x: "No Injury" if "no injury" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "No Injury" if "nor injured" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "Minor" if "minor" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "Fatal" if "fatal" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "Lacerations" if "laceration" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "Bitten" if "bitten" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "Bitten" if "bite" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "Injured" if "injured" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "Injured" if "injury" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "Injured" if "injuries" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "Punctures" if "puncture" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "Punctures" if "puncture" in str(x).lower() else x)
df['Injury'] = df['Injury'].apply(lambda x: "Others" if x not in injury_classification else x)

In [19]:
df['Injury'].value_counts()

Injury
Injured        315
Lacerations    217
Bitten         183
Minor          133
Others         100
Fatal           90
Punctures       39
Name: count, dtype: int64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1077 entries, 0 to 1077
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      1077 non-null   object
 1   Year      1077 non-null   int64 
 2   Country   1076 non-null   object
 3   Location  1037 non-null   object
 4   Injury    1077 non-null   object
 5   Species   635 non-null    object
 6   Activity  1024 non-null   object
dtypes: int64(1), object(6)
memory usage: 67.3+ KB


In [21]:
df.isnull().sum()

Date          0
Year          0
Country       1
Location     40
Injury        0
Species     442
Activity     53
dtype: int64

In [22]:
df.Country.value_counts()

Country
USA                 508
AUSTRALIA           248
BAHAMAS              43
SOUTH AFRICA         43
BRAZIL               21
                   ... 
PORTUGAL              1
TURKS and CaICOS      1
South Africa          1
Maldives              1
ATLANTIC OCEAN        1
Name: count, Length: 66, dtype: int64

In [23]:
#Group by 'Location' and 'Country' and count the occurrences(beachs in USA and AUSTRALIA)

# Filter the DataFrame for the desired countries
filtered_countries = df[df['Country'].isin(['USA', 'AUSTRALIA'])]

# Group by 'Location' and 'Country' and count the occurrences
grouped_location = filtered_countries.groupby(['Location', 'Country']).size().unstack(fill_value=0)

print(grouped_location["AUSTRALIA"].sum(), grouped_location["USA"].sum())

246 503


In [24]:
df.Country.value_counts()

Country
USA                 508
AUSTRALIA           248
BAHAMAS              43
SOUTH AFRICA         43
BRAZIL               21
                   ... 
PORTUGAL              1
TURKS and CaICOS      1
South Africa          1
Maldives              1
ATLANTIC OCEAN        1
Name: count, Length: 66, dtype: int64

In [25]:
# only consider incidents in USA

df_usa = df.loc[df['Country'] == 'USA']

df_usa.head()

Unnamed: 0,Date,Year,Country,Location,Injury,Species,Activity
1,04 Mar 2024,2024,USA,"Old Man's, Waikiki",Injured,Tiger shark 8',Surfing
2,02 Mar-2024,2024,USA,"Rainbows, Oahu",Lacerations,3' to 4' shark,Swimming
10,30 Dec-2023,2023,USA,"Baby Beach, Maui",Fatal,,Surfing
24,05 Nov-2023,2023,USA,"Juno Beach, Palm Beach County",Lacerations,,Swimming
29,25 Oct 2023,2023,USA,"Pua’ena Point, Haleiwa, Oahu",Bitten,8' tiger shark,Surfing


In [26]:
df_usa.Species.value_counts().head(10)

Species
White shark                        18
Shark involvement not confirmed    12
Tiger shark                        10
4' shark                            9
Bull shark                          7
4' to 5' shark                      6
3' to 4' shark                      6
Blacktip shark                      5
8' shark                            5
5' shark                            5
Name: count, dtype: int64

In [27]:
#Formating the date column

import re

df_usa['Date'] = df_usa['Date'].apply(lambda x: re.sub(r'[-\s,]', '', str(x)))
df_usa['Date'] = df_usa['Date'].apply(lambda x: re.sub(r'(\d{4})', r'\1-', str(x)))
df_usa['Date'] = pd.to_datetime(df_usa['Date'], errors='coerce')
df_usa['Date'] = df_usa['Date'].dt.strftime('%d-%m-%y')


df_usa.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usa['Date'] = df_usa['Date'].apply(lambda x: re.sub(r'[-\s,]', '', str(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usa['Date'] = df_usa['Date'].apply(lambda x: re.sub(r'(\d{4})', r'\1-', str(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usa['Date'] = pd.to_datetime(df_usa['Da

Unnamed: 0,Date,Year,Country,Location,Injury,Species,Activity
1,04-03-24,2024,USA,"Old Man's, Waikiki",Injured,Tiger shark 8',Surfing
2,02-03-24,2024,USA,"Rainbows, Oahu",Lacerations,3' to 4' shark,Swimming
10,30-12-23,2023,USA,"Baby Beach, Maui",Fatal,,Surfing
24,05-11-23,2023,USA,"Juno Beach, Palm Beach County",Lacerations,,Swimming
29,25-10-23,2023,USA,"Pua’ena Point, Haleiwa, Oahu",Bitten,8' tiger shark,Surfing


In [39]:
df_usa.dtypes

Date        datetime64[ns]
Year                 int64
Country             object
Location            object
Injury              object
Species             object
Activity            object
Season              object
dtype: object

In [41]:
def get_season(Date):
    if pd.isna(Date):
        return 'Unknown'
    month = pd.to_datetime(Date).month #Convert Date to datetime object and extract the month
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Fall'

df_usa['Season'] = df_usa['Date'].apply(get_season)

df_usa.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usa['Season'] = df_usa['Date'].apply(get_season)


Unnamed: 0,Date,Year,Country,Location,Injury,Species,Activity,Season
1,2024-04-03,2024,USA,"Old Man's, Waikiki",Injured,Tiger shark 8',Surfing,Spring
2,2024-02-03,2024,USA,"Rainbows, Oahu",Lacerations,3' to 4' shark,Swimming,Winter
10,2023-12-30,2023,USA,"Baby Beach, Maui",Fatal,,Surfing,Winter
24,2023-05-11,2023,USA,"Juno Beach, Palm Beach County",Lacerations,,Swimming,Spring
29,2023-10-25,2023,USA,"Pua’ena Point, Haleiwa, Oahu",Bitten,8' tiger shark,Surfing,Fall


In [None]:
df_usa.Date.value_counts()

Date
20-09-15    4
02-09-17    3
03-08-19    3
23-10-18    3
18-09-16    3
           ..
02-08-20    1
10-08-20    1
19-08-20    1
20-08-20    1
01-01-15    1
Name: count, Length: 427, dtype: int64

In [None]:
#Standardize the species names

df_usa['Species'] = df_usa['Species'].apply(lambda x: "White Shark" if "white shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Tiger Shark" if "tiger shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Bull Shark" if "bull shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Nurse Shark" if "nurse shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Blacktip Shark" if "blacktip shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Hammerhead Shark" if "hammerhead shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Mako Shark" if "mako shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Lemon Shark" if "lemon shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Blue Shark" if "blue shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Spinner Shark" if "spinner shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Sand Tiger Shark" if "sand tiger shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Dusky Shark" if "dusky shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Caribbean Reef Shark" if "caribbean reef shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Galapagos Shark" if "galapagos shark" in str(x).lower() else x)
df_usa['Species'] = df_usa['Species'].apply(lambda x: "Zambesi Shark" if "zambesi shark" in str(x).lower() else x)

df_usa.Species.value_counts().head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usa['Species'] = df_usa['Species'].apply(lambda x: "White Shark" if "white shark" in str(x).lower() else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usa['Species'] = df_usa['Species'].apply(lambda x: "Tiger Shark" if "tiger shark" in str(x).lower() else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

Species
White Shark                        42
Tiger Shark                        40
Bull Shark                         26
Blacktip Shark                     21
Shark involvement not confirmed    12
Spinner Shark                      11
Nurse Shark                        11
4' shark                            9
3' to 4' shark                      6
4' to 5' shark                      6
Name: count, dtype: int64

In [None]:
df_usa.head()

Unnamed: 0,Date,Year,Country,Location,Injury,Species,Activity
1,04-03-24,2024,USA,"Old Man's, Waikiki",Injured,Tiger Shark,Surfing
2,02-03-24,2024,USA,"Rainbows, Oahu",Lacerations,3' to 4' shark,Swimming
10,30-12-23,2023,USA,"Baby Beach, Maui",Fatal,,Surfing
24,05-11-23,2023,USA,"Juno Beach, Palm Beach County",Lacerations,,Swimming
29,25-10-23,2023,USA,"Pua’ena Point, Haleiwa, Oahu",Bitten,Tiger Shark,Surfing


In [None]:
df_usa.Species.fillna("Not Specified", inplace=True)

df_usa.Species.value_counts().head(10)

df_usa.Species.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usa.Species.fillna("Not Specified", inplace=True)


0