In [1]:
import pandas as pd
url = 'https://www.sharkattackfile.net/spreadsheets/GSAF5.xls'
pd.set_option('display.max_columns', None)
df = pd.read_excel(url)

# Duplicated values - None

In [2]:
print(df.duplicated().sum()) # checking for duplicates -> 0

24


# Deleting columns that are not useful

In [3]:
df. drop(['href formula','Unnamed: 11','href', 'Case Number', 'Case Number.1',
       'original order', 'Unnamed: 21', 'Unnamed: 22'], axis=1,inplace=True)

In [4]:
df # state of dataframe

Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,Injury,Time,Species,Source,pdf
0,14 Feb-2024,2024.0,Unprovoked,INDIA,Maharashtra,"Vaitarna River, Palghar District",Fishing,Vicky Suresh Govari,M,32,Calf of lower left leg injured,,"Bull shark, 7'","Times of India, 2/14/2024",
1,04-Feb-2024,2024.0,Provoked,TRINIDAD,,,Spearfishing,male,M,,Shoulder bitten,06h30,Blacktip reef shark,"Trinidad Guardian, 2/11/2014",
2,29 Jan-2024,2024.0,Unprovoked,AUSTRALIA,New South Wales,"Elizabeth Bay, Sydney Harbor",Swimming,Lauren O'Neill,F,29,Right leg bitten,20h00,Bull shark,"Nine News, 12/31/2024",
3,15 Jan-2024,2024.0,Unprovoked,BAHAMAS,Paradise Island,Paradise Island Resort,Swimming,male,M,10,Right leg injured,16h00,,"Caribbean Loop News, 1/15;2024",
4,09-Jan-2024,2024.0,Unprovoked,AUSTRALIA,South Australia,"Walkers Beach, Elliston",Surfing,Murray Adams,M,64,Leg bitten,13h00,White shark,"A. Currie, GSAF",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6960,,,,,,,,,,,,,,,
6961,,,,,,,,,,,,,,,
6962,,,,,,,,,,,,,,,
6963,,,,,,,,,,,,,,,


# Deleting null values

In [5]:
print(df.isnull().sum()) # checking how many null values to each column

Date          25
Year          27
Type          43
Country       75
State        507
Location     590
Activity     610
Name         244
Sex          604
Age         3018
Injury        60
Time        3550
Species     3157
Source        44
pdf          166
dtype: int64


In [6]:
print(f"The number of rows before the dropna are: {df.shape[0]}.") # 6965 rows

The number of rows before the dropna are: 6965.


In [7]:
rows_with_all_null = df[df.isnull().all(axis=1)]
print(f"{rows_with_all_null.shape[0]} rows with all null values were found:")
print(rows_with_all_null)
# it found 25 rows

25 rows with all null values were found:
     Date  Year Type Country State Location Activity Name  Sex  Age Injury  \
6940  NaN   NaN  NaN     NaN   NaN      NaN      NaN  NaN  NaN  NaN    NaN   
6941  NaN   NaN  NaN     NaN   NaN      NaN      NaN  NaN  NaN  NaN    NaN   
6942  NaN   NaN  NaN     NaN   NaN      NaN      NaN  NaN  NaN  NaN    NaN   
6943  NaN   NaN  NaN     NaN   NaN      NaN      NaN  NaN  NaN  NaN    NaN   
6944  NaN   NaN  NaN     NaN   NaN      NaN      NaN  NaN  NaN  NaN    NaN   
6945  NaN   NaN  NaN     NaN   NaN      NaN      NaN  NaN  NaN  NaN    NaN   
6946  NaN   NaN  NaN     NaN   NaN      NaN      NaN  NaN  NaN  NaN    NaN   
6947  NaN   NaN  NaN     NaN   NaN      NaN      NaN  NaN  NaN  NaN    NaN   
6948  NaN   NaN  NaN     NaN   NaN      NaN      NaN  NaN  NaN  NaN    NaN   
6949  NaN   NaN  NaN     NaN   NaN      NaN      NaN  NaN  NaN  NaN    NaN   
6950  NaN   NaN  NaN     NaN   NaN      NaN      NaN  NaN  NaN  NaN    NaN   
6951  NaN   NaN  NaN   

In [8]:
df.dropna(how='all', inplace=True)

In [9]:
print(f"The number of rows after the dropna are: {df.shape[0]}.") # 6940 rows -> difference: 25, correct

The number of rows after the dropna are: 6940.


# Date

##### -> Date needs to be cleaned!

In [10]:
df["Date"]

0       14 Feb-2024
1       04-Feb-2024
2       29 Jan-2024
3       15 Jan-2024
4       09-Jan-2024
           ...     
6935    Before 1903
6936    Before 1903
6937      1900-1905
6938      1883-1889
6939      1845-1853
Name: Date, Length: 6940, dtype: object

In [11]:
df["Date"].head(30) # many different formats for date

0              14 Feb-2024
1              04-Feb-2024
2              29 Jan-2024
3              15 Jan-2024
4              09-Jan-2024
5              05-Jan-2024
6              30 Dec-2023
7              29 Dec-2023
8              28 Dec-2023
9              25 Dec-2023
10             24-Dec-2023
11             18 Dec-2023
12            14  Dec-2023
13            14  Dec-2023
14             08 Dec-2023
15             04 Dec-2023
16             02 Dec-2023
17             30 Nov-2023
18             21 Nov-2023
19             10 Nov-2023
20             05 Nov-2023
21             03 Nov-2023
22             02 Nov-2023
23    Reported 02 Nov-2023
24             31 Oct-2023
25             25 Oct 2023
26             23 Oct-2023
27             21 Oct-2023
28             20 Oct 2023
29             15 Oct 2023
Name: Date, dtype: object

##### Month check

In [12]:
df['Month'] = df['Date'].str.replace("Between", " ").str.replace("Fall", " ").str.replace("fall", " ").str.replace("summer", " ").str.replace("Early", " ").str.replace("Reported", " ").str.replace("Before", " ").str.replace("Summer", " ").str.replace("Late", " ").str.extract(r'([A-Za-z]{3})')

In [13]:
# I ignored Reported, cause I assume that it doesn't make much difference
# usually incidents will be reported max a few days after, not months after

In [14]:
'''
a = df[df["Month"] == "Rep"]
a # here I see that the Rep is actually for reported, there were 500+ entries with Reported
'''

'\na = df[df["Month"] == "Rep"]\na # here I see that the Rep is actually for reported, there were 500+ entries with Reported\n'

In [15]:
df['Month'].value_counts() # July and August are the most active months

Month
Jul    771
Aug    661
Sep    606
Jan    561
Jun    545
Oct    498
Apr    492
Dec    489
Mar    454
Nov    449
May    448
Feb    415
dat     26
Mid      5
Cir      5
Win      2
Wor      2
sam      2
Som      2
Dur      2
Rep      2
Let      1
Sai      1
the      1
Nox      1
Aft      1
Pri      1
mid      1
Ann      1
and      1
soo      1
hav      1
Woi      1
Las      1
few      1
Name: count, dtype: int64

# df - DISPLAY




In [16]:
df

Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,Injury,Time,Species,Source,pdf,Month
0,14 Feb-2024,2024.0,Unprovoked,INDIA,Maharashtra,"Vaitarna River, Palghar District",Fishing,Vicky Suresh Govari,M,32,Calf of lower left leg injured,,"Bull shark, 7'","Times of India, 2/14/2024",,Feb
1,04-Feb-2024,2024.0,Provoked,TRINIDAD,,,Spearfishing,male,M,,Shoulder bitten,06h30,Blacktip reef shark,"Trinidad Guardian, 2/11/2014",,Feb
2,29 Jan-2024,2024.0,Unprovoked,AUSTRALIA,New South Wales,"Elizabeth Bay, Sydney Harbor",Swimming,Lauren O'Neill,F,29,Right leg bitten,20h00,Bull shark,"Nine News, 12/31/2024",,Jan
3,15 Jan-2024,2024.0,Unprovoked,BAHAMAS,Paradise Island,Paradise Island Resort,Swimming,male,M,10,Right leg injured,16h00,,"Caribbean Loop News, 1/15;2024",,Jan
4,09-Jan-2024,2024.0,Unprovoked,AUSTRALIA,South Australia,"Walkers Beach, Elliston",Surfing,Murray Adams,M,64,Leg bitten,13h00,White shark,"A. Currie, GSAF",,Jan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6935,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,FATAL,,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,
6936,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,,FATAL,,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,
6937,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,FATAL,,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,
6938,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,,FATAL,,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,


In [17]:
df.columns

Index(['Date', 'Year', 'Type', 'Country', 'State', 'Location', 'Activity',
       'Name', 'Sex', 'Age', 'Injury', 'Time', 'Species ', 'Source', 'pdf',
       'Month'],
      dtype='object')

In [18]:
df.isna().sum(axis=1)

0       2
1       4
2       1
3       2
4       1
       ..
6935    4
6936    5
6937    4
6938    6
6939    3
Length: 6940, dtype: int64

In [19]:
#Lose PDF column
#Lose State
#Handle all the null
#Time and age replace by avg
#Delete rows with a majority of nulls
#Check for duplicates
#Format Data

In [20]:
df.isna().sum()

Date           0
Year           2
Type          18
Country       50
State        482
Location     565
Activity     585
Name         219
Sex          579
Age         2993
Injury        35
Time        3525
Species     3132
Source        19
pdf          141
Month        489
dtype: int64

# Data Cleaning

In [None]:
#1. Lose PDF, source, location columns✅
#2. Handle all the null
#Time and age replace by avg✅
#Delete rows with a majority of nulls
#4.Check for duplicates
#3.Format Data


In [21]:
#1. Lose PDF, source, location columns

columns_to_drop = ['Location', 'pdf', 'Source']
df.drop(columns_to_drop, axis=1, inplace=True)

In [27]:
#2. Handle all the null
df.isnull().sum()
df.isnull().any()
df.isnull().sum(axis=1)
df.dropna(how='all', inplace=True) #drop all rows with all of the 13 values null (-24 rows)

# Convert 'Age' column to numeric type to ensure all values are numeric
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
# Replace null values in the 'Age' column with the average age
df['Age'].fillna(df['Age'].mean(), inplace=True)

df['Time'] = pd.to_numeric(df['Time'], errors='coerce')
df['Time'].fillna(df['Time'].mean(), inplace=True)
df.dropna(subset=['Country'], inplace=True)
#df.dropna(thresh=10)#with 13 we have 3054 rows, 12 >> 5773, 11 >> 6555

In [28]:
df.isnull().sum()

Date           0
Year           2
Type          18
Country        0
State        447
Activity     578
Name         215
Sex          575
Age            0
Injury        33
Time           0
Species     3094
Month        478
dtype: int64

In [29]:
df

Unnamed: 0,Date,Year,Type,Country,State,Activity,Name,Sex,Age,Injury,Time,Species,Month
0,14 Feb-2024,2024.0,Unprovoked,INDIA,Maharashtra,Fishing,Vicky Suresh Govari,M,32.000000,Calf of lower left leg injured,1249.285714,"Bull shark, 7'",Feb
1,04-Feb-2024,2024.0,Provoked,TRINIDAD,,Spearfishing,male,M,28.046658,Shoulder bitten,1249.285714,Blacktip reef shark,Feb
2,29 Jan-2024,2024.0,Unprovoked,AUSTRALIA,New South Wales,Swimming,Lauren O'Neill,F,29.000000,Right leg bitten,1249.285714,Bull shark,Jan
3,15 Jan-2024,2024.0,Unprovoked,BAHAMAS,Paradise Island,Swimming,male,M,10.000000,Right leg injured,1249.285714,,Jan
4,09-Jan-2024,2024.0,Unprovoked,AUSTRALIA,South Australia,Surfing,Murray Adams,M,64.000000,Leg bitten,1249.285714,White shark,Jan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6935,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Diving,male,M,28.046658,FATAL,1249.285714,,
6936,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Pearl diving,Ahmun,M,28.046658,FATAL,1249.285714,,
6937,1900-1905,0.0,Unprovoked,USA,North Carolina,Swimming,Coast Guard personnel,M,28.046658,FATAL,1249.285714,,
6938,1883-1889,0.0,Unprovoked,PANAMA,,,Jules Patterson,M,28.046658,FATAL,1249.285714,,
