### Shark attaks


In [1]:
!pip install pandas
import pandas as pd



In [2]:
!pip install numpy
import numpy as np




In [3]:
df_50_last_years = pd.read_csv("data/interim/1_interim_clean_50_years_countries_sex_fatal.csv")
df_50_last_years.head()

Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,Injury,Fatal Y/N,Species
0,28th July,2025.0,Unprovoked,Australia,South Australia,Seacliff beach Adelaide,Kayaking,Nathaniel Drummond,M,19,Kayak badly damaged no injury to person,N,Great White Shark est 3m (10ft)
1,2025-06-11 00:00:00,2025.0,Unprovoked,Usa,Florida,Boca Grande,Snorkeling,Leah Lendel,F,9,Right hand almost torn off,N,Bull shark
2,17th June,2025.0,Unprovoked,Usa,South Carolina,Beach Marker 24 Hilton Head Island,Swimming,Pre-teenage girl,F,12,Laceration to lower right leg,N,Not stated
3,14th October,2025.0,Unprovoked,Columbia,"Bolivar, del Isolate",Catagena Province,Swimming with sharks,Male child,M,14,Severe hand injury,N,Nurse shark
4,7th October,2025.0,Unprovoked,Australia,South Australia,Kangaroo Island,Surfing,Lee Berryman,M,50+,Lacerations to calf,N,Bronze whaler?


In [4]:
df = df_50_last_years

# From this point the cleaning starts

In [5]:
df['Date_raw'] = df['Date'] # keeping original Date column for potential season extraction

In [6]:
df.columns = df.columns.str.strip()
if "Date" not in df.columns:
    raise KeyError(f"'Date' column not found. Columns: {list(df.columns)}") # ensure 'Date' column exists

df["Date_raw"] = df["Date"].astype(str) # keep the raw for QA

In [7]:
s = df["Date"].astype(str).str.strip()
s = s.str.replace(r"(?<=\d)(st|nd|rd|th)", "", regex=True)
s = s.str.replace(r"\b\d{3,4}\s*hrs\b", "", regex=True, )
s = s.replace([r"^\s*(not\s*stated|unknown|undetermined|\?|nan)\s*$", ""],
              np.nan, regex=True) # normalize placeholders to NaN

In [8]:
parsed = pd.to_datetime(s, errors="coerce", dayfirst=True) # initial parse attempt

  parsed = pd.to_datetime(s, errors="coerce", dayfirst=True) # initial parse attempt


In [9]:
mname = s.str.extract(r"(?i)\b(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)\b", expand=False)
month_map = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,
             "jul":7,"aug":8,"sep":9,"sept":9,"oct":10,"nov":11,"dec":12}
mnum = mname.str.lower().map(month_map)

year_series = pd.to_numeric(df.get("Year", np.nan), errors="coerce")
mask = parsed.isna() & mnum.notna() & year_series.notna()
parsed.loc[mask] = pd.to_datetime(
    dict(year=year_series[mask].astype(int), month=mnum[mask].astype(int), day=1),
    errors="coerce"
) # construct dates where month name and year exist

In [10]:
df["Date_parsed"] = parsed  
print("Date_parsed dtype:", df["Date_parsed"].dtype) # ensure datetime64[ns]

Date_parsed dtype: datetime64[ns]


In [11]:
def month_to_season(m):
    if pd.isna(m): return None
    if m in [12,1,2]:  return "Winter"
    if m in [3,4,5]:   return "Spring"
    if m in [6,7,8]:   return "Summer"
    return "Autumn"

df["Season"] = df["Date_parsed"].dt.month.map(month_to_season)

df.loc[df["Season"].isna() & mnum.notna(), "Season"] = mnum.map(month_to_season)

df["Season"] = pd.Categorical(df["Season"],
                              categories=["Winter","Spring","Summer","Autumn"],
                              ordered=True) # ordered category (nice for sorting/plots)

In [12]:
print("\nSeason counts (incl. missing):")
print(df["Season"].value_counts(dropna=False))
display(df[["Date_raw","Date_parsed","Season"]].head(12)) # show sample results


Season counts (incl. missing):
Season
Summer    1214
Autumn     975
Spring     877
Winter     810
NaN         54
Name: count, dtype: int64


Unnamed: 0,Date_raw,Date_parsed,Season
0,28th July,NaT,
1,2025-06-11 00:00:00,2025-06-11,Summer
2,17th June,NaT,
3,14th October,NaT,
4,7th October,NaT,
5,11th October,NaT,
6,29th September,NaT,
7,27th September,NaT,
8,2025-01-02 00:00:00,2025-01-02,Winter
9,15th July,NaT,


In [None]:
df_50_last_years["Date"].unique

<bound method Series.unique of 0                  28th July
1        2025-06-11 00:00:00
2                  17th June
3               14th October
4                7th October
                ...         
3925             01-Jun-1976
3926    Reported 02-Jun-1976
3927             09-Mar-1976
3928             12-Mar-1976
3929             12-Mar-1976
Name: Date, Length: 3930, dtype: object>

In [14]:
df_50_last_years.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3930 entries, 0 to 3929
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         3930 non-null   object        
 1   Year         3930 non-null   float64       
 2   Type         3916 non-null   object        
 3   Country      3930 non-null   object        
 4   State        3768 non-null   object        
 5   Location     3758 non-null   object        
 6   Activity     3716 non-null   object        
 7   Name         3858 non-null   object        
 8   Sex          3701 non-null   object        
 9   Age          2861 non-null   object        
 10  Injury       3917 non-null   object        
 11  Fatal Y/N    3637 non-null   object        
 12  Species      2552 non-null   object        
 13  Date_raw     3930 non-null   object        
 14  Date_parsed  3876 non-null   datetime64[ns]
 15  Season       3876 non-null   category      
dtypes: cat

## Injury severity

In [15]:
# Function to classify injury severity
def classify_injury(text):
    if pd.isna(text):
        return np.nan

    text = text.lower()

    # Severe injuries
    severe_keywords = [
        "fatal", "death", "amputation", "severed", "torn off",
        "massive", "critical", "major", "severe", "life-threatening",
        "lost", "missing limb", "unconscious"
    ]

    # Medium injuries
    medium_keywords = [
        "deep", "laceration", "multiple bites", "serious", "fracture",
        "severe bleeding", "puncture", "bitten", "torn", "damaged"
    ]

    # Low injuries
    low_keywords = [
        "minor", "small", "superficial", "scratch", "abrasion",
        "bruise", "nipped", "no injury", "slight"
    ]

    if any(word in text for word in severe_keywords):
        return "Severe"
    elif any(word in text for word in medium_keywords):
        return "Medium"
    elif any(word in text for word in low_keywords):
        return "Low"
    else:
        return "Unknown"

In [16]:
df_50_last_years['Injury_Severity'] = df_50_last_years['Injury'].apply(classify_injury) # classify injury severity

In [17]:
# Display injury severity counts and sample data
print(df_50_last_years['Injury_Severity'].value_counts(dropna=False))
display(df_50_last_years[['Date', 'Activity', 'Injury', 'Injury_Severity']].head(10))

Injury_Severity
Medium     1904
Unknown     771
Severe      632
Low         610
NaN          13
Name: count, dtype: int64


Unnamed: 0,Date,Activity,Injury,Injury_Severity
0,28th July,Kayaking,Kayak badly damaged no injury to person,Medium
1,2025-06-11 00:00:00,Snorkeling,Right hand almost torn off,Severe
2,17th June,Swimming,Laceration to lower right leg,Medium
3,14th October,Swimming with sharks,Severe hand injury,Severe
4,7th October,Surfing,Lacerations to calf,Medium
5,11th October,Fishing/swimming,Serious abdonminal injuries,Medium
6,29th September,Swimming,Leg and foot injury,Unknown
7,27th September,Diving-Tagging sharks,Head face and arms,Unknown
8,2025-01-02 00:00:00,Spearfishing,Severe arm injury and delay in medical treatme...,Severe
9,15th July,Spearfishing,Lacerations to right arm,Medium
