In [25]:
import seaborn as sns

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn import set_config
set_config(transform_output="pandas")

# Set random seed 
RSEED = 42

warnings.filterwarnings("ignore")

In [26]:
df = pd.read_csv("data/data.csv")

In [27]:
df.head()

Unnamed: 0,Rescue_ID,Date_TimeCaught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,Tag_1,...,Lost_Tags,T_Number,CCL_cm,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,ReleaseSite,Date_TimeRelease
0,2000_RE_0060,2000-12-22,Researcher_25,CaptureSite_0,Ocean,Net,Fisher_1072,LandingSite_CaptureSiteCategory_2,Species_6,CC00147,...,,,64.7,62.6,,Unknown,algae at rear of shell,Released,ReleaseSite_50,22/12/00
1,2001_RE_0187,2001-10-28,Researcher_6,CaptureSite_0,Ocean,Net,Fisher_520,LandingSite_CaptureSiteCategory_2,Species_6,W442,...,,,35.85,31.35,,Unknown,multiple b's on front flippers& a lot of algae growth on shall - mostly towards rear,Released,ReleaseSite_62,28/10/01
2,2001_RE_0197,2001-11-01,Researcher_6,CaptureSite_0,Ocean,Net,Fisher_1669,LandingSite_CaptureSiteCategory_2,Species_5,KE0376,...,,,51.8,49.2,,Unknown,clean,Released,ReleaseSite_50,01/11/01
3,2002_RE_0031,2002-03-11,Researcher_32,CaptureSite_0,Ocean,Net,Fisher_1798,LandingSite_CaptureSiteCategory_2,Species_6,CC00302,...,,,60.5,59.0,,Unknown,1 b 3 CS+ calcerous algae at rear end of shell+ 9/10+ 10/11 RM has chips+ 9/10 LM has chip+ Left supracaudal is broken a bit at the end+ RF flipper is 1/2 missing and LF flipper the end is mising+ 'nails' are growing at the ends. Ends of RR and LR flip a,Released,ReleaseSite_50,11/03/02
4,2002_RE_0118,2002-08-08,Researcher_25,CaptureSite_0,Ocean,Beached,Fisher_1918,LandingSite_CaptureSiteCategory_2,Species_5,NotTagged_0113,...,,,34.7,33.0,,Unknown,very lively+ right eye is hanging out + swollen+ left eye is closed + bleeding-possible from a speargun or infection or virus+ hump in 2 LLS + 2/3 CS,Released,ReleaseSite_62,08/08/02


In [28]:
variable_def = pd.read_csv("data/variable_definitions.csv",encoding='latin-1')
pd.set_option('display.max_colwidth', None)
variable_def

Unnamed: 0,Variables,Description
0,Rescue_ID,"Its an individual bycatch incidence identity number. The numbers are consecutive, for each year e.g. 2018_RE_0732 means rescue number 732 in year 2018"
1,Date_TimeCaught,Date the turtle is captured e.g. 06/01/2018
2,Researcher,Name of bycatch officer(s) involved with specific rescue
3,Capture Site,"Area where turtle was captured, as reported by the fisher."
4,Foraging Ground,General area of ocean area where turtle was captured. The assumption is that the turtle was foraging where it was captured. The foraging area is broadly classified either as the open ocean or creek section
5,Capture Method,Fishing gear or method used by fishers to capture the turtle
6,Fisher,Name of the fisher who captured the turtle
7,Landing_Site,Section of beach where turtle is landed.
8,Species,"Species of turtle (e.g. green turtle, hawksbill, loggerhead etc.)"
9,Tag_1,"Individual / unique number used to identify a turtle. Each turtle that is captured is tagged. Formats of tag numbers have been changed over years, although turtles retain their old tags even when recaptured (unless the tags are about to fall or are too tight, then they are replaced)."


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062 entries, 0 to 18061
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Rescue_ID              18062 non-null  object 
 1   Date_TimeCaught        18062 non-null  object 
 2   Researcher             18062 non-null  object 
 3   CaptureSite            18062 non-null  object 
 4   ForagingGround         18062 non-null  object 
 5   CaptureMethod          18062 non-null  object 
 6   Fisher                 18062 non-null  object 
 7   LandingSite            18062 non-null  object 
 8   Species                18062 non-null  object 
 9   Tag_1                  17937 non-null  object 
 10  Tag_2                  4911 non-null   object 
 11  Lost_Tags              925 non-null    object 
 12  T_Number               38 non-null     object 
 13  CCL_cm                 18038 non-null  float64
 14  CCW_cm                 18035 non-null  float64
 15  We

In [30]:
df.isnull().sum()

Rescue_ID                    0
Date_TimeCaught              0
Researcher                   0
CaptureSite                  0
ForagingGround               0
CaptureMethod                0
Fisher                       0
LandingSite                  0
Species                      0
Tag_1                      125
Tag_2                    13151
Lost_Tags                17137
T_Number                 18024
CCL_cm                      24
CCW_cm                      27
Weight_Kg                 5409
Sex                       4330
TurtleCharacteristics       52
Status                    3633
ReleaseSite                 75
Date_TimeRelease          6108
dtype: int64

In [31]:
# no duplicates of the rescue IDs
df.nunique()

Rescue_ID                18062
Date_TimeCaught           5237
Researcher                  35
CaptureSite                 29
ForagingGround               3
CaptureMethod               15
Fisher                    2085
LandingSite                  5
Species                      8
Tag_1                     8235
Tag_2                      246
Lost_Tags                  167
T_Number                    38
CCL_cm                    1338
CCW_cm                    1262
Weight_Kg                 1937
Sex                          4
TurtleCharacteristics    16342
Status                     439
ReleaseSite                271
Date_TimeRelease          3008
dtype: int64

## First data Cleaning

In [32]:
#replace uppercase letters with lowercase in column names
df.columns = df.columns.str.lower()

#format column names
df = df.rename({'date_timecaught':'date_caught', 'capturesite':'capture_site', 'foragingground':'foraging_ground',
                'capturemethod':'capture_method', 'landingsite':'landing_site', 'turtlecharacteristics':'turtle_characteristics',
                'releasesite':'release_site', 'date_timerelease':'date_released',},axis=1)


In [33]:
# Dropping not needed columns
df.drop(["rescue_id", "fisher", "researcher", "sex","turtle_characteristics", "tag_1", "lost_tags"], axis=1, inplace=True)

In [34]:
#convert date column to datetime type
import datetime
df['date_caught'] = pd.to_datetime(df['date_caught'])
df['date_released'] = pd.to_datetime(df['date_released'], errors='coerce')

In [35]:
# converting all entries into lower case to get rid of "Creek" and "creek"
df["foraging_ground"] = df["foraging_ground"].apply(lambda x: x.lower())

# change the types to 0 and 1, "ocean" = 1, "creek" = 0
df["foraging_ground"] = df["foraging_ground"].apply(lambda x: 1 if x == "ocean" else 0)

df['tag_2'].fillna(0, inplace=True) 

# Replacing string values in Tag_2 column (which represent a large turtle) with 1:
df['tag_2'] = df['tag_2'].replace(to_replace='.*', value=1, regex=True)

df['tag_2'].unique()
df['tag_2'].value_counts()

# replacing nan values in column t_number with 0
df['t_number'].fillna(0, inplace=True) 

In [36]:
# Replacing string values in T-Number column (which represent a visit in a rehabilitation facility ) with 1:
df['t_number'] = df['t_number'].replace(to_replace='.*', value=1, regex=True)

df['t_number'].unique()

array([0, 1])

In [37]:
#Impute NaN CCL_cm values, setting all of them as median
ccl_cm_median = df['ccl_cm'].median()
df['ccl_cm'].fillna(ccl_cm_median, inplace=True) 
df.isnull().sum()

date_caught           0
capture_site          0
foraging_ground       0
capture_method        0
landing_site          0
species               0
tag_2                 0
t_number              0
ccl_cm                0
ccw_cm               27
weight_kg          5409
status             3633
release_site         75
date_released      6573
dtype: int64

In [38]:
# change to an int to take up less memory
df["foraging_ground"].astype(bool)

0        True
1        True
2        True
3        True
4        True
         ... 
18057    True
18058    True
18059    True
18060    True
18061    True
Name: foraging_ground, Length: 18062, dtype: bool

In [39]:
df["capture_method"] = df["capture_method"].apply(lambda x: x.lower())

In [40]:
df["landing_site"].unique()

array(['LandingSite_CaptureSiteCategory_2',
       'LandingSite_CaptureSiteCategory_0',
       'LandingSite_CaptureSiteCategory_4',
       'LandingSite_CaptureSiteCategory_1',
       'LandingSite_CaptureSiteCategory_3'], dtype=object)

In [41]:
df.nunique()

date_caught        5237
capture_site         29
foraging_ground       2
capture_method       11
landing_site          5
species               8
tag_2                 2
t_number              2
ccl_cm             1338
ccw_cm             1262
weight_kg          1937
status              439
release_site        271
date_released      2900
dtype: int64

In [42]:
# One-hot encode the 'features' data using pandas.get_dummies()
categorical_features = ["capture_method", "foraging_ground", "landing_site", "species", "capture_site"]
df = pd.get_dummies(df,columns = categorical_features)
df.head()

Unnamed: 0,date_caught,tag_2,t_number,ccl_cm,ccw_cm,weight_kg,status,release_site,date_released,capture_method_beached,...,capture_site_CaptureSite_26,capture_site_CaptureSite_27,capture_site_CaptureSite_28,capture_site_CaptureSite_3,capture_site_CaptureSite_4,capture_site_CaptureSite_5,capture_site_CaptureSite_6,capture_site_CaptureSite_7,capture_site_CaptureSite_8,capture_site_CaptureSite_9
0,2000-12-22,0,0,64.7,62.6,,Released,ReleaseSite_50,2000-12-22,False,...,False,False,False,False,False,False,False,False,False,False
1,2001-10-28,0,0,35.85,31.35,,Released,ReleaseSite_62,2001-10-28,False,...,False,False,False,False,False,False,False,False,False,False
2,2001-11-01,0,0,51.8,49.2,,Released,ReleaseSite_50,2001-01-11,False,...,False,False,False,False,False,False,False,False,False,False
3,2002-03-11,0,0,60.5,59.0,,Released,ReleaseSite_50,2002-11-03,False,...,False,False,False,False,False,False,False,False,False,False
4,2002-08-08,0,0,34.7,33.0,,Released,ReleaseSite_62,2002-08-08,True,...,False,False,False,False,False,False,False,False,False,False


In [43]:
#fill ccw_cm NaNs with mean
ccw_mean = df['ccw_cm'].mean()
df['ccw_cm'].fillna(ccw_mean, inplace=True)

In [44]:
#fill weight NaNs with mode (8.5)
weight_mode = df["weight_kg"].mode()
df['weight_kg'].fillna(weight_mode, inplace=True)

In [45]:
#clean up status and convert to category
df['status'] = df['status'].astype('category')

#df['status'].fillna('none', inplace=True)

In [46]:
#fill release_site NaNs with mode and convert to category
df['release_site'] = df['release_site'].astype('category')

#df["landing_site"] = df["landing_site"].astype("category")
df['release_site'].fillna(df['release_site'].mode()[0], inplace=True)

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062 entries, 0 to 18061
Data columns (total 64 columns):
 #   Column                                          Non-Null Count  Dtype         
---  ------                                          --------------  -----         
 0   date_caught                                     18062 non-null  datetime64[ns]
 1   tag_2                                           18062 non-null  int64         
 2   t_number                                        18062 non-null  int64         
 3   ccl_cm                                          18062 non-null  float64       
 4   ccw_cm                                          18062 non-null  float64       
 5   weight_kg                                       12654 non-null  float64       
 6   status                                          14429 non-null  category      
 7   release_site                                    18062 non-null  category      
 8   date_released                                 

In [48]:
df.head()

Unnamed: 0,date_caught,tag_2,t_number,ccl_cm,ccw_cm,weight_kg,status,release_site,date_released,capture_method_beached,...,capture_site_CaptureSite_26,capture_site_CaptureSite_27,capture_site_CaptureSite_28,capture_site_CaptureSite_3,capture_site_CaptureSite_4,capture_site_CaptureSite_5,capture_site_CaptureSite_6,capture_site_CaptureSite_7,capture_site_CaptureSite_8,capture_site_CaptureSite_9
0,2000-12-22,0,0,64.7,62.6,8.5,Released,ReleaseSite_50,2000-12-22,False,...,False,False,False,False,False,False,False,False,False,False
1,2001-10-28,0,0,35.85,31.35,,Released,ReleaseSite_62,2001-10-28,False,...,False,False,False,False,False,False,False,False,False,False
2,2001-11-01,0,0,51.8,49.2,,Released,ReleaseSite_50,2001-01-11,False,...,False,False,False,False,False,False,False,False,False,False
3,2002-03-11,0,0,60.5,59.0,,Released,ReleaseSite_50,2002-11-03,False,...,False,False,False,False,False,False,False,False,False,False
4,2002-08-08,0,0,34.7,33.0,,Released,ReleaseSite_62,2002-08-08,True,...,False,False,False,False,False,False,False,False,False,False


In [49]:
df.isna().sum()

date_caught                   0
tag_2                         0
t_number                      0
ccl_cm                        0
ccw_cm                        0
                             ..
capture_site_CaptureSite_5    0
capture_site_CaptureSite_6    0
capture_site_CaptureSite_7    0
capture_site_CaptureSite_8    0
capture_site_CaptureSite_9    0
Length: 64, dtype: int64