In [35]:
#importing Libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import plotly.express as px
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score


# Reading Data from file

In [36]:
# Read the bleaching survey CSV into a dataframe. Columns 13, 15 and 24 are
# forced to str because their data types are otherwise inferred erroneously.
# 'nd' placeholders become missing values and exact duplicate rows are removed.
df = (
    pd.read_csv(
        'Projects/MarineEcoSystem/global_bleaching_environmental.csv',
        dtype={13: str, 15: str, 24: str},
    )
    .replace(to_replace='nd', value=None)
    .drop_duplicates()
)

# Exploring the Dataset

In [37]:
# Summary statistics (count, mean, std, min/max, quartiles) of the numeric columns.
df.describe()

Unnamed: 0,Site_ID,Sample_ID,Latitude_Degrees,Longitude_Degrees,Cyclone_Frequency,Date_Day,Date_Month,Date_Year
count,41361.0,41361.0,41361.0,41361.0,41361.0,41361.0,41361.0,41361.0
mean,74558.16,10128800.0,7.558085,34.966127,52.15965,16.037402,6.902686,2007.796765
std,252041.8,1373151.0,15.732185,103.404598,7.589593,7.8374,2.875063,6.073043
min,1.0,9623.0,-30.2625,-179.9743,18.31,1.0,1.0,1980.0
25%,3502.0,10311080.0,-4.9025,-78.3856,47.94,10.0,5.0,2003.0
50%,5925.0,10316280.0,10.7761,96.8433,50.92,15.0,7.0,2007.0
75%,8368.0,10321490.0,20.0505,120.8804,55.73,22.0,9.0,2013.0
max,1000060.0,10331710.0,36.75,179.9645,105.8,31.0,12.0,2020.0


In [38]:
# (rows, columns) of the loaded dataframe.
df.shape

(41361, 62)

In [39]:
# Preview the first five rows.
df.head()

Unnamed: 0,Site_ID,Sample_ID,Data_Source,Latitude_Degrees,Longitude_Degrees,Ocean_Name,Reef_ID,Realm_Name,Ecoregion_Name,Country_Name,...,TSA_FrequencyMax,TSA_FrequencyMean,TSA_DHW,TSA_DHW_Standard_Deviation,TSA_DHWMax,TSA_DHWMean,Date,Site_Comments,Sample_Comments,Bleaching_Comments
0,2501,10324336,Donner,23.163,-82.526,Atlantic,,Tropical Atlantic,Cuba and Cayman Islands,Cuba,...,5,0,0.0,0.74,7.25,0.18,2005-09-15,,,
1,3467,10324754,Donner,-17.575,-149.7833,Pacific,,Eastern Indo-Pacific,Society Islands French Polynesia,French Polynesia,...,4,0,0.26,0.67,4.65,0.19,1991-03-15,The bleaching does not appear to have gained ...,The bleaching does not appear to have gained ...,
2,1794,10323866,Donner,18.369,-64.564,Atlantic,,Tropical Atlantic,Hispaniola Puerto Rico and Lesser Antilles,United Kingdom,...,7,0,0.0,1.04,11.66,0.26,2006-01-15,,,
3,8647,10328028,Donner,17.76,-64.568,Atlantic,,Tropical Atlantic,Hispaniola Puerto Rico and Lesser Antilles,United States,...,4,0,0.0,0.75,5.64,0.2,2006-04-15,,,
4,8648,10328029,Donner,17.769,-64.583,Atlantic,,Tropical Atlantic,Hispaniola Puerto Rico and Lesser Antilles,United States,...,5,0,0.0,0.92,6.89,0.25,2006-04-15,,,


In [40]:
# Preview the last five rows.
df.tail()

Unnamed: 0,Site_ID,Sample_ID,Data_Source,Latitude_Degrees,Longitude_Degrees,Ocean_Name,Reef_ID,Realm_Name,Ecoregion_Name,Country_Name,...,TSA_FrequencyMax,TSA_FrequencyMean,TSA_DHW,TSA_DHW_Standard_Deviation,TSA_DHWMax,TSA_DHWMean,Date,Site_Comments,Sample_Comments,Bleaching_Comments
41356,15446,10310562,Reef_Check,-8.3651,116.0844,Pacific,116.5.3.9E.8.21.54.4S,Central Indo-Pacific,Lesser Sunda Islands and Savu Sea,Indonesia,...,8,1,2.09,1.49,9.0,0.63,2019-05-28,,,
41357,15456,10310527,Reef_Check,-8.3473,116.0503,Pacific,116.3.1.1E.8.20.50.2S,Central Indo-Pacific,Lesser Sunda Islands and Savu Sea,Indonesia,...,8,1,2.0,1.29,8.01,0.65,2019-05-16,,,
41358,15456,10310527,Reef_Check,-8.3473,116.0503,Pacific,116.3.1.1E.8.20.50.2S,Central Indo-Pacific,Lesser Sunda Islands and Savu Sea,Indonesia,...,8,1,2.0,1.29,8.01,0.65,2019-05-16,,,
41359,15457,10310536,Reef_Check,-8.3445,116.0629,Pacific,116.3.46.548E.8.20.40.236S,Central Indo-Pacific,Lesser Sunda Islands and Savu Sea,Indonesia,...,8,1,2.0,1.29,8.01,0.65,2019-05-29,,,
41360,15457,10310536,Reef_Check,-8.3445,116.0629,Pacific,116.3.46.548E.8.20.40.236S,Central Indo-Pacific,Lesser Sunda Islands and Savu Sea,Indonesia,...,8,1,2.0,1.29,8.01,0.65,2019-05-29,,,


In [41]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41361 entries, 0 to 41360
Data columns (total 62 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Site_ID                                41361 non-null  int64  
 1   Sample_ID                              41361 non-null  int64  
 2   Data_Source                            41361 non-null  object 
 3   Latitude_Degrees                       41361 non-null  float64
 4   Longitude_Degrees                      41361 non-null  float64
 5   Ocean_Name                             41361 non-null  object 
 6   Reef_ID                                28821 non-null  object 
 7   Realm_Name                             41361 non-null  object 
 8   Ecoregion_Name                         41358 non-null  object 
 9   Country_Name                           41360 non-null  object 
 10  State_Island_Province_Name             41262 non-null  object 
 11  Ci

In [42]:
# Full list of column names.
df.columns

Index(['Site_ID', 'Sample_ID', 'Data_Source', 'Latitude_Degrees',
       'Longitude_Degrees', 'Ocean_Name', 'Reef_ID', 'Realm_Name',
       'Ecoregion_Name', 'Country_Name', 'State_Island_Province_Name',
       'City_Town_Name', 'Site_Name', 'Distance_to_Shore', 'Exposure',
       'Turbidity', 'Cyclone_Frequency', 'Date_Day', 'Date_Month', 'Date_Year',
       'Depth_m', 'Substrate_Name', 'Percent_Cover', 'Bleaching_Level',
       'Percent_Bleaching', 'ClimSST', 'Temperature_Kelvin',
       'Temperature_Mean', 'Temperature_Minimum', 'Temperature_Maximum',
       'Temperature_Kelvin_Standard_Deviation', 'Windspeed', 'SSTA',
       'SSTA_Standard_Deviation', 'SSTA_Mean', 'SSTA_Minimum', 'SSTA_Maximum',
       'SSTA_Frequency', 'SSTA_Frequency_Standard_Deviation',
       'SSTA_FrequencyMax', 'SSTA_FrequencyMean', 'SSTA_DHW',
       'SSTA_DHW_Standard_Deviation', 'SSTA_DHWMax', 'SSTA_DHWMean', 'TSA',
       'TSA_Standard_Deviation', 'TSA_Minimum', 'TSA_Maximum', 'TSA_Mean',
       'TSA_Freq

---
# Data Preprocessing


In [43]:
# Remove the columns that are not needed further on (useless data or too many
# missing values).

drop_columns = [
    'Reef_ID', 'Data_Source', 'Site_ID', 'Site_Name',
    'Site_Comments', 'Sample_Comments', 'Bleaching_Comments', 'Bleaching_Level',
    'Date_Day', 'Date_Month', 'Date_Year',
    'Country_Name', 'State_Island_Province_Name',
    'Temperature_Mean', 'Temperature_Minimum', 'Temperature_Maximum',
    'Temperature_Kelvin_Standard_Deviation',
    'SSTA_Standard_Deviation', 'SSTA_Mean', 'SSTA_Minimum', 'SSTA_Maximum',
    'SSTA_Frequency', 'SSTA_Frequency_Standard_Deviation',
    'SSTA_FrequencyMax', 'SSTA_FrequencyMean',
    'SSTA_DHW', 'SSTA_DHW_Standard_Deviation', 'SSTA_DHWMax', 'SSTA_DHWMean',
    'TSA_Standard_Deviation', 'TSA_Minimum', 'TSA_Maximum', 'TSA_Mean',
    'TSA_Frequency', 'TSA_Frequency_Standard_Deviation',
    'TSA_FrequencyMax', 'TSA_FrequencyMean',
    'TSA_DHW', 'TSA_DHW_Standard_Deviation', 'TSA_DHWMax', 'TSA_DHWMean',
    'Realm_Name', 'Ecoregion_Name', 'City_Town_Name', 'Substrate_Name',
    'Percent_Cover', 'Depth_m', 'Sample_ID', 'SSTA', 'TSA', 'Windspeed',
]
# errors='ignore' skips names that are not (or no longer) present, so the cell
# can be re-run without raising.
df.drop(columns=drop_columns, errors='ignore', inplace=True)

## Checking for missing values

In [44]:
# Number of missing values in each remaining column.
df.isna().sum()

Latitude_Degrees         0
Longitude_Degrees        0
Ocean_Name               0
Distance_to_Shore        2
Exposure                 0
Turbidity                6
Cyclone_Frequency        0
Percent_Bleaching     6846
ClimSST                113
Temperature_Kelvin     148
Date                     0
dtype: int64

In [45]:
# Discard every row that is missing a value in any of the listed columns.
loc_columns = ['Ocean_Name', 'Percent_Bleaching']
has_required_values = df[loc_columns].notna().all(axis=1)
# Label-based drop; relies on the index labels being unique.
df.drop(index=df.index[~has_required_values], inplace=True)

In [46]:
# Re-check missing values; Ocean_Name and Percent_Bleaching should now be at zero.
df.isna().sum()

Latitude_Degrees        0
Longitude_Degrees       0
Ocean_Name              0
Distance_to_Shore       2
Exposure                0
Turbidity               6
Cyclone_Frequency       0
Percent_Bleaching       0
ClimSST                95
Temperature_Kelvin    122
Date                    0
dtype: int64

In [47]:
# Impute the remaining gaps from neighbouring observations of the same ocean.
# Rows are ordered by Date first so "neighbouring" means closest in time.
# NOTE(review): Date is still a string column at this point, so the ordering is
# lexicographic; that matches chronological order only for ISO 'YYYY-MM-DD'
# values like the ones shown in the preview — confirm for the full file.
df = df.sort_values('Date')
fill_columns = ['Ocean_Name', 'ClimSST', 'Temperature_Kelvin', 'Turbidity']

# The grouping key has no gaps left (rows without Ocean_Name were dropped
# earlier), so only the value columns need filling.
impute_columns = [column for column in fill_columns if column != 'Ocean_Name']

# GroupBy.ffill()/bfill() replace the deprecated fillna(method=...) and return a
# frame aligned on the original index, so no apply/reset_index is required.
# Forward fill runs first; the backward fill only covers gaps at the start of a
# group that have no earlier observation to copy from.
df[impute_columns] = df.groupby('Ocean_Name')[impute_columns].ffill()
df[impute_columns] = df.groupby('Ocean_Name')[impute_columns].bfill()
df.isna().sum()


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



Latitude_Degrees      0
Longitude_Degrees     0
Ocean_Name            0
Distance_to_Shore     2
Exposure              0
Turbidity             0
Cyclone_Frequency     0
Percent_Bleaching     0
ClimSST               0
Temperature_Kelvin    0
Date                  0
dtype: int64

In [48]:
# Convert Distance_to_Shore to numbers (unparseable entries become NaN) and
# replace every missing entry with the mean of the parsed values.
distance_to_shore = pd.to_numeric(df['Distance_to_Shore'], errors='coerce')
df['Distance_to_Shore'] = distance_to_shore.fillna(value=distance_to_shore.mean())

In [49]:
# Final check of missing values per column after imputation.
df.isna().sum()

Latitude_Degrees      0
Longitude_Degrees     0
Ocean_Name            0
Distance_to_Shore     0
Exposure              0
Turbidity             0
Cyclone_Frequency     0
Percent_Bleaching     0
ClimSST               0
Temperature_Kelvin    0
Date                  0
dtype: int64

**All null values removed**

---

In [50]:
# Inspect column dtypes to see which columns still need conversion.
df.dtypes

Latitude_Degrees      float64
Longitude_Degrees     float64
Ocean_Name             object
Distance_to_Shore     float64
Exposure               object
Turbidity              object
Cyclone_Frequency     float64
Percent_Bleaching      object
ClimSST                object
Temperature_Kelvin     object
Date                   object
dtype: object

---

**Converting the categorical values into numerical representations**

In [51]:
# Encoder used in the next cell to turn category strings into integer codes.
le = LabelEncoder()

In [52]:
# Replace each categorical column with integer codes.
cat_columns = ['Ocean_Name', 'Exposure']

# `le` is refit on every column, so after the loop its `classes_` describe only
# the last column encoded. Snapshot the classes per column so that integer codes
# (e.g. the rows/columns of a confusion matrix on Ocean_Name) can be mapped back
# to their labels: label_classes[column][code] is the original value.
label_classes = {}
for column in cat_columns:
    df[column] = le.fit_transform(df[column])
    label_classes[column] = list(le.classes_)

In [53]:
# Parse the columns that were read as text into floats in one column-wise pass.
# Entries that cannot be parsed are coerced to NaN rather than raising.
object_columns = ['ClimSST', 'Temperature_Kelvin', 'Turbidity', 'Percent_Bleaching']
df[object_columns] = df[object_columns].apply(pd.to_numeric, errors='coerce')

In [54]:
# Parse the Date strings into datetime values.
df['Date'] = pd.to_datetime(df['Date'])

In [55]:
# Confirm the conversions: no object-typed columns should remain.
df.dtypes

Latitude_Degrees             float64
Longitude_Degrees            float64
Ocean_Name                     int64
Distance_to_Shore            float64
Exposure                       int64
Turbidity                    float64
Cyclone_Frequency            float64
Percent_Bleaching            float64
ClimSST                      float64
Temperature_Kelvin           float64
Date                  datetime64[ns]
dtype: object

In [56]:
# Preview the cleaned frame (rows are ordered by Date after the earlier sort).
df.head()

Unnamed: 0,Latitude_Degrees,Longitude_Degrees,Ocean_Name,Distance_to_Shore,Exposure,Turbidity,Cyclone_Frequency,Percent_Bleaching,ClimSST,Temperature_Kelvin,Date
1402,-27.1667,-109.3333,3,1579.03,0,0.0214,50.08,75.0,294.65,303.27,1980-06-15
3748,-27.1,-109.3,3,442.79,0,0.0219,50.08,30.5,262.15,303.27,1980-06-15
1736,7.8128,-81.7597,3,8.27,1,0.0537,50.0,75.0,302.02,303.27,1983-01-15
1737,7.855,-82.0125,3,9816.94,0,0.0672,49.89,75.0,302.17,303.19,1983-01-15
1415,5.55,-87.06,3,91.97,0,0.0404,49.74,75.0,300.97,302.82,1983-01-15


---

**Applying Z-Score Normalization**

In [57]:
# Standardise each listed column to zero mean and unit sample standard deviation.
# NOTE(review): the mean/std are taken over the whole frame, i.e. before the
# train/test split performed later in the notebook.
norm_columns = ['Distance_to_Shore', 'Cyclone_Frequency']
for column in norm_columns:
    values = df[column]
    df[column] = values.sub(values.mean()).div(values.std())

---

**Preparing the data for Machine learning modeling**

In [58]:
# Target is the ocean label; features are every other column except the date
# and the raw coordinates.
y = df['Ocean_Name']
X = df.drop(
    columns=['Ocean_Name', 'Date', 'Latitude_Degrees', 'Longitude_Degrees'],
    errors='ignore',
)

---

**Splitting the dataset into Training and Testing sets**

In [59]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

---
**Train the machine learning model using the Training data**

In [60]:
# Decision tree classifier with a fixed random_state for reproducible fits.
clf = DecisionTreeClassifier(random_state=42)

In [61]:
# Fit the tree on the training split.
clf.fit(X_train, y_train)

In [62]:
# Predict the target (y_pred) for the held-out features (X_test) with the fitted model (clf).
y_pred = clf.predict(X_test)

---
**Evaluation metrics of the machine learning model:**

In [63]:
# Share of test rows whose predicted label matches the true one, shown as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}")

Accuracy: 96.83


---

In [64]:
# Rows are true classes and columns are predicted classes, both as integer codes.
conf_matrix = confusion_matrix(y_test, y_pred)
# sep="\n" reproduces the blank line between the heading and the matrix.
print("Confusion Matrix:\n", conf_matrix, sep="\n")

Confusion Matrix:

[[  73    0    0    3    0]
 [   0 2557    7   88    3]
 [   3    6  449   15    0]
 [   1   67   21 3417    0]
 [   0    5    0    0  188]]


---

In [65]:
# Per-class precision averaged with class-frequency weights.
precision = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(f"Precision: {precision:.2f}")

Precision: 0.97


---

In [66]:
# Per-class recall averaged with class-frequency weights.
recall = recall_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(f"Recall: {recall:.2f}")

Recall: 0.97


---

 # Density Map Visualization of Percent Bleaching Events

In [68]:
# Density map of Percent_Bleaching at each survey location, drawn on an
# OpenStreetMap base layer and centred on longitude 180.
fig = px.density_mapbox(
    df,
    lat='Latitude_Degrees',
    lon='Longitude_Degrees',
    z='Percent_Bleaching',
    radius=10,
    center={"lat": 0, "lon": 180},
    zoom=0,
    mapbox_style="open-street-map",
)

# One layout update: keep the base-map style and remove the outer margins so the
# map fills the output area.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)

fig.show()