# DEEP SEA CORALS PROJECT
***

# Goals
***

- Explore data to gather insights about coral

# Acquire
Acquiring the data from local csv file
***

In [44]:
# establishing environment
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [45]:
# importing data
df = pd.read_csv('deep_sea_corals.csv')

In [46]:
# previewing data
df.head()

Unnamed: 0,CatalogNumber,DataProvider,ScientificName,VernacularNameCategory,TaxonRank,Station,ObservationDate,latitude,longitude,DepthInMeters,DepthMethod,Locality,LocationAccuracy,SurveyID,Repository,IdentificationQualifier,EventID,SamplingEquipment,RecordType,SampleID
0,,,,,,,,degrees_north,degrees_east,,,,,,,,,,,
1,625366.0,"NOAA, Deep Sea Coral Research & Technology Pro...",Madrepora oculata,stony coral (branching),species,D2-EX1504L3-05,2015-09-02,18.30817,-158.45392,959.0,reported,"Hawaiian Archipelago, Swordfish Seamount",50m,Hohonu Moana: Exploring Deep Waters off Hawai'i,University of Hawaii,ID by expert from video,D2-EX1504L3-05,ROV,video observation,EX1504L3_05_20150901T181522Z.mp4_05:45:26:28
2,625373.0,"NOAA, Deep Sea Coral Research & Technology Pro...",Madrepora oculata,stony coral (branching),species,D2-EX1504L3-05,2015-09-01,18.30864,-158.45393,953.0,reported,"Hawaiian Archipelago, Swordfish Seamount",50m,Hohonu Moana: Exploring Deep Waters off Hawai'i,University of Hawaii,ID by expert from video,D2-EX1504L3-05,ROV,video observation,EX1504L3_05_20150901T181522Z.mp4_05:24:35:53
3,625386.0,"NOAA, Deep Sea Coral Research & Technology Pro...",Madrepora oculata,stony coral (branching),species,D2-EX1504L3-05,2015-09-01,18.30877,-158.45384,955.0,reported,"Hawaiian Archipelago, Swordfish Seamount",50m,Hohonu Moana: Exploring Deep Waters off Hawai'i,University of Hawaii,ID by expert from video,D2-EX1504L3-05,ROV,video observation,EX1504L3_05_20150901T181522Z.mp4_05:15:22:09
4,625382.0,"NOAA, Deep Sea Coral Research & Technology Pro...",Madrepora oculata,stony coral (branching),species,D2-EX1504L3-05,2015-09-01,18.30875,-158.45384,955.0,reported,"Hawaiian Archipelago, Swordfish Seamount",50m,Hohonu Moana: Exploring Deep Waters off Hawai'i,University of Hawaii,ID by expert from video,D2-EX1504L3-05,ROV,video observation,EX1504L3_05_20150901T181522Z.mp4_05:13:29:50


# Prepare
Preparing the data for exploration and modeling
***

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 513373 entries, 0 to 513372
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CatalogNumber            513372 non-null  float64
 1   DataProvider             513372 non-null  object 
 2   ScientificName           513372 non-null  object 
 3   VernacularNameCategory   513197 non-null  object 
 4   TaxonRank                513364 non-null  object 
 5   Station                  253590 non-null  object 
 6   ObservationDate          513367 non-null  object 
 7   latitude                 513373 non-null  object 
 8   longitude                513373 non-null  object 
 9   DepthInMeters            513372 non-null  float64
 10  DepthMethod              496845 non-null  object 
 11  Locality                 389645 non-null  object 
 12  LocationAccuracy         484662 non-null  object 
 13  SurveyID                 306228 non-null  object 
 14  Repo

- Drop columns that will not be used in this iteration of the project
    - CatalogNumber, SampleID, SurveyID, EventID, and Station
        - Categorical columns with vast amounts of unique values that don't offer insight to anything within the scope of this project
    - Locality
        - Column holds a very large amount of categorical values
        - Will be easier to work with if I bin the values as many appear to be near eachother but I'll save this for a later iteration of the project since it may take a significant amount of time

     
     
- Many null values
    - I'll drop them after dropping columns I don't plan to use for this first iteration of this project
        - If too many rows are lost I'll impute values to preserve more rows 
        
        
- Data types look okay for now but I'll update if needed to facilitate operations


- Rename columns 
    - all lowercase
    - "_" between words in names


- Make all values lowercase where applicable

### Dropping Columns

In [48]:
# dropping specified columns
df = df.drop(columns = ['CatalogNumber', 'SampleID', 'SurveyID', 'EventID', 'Station', 'Locality'])

### Dropping Nulls

In [49]:
# dropping all null values
df = df.dropna()

### Renaming Columns

In [51]:
# adding underscores to various column names
df.columns = ['Data_Provider', 'Scientific_Name', 'Vernacular_Name_Category', 'Taxon_Rank',
              'Observation_Date', 'latitude', 'longitude', 'Depth_Meters','Depth_Method', 
              'Location_Accuracy', 'Repository', 'Identification_Qualifier', 'Sampling_Equipment',
              'Record_Type']

# lower casing all column names
df.columns = df.columns.str.lower()

### Converting all values to lower case

In [74]:
df = df.applymap(lambda string:string.lower() if type(string) == str else string)

# Explore
Exploring the data to draw insights about the corals
***

In [75]:
df.head()

Unnamed: 0,data_provider,scientific_name,vernacular_name_category,taxon_rank,observation_date,latitude,longitude,depth_meters,depth_method,location_accuracy,repository,identification_qualifier,sampling_equipment,record_type
1,"noaa, deep sea coral research & technology pro...",madrepora oculata,stony coral (branching),species,2015-09-02,18.30817,-158.45392,959.0,reported,50m,university of hawaii,id by expert from video,rov,video observation
2,"noaa, deep sea coral research & technology pro...",madrepora oculata,stony coral (branching),species,2015-09-01,18.30864,-158.45393,953.0,reported,50m,university of hawaii,id by expert from video,rov,video observation
3,"noaa, deep sea coral research & technology pro...",madrepora oculata,stony coral (branching),species,2015-09-01,18.30877,-158.45384,955.0,reported,50m,university of hawaii,id by expert from video,rov,video observation
4,"noaa, deep sea coral research & technology pro...",madrepora oculata,stony coral (branching),species,2015-09-01,18.30875,-158.45384,955.0,reported,50m,university of hawaii,id by expert from video,rov,video observation
5,"noaa, deep sea coral research & technology pro...",madrepora oculata,stony coral (branching),species,2015-09-01,18.30902,-158.45425,968.0,reported,50m,university of hawaii,id by expert from video,rov,video observation


In [53]:
df.data_provider.value_counts()

Monterey Bay Aquarium Research Institute                                                           195361
NOAA, Alaska Fisheries Science Center                                                               74239
NOAA, Southwest Fisheries Science Center, Santa Cruz                                                43573
NOAA, Olympic Coast National Marine Sanctuary                                                       36009
Hawaii Undersea Research Laboratory                                                                 32996
NOAA, Office of Ocean Exploration and Research                                                      16053
Temple University                                                                                   10893
Harbor Branch Oceanographic Institute                                                                9474
NOAA, Southwest Fisheries Science Center, La Jolla                                                   7729
NOAA, Northwest Fisheries Science Center      

In [54]:
df.scientific_name.value_counts()

Porifera                               74411
Heteropolypus ritteri                  21395
Pennatulacea                           17464
Stylaster sp.                          14327
Hexactinellida                         10280
                                       ...  
cf. Kophobelemnidae                        1
Telestula tubaria                          1
Caryophyllia (Caryophyllia) alberti        1
Hyalonema (Coscinonema) schmidti           1
Errina laevigata                           1
Name: scientific_name, Length: 1723, dtype: int64

In [55]:
df.vernacular_name_category.value_counts()

gorgonian coral               132830
sponge (unspecified)           74411
sea pen                        63154
glass sponge                   54349
soft coral                     37769
demosponge                     35027
black coral                    21481
lace coral                     19496
stony coral (branching)        12627
stony coral (cup coral)         5599
stony coral (unspecified)       3979
gold coral                      2599
stoloniferan coral               854
calcareous sponge                766
scleromorph sponge               258
other coral-like hydrozoan       140
lithotelestid coral                1
Name: vernacular_name_category, dtype: int64

In [56]:
df.taxon_rank.value_counts()

genus         161831
species       142399
phylum         74411
order          38869
family         29036
class          13570
subfamily       3709
subspecies       510
subgenus         467
variety          243
suborder         217
subclass          74
forma              4
Name: taxon_rank, dtype: int64

In [61]:
df.observation_date.value_counts()

2008-07-11    12995
2007-06-18     6897
2007-06-19     6588
2003-10-14     6458
2008-07-13     6226
              ...  
1957-08-13        1
1908-02-07        1
1976-05-12        1
2003-05-23        1
1975-10-19        1
Name: observation_date, Length: 7819, dtype: int64

In [62]:
df.latitude.value_counts()

25.57430    605
21.29053    494
33.70941    440
23.97940    439
23.23322    431
           ... 
35.13839      1
38.02983      1
56.68111      1
36.68493      1
20.93203      1
Name: latitude, Length: 123699, dtype: int64

In [63]:
df.longitude.value_counts()

-173.50637    605
-157.55       553
-122.72358    552
-157.53542    494
-166.7359     439
             ... 
-122.08208      1
-124.91113      1
177.6637        1
-122.24217      1
-93.87384       1
Name: longitude, Length: 127739, dtype: int64

In [64]:
df.depth_meters.value_counts()

113.0     5257
112.0     4382
111.0     3497
506.0     2369
1284.0    2157
          ... 
4594.0       1
3537.0       1
3825.0       1
4745.0       1
4335.0       1
Name: depth_meters, Length: 4009, dtype: int64

In [65]:
df.depth_method.value_counts()

reported    396335
averaged     64328
assigned      4569
maximum        108
Name: depth_method, dtype: int64

In [67]:
df.location_accuracy.value_counts()

100m      266567
>1000m     92765
20m        49945
50m        32955
500m       17441
1000m       5667
Name: location_accuracy, dtype: int64

In [68]:
df.repository.value_counts()

Monterey Bay Aquarium Research Institute                                                                               195360
NOAA, Alaska Fisheries Science Center                                                                                   64762
NOAA, Olympic Coast National Marine Sanctuary                                                                           36009
NOAA, Southwest Fisheries Science Center                                                                                35762
Hawaii Undersea Research Laboratory, University of Hawaii                                                               32996
University of Hawaii                                                                                                    17299
NOAA, Southwest Fisheries Science Center, Santa Cruz                                                                    15621
Temple University                                                                                                     

In [69]:
df.identification_qualifier.value_counts()

ID by expert from video                                              139857
ID from video                                                        123765
ID by expert from image                                               44308
ID by non-expert from trawl survey                                    24436
Field ID by non-expert from video                                     22512
good                                                                  18958
ID by non-expert from video                                           17062
ID by expert from still images                                        14504
field ID by non-expert from trawl survey bycatch specimen             13522
ID by non-expert from sample                                           9460
good - ID from video                                                   8334
morphological ID by taxonomic expert                                   5570
non-expert                                                             4583
ID by expert

In [70]:
df.sampling_equipment.value_counts()

ROV               319482
submersible        66378
trawl              49354
towed camera       16039
longline            9463
AUV                 2535
drop camera         1182
dredge               692
net                   99
multiple gears        78
SCUBA                 10
grab                  10
trap                   8
pot                    5
corer                  2
other                  2
hook and line          1
Name: sampling_equipment, dtype: int64

In [72]:
df.record_type.value_counts()

video observation              362038
catch record                    41431
still image                     40535
specimen                        21149
notation                          182
literature                          4
specimen; video observation         1
Name: record_type, dtype: int64