# Exploring the data and Visualizing

# * HAPPYWHALE COMPETITION *
# 🐳🐋 WHALES AND DOLPHINS 🐬

*The goal of this project is to automate identification of individual whales and dolphins.*

______
## 📚 Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2 # work with images
import matplotlib.pyplot as plt #visualization
import seaborn as sns #visualization
import os #path finder

from skimage import io
from skimage.color import rgb2gray
import plotly.express as px

------


## 🛤 Find Path of Data

In [None]:
# Root directory of the competition dataset.
global_path = "/kaggle/input/happy-whale-and-dolphin/"
# List what the dataset directory contains (shown as the cell output).
os.listdir(global_path)

In [None]:
# Locations of the individual dataset files and image folders.
# The image-folder paths keep their trailing slash because file names are
# appended to them by plain string concatenation later on.
sample_submission_path = "/kaggle/input/happy-whale-and-dolphin/sample_submission.csv"
train_images_path = "/kaggle/input/happy-whale-and-dolphin/train_images/"
train_path = "/kaggle/input/happy-whale-and-dolphin/train.csv"
test_images_path = "/kaggle/input/happy-whale-and-dolphin/test_images/"

------


## 📩 Import Data and Initial Look 👀

In [None]:
# Load the training table and preview its first rows.
train_data = pd.read_csv(train_path)
train_data.head()
                        

💡 Let's look at what we are striving to achieve for the final submission. 

In [None]:
# Load the sample submission to see the expected output format.
sample_submission = pd.read_csv(sample_submission_path)
sample_submission.head()

💡 It looks like we need a dataframe for each image with 5 predictions, including a new label option ('new_individual').

In [None]:
# Inspect the `predictions` value of the row labelled 1.
sample_submission.predictions[1]

In [None]:
# Summarise the columns, their dtypes and non-null counts.
train_data.info()

💡 We see there are 30 unique species. However, there are also some misspellings. We will fix the misspelling errors. We also notice that 'globis' is a term for a pilot whale so we will also group those in one. 
 
💡 It is also important to note that killer whales and pilot whales (including short-finned and long-finned pilot whales) are actually a type of dolphin.

In [None]:
# Print every distinct species label as it appears in the raw data
# (before any spelling fixes are applied).
print(train_data['species'].unique())

💡 On a discussion board in Kaggle, the competition host clarifies the species (which can be found here: https://www.kaggle.com/c/happy-whale-and-dolphin/discussion/305468)

💡  "long_finned_pilot_whale and short_finned_pilot_whale are very similar species where we would expect more variation between individuals than between species. pilot_whale and globis are both short_finned_pilot_whale and thus the three can be merged"


In [None]:
# Fix misspelled species labels and merge the pilot-whale aliases.
#
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on the `train_data['species']` selection.
# An in-place update through a column selection is chained assignment:
# pandas warns about it, and with Copy-on-Write enabled it leaves
# `train_data` unchanged.
train_data['species'] = train_data['species'].replace({
    'bottlenose_dolpin': 'bottlenose_dolphin',  # missing the 'h' in dolphin
    'kiler_whale': 'killer_whale',              # missing the 'l' in killer
    'globis': 'short_finned_pilot_whale',       # correcting species names
    'pilot_whale': 'short_finned_pilot_whale',  # correcting species names
})

In [None]:
# Number of distinct species labels currently in the `species` column.
print(f'Number of Species in set: {train_data["species"].nunique()}')

💡 We see that some species have many more recordings than others. Specifically, the bottlenose dolphin has the most photos while Fraser's dolphin (labelled `frasiers_dolphin` in the data) has the fewest. 

💡 We also see that some individuals have been recorded many times (up to 400 images!) while other individuals have only been recorded once. This is important to note for later on when we are training the model.

In [None]:
# Number of rows (images) per species, most frequent first.
print('Species Counts')
print(train_data['species'].value_counts())

In [None]:
# Number of rows (images) per individual_id, most frequent first.
print("Individual ID Counts")
print(train_data['individual_id'].value_counts())

In [None]:
# Count the missing (NaN) entries in each column.
print('Number of Missing Data:')
train_data.isna().sum()

------


# 🔍 EXPLORING THE DATA 

## 📷 Visualizations

💡 We can visualize the counts of each species. 

In [None]:
# Bar chart of the number of images per species, most frequent first.
fig = plt.figure(figsize=(16, 5))
sns.countplot(
    data=train_data,
    x='species',
    order=train_data['species'].value_counts().index,
).set_title('Species Counts')
# Species names are long, so turn the tick labels vertical.
plt.xticks(rotation=90);

💡 Now let's visualize whales and dolphins and compare the spread. It is important to note that **belugas are whales** and that the following animals are actually part of the dolphin family:
1. Killer whale
2. False killer whale
3. Pygmy killer whale
4. Melon-headed whale
5. Southern Right whale
6. Short finned pilot whale
7. Long finned pilot whale

Therefore, we will create a new column and categorize the animals. Then we will visualize them.

In [None]:
# Species labels grouped by family. Only `whales` is used below: every
# species that is not listed in `whales` is labelled 'dolphin'.
# NOTE(review): 'sei_whale' and 'southern_right_whale' are listed under
# dolphins, although both are baleen whales; confirm whether this grouping
# is intentional before relying on the `family` column.
dolphins = ['melon_headed_whale', 'false_killer_whale',
            'bottlenose_dolphin', 'southern_right_whale',
            'common_dolphin', 'killer_whale', 'short_finned_pilot_whale',
            'dusky_dolphin', 'long_finned_pilot_whale', 'sei_whale',
            'spinner_dolphin', 'spotted_dolphin',
            'commersons_dolphin', 'white_sided_dolphin',
            'rough_toothed_dolphin', 'pantropic_spotted_dolphin',
            'pygmy_killer_whale', 'frasiers_dolphin']

whales = ['humpback_whale', 'beluga', 'minke_whale', 'fin_whale', 'blue_whale', 'gray_whale',
          'cuviers_beaked_whale', 'brydes_whale']

# Build the whole column in one vectorised step. The previous row-by-row
# loop wrote through `train_data.family[index] = ...`, which is chained
# assignment: it is slow, triggers SettingWithCopyWarning, and does not
# modify `train_data` when pandas Copy-on-Write is enabled.
train_data['family'] = np.where(train_data['species'].isin(whales), 'whale', 'dolphin')

In [None]:
# Same per-species bar chart as before, with bars coloured by family.
fig = plt.figure(figsize=(16, 5))
sns.countplot(
    data=train_data,
    x='species',
    hue='family',
    order=train_data['species'].value_counts().index,
).set_title('Species Counts by Family')
# Species names are long, so turn the tick labels vertical.
plt.xticks(rotation=90);

In [None]:
# Pie chart of the share of images per family.
#
# Labels and values are both taken from one value_counts() result so that
# each slice is guaranteed to carry its own family name. Pairing
# value_counts().values (sorted by count) with unique() (order of first
# appearance) can attach the labels to the wrong slices.
family_counts = (
    train_data['family']
    .value_counts()
    .rename_axis('family')
    .reset_index(name='count')
)
fig = px.pie(family_counts, values='count', names='family')
fig.show()

💡 Below we visualize the shape of the image array and the actual image. 


In [None]:
# Load and display the first training image, then report its array shape.
first_file = train_data.image[0]
# skimage.io.imread does not understand OpenCV read flags: its second
# positional parameter is `as_gray`, so cv2.IMREAD_GRAYSCALE (which equals 0)
# was silently read as as_gray=False. The flag is dropped; the image is
# loaded exactly as before.
image = io.imread(train_images_path + first_file)
plt.imshow(image)


# show array shape
print(f'Shape: {image.shape}')

In [None]:
# Show the first 25 training images in a 5x5 grid, labelled by species.
fig, ax = plt.subplots(5, 5, figsize=(10, 10))

for i, axi in enumerate(ax.flat):
    file = train_data.image[i]
    image = io.imread(train_images_path + file)
    axi.imshow(image)
    axi.set(xticks=[], yticks=[], xlabel=train_data.species[i])
    # No cv2.waitKey() here: the figure is drawn by matplotlib, there is no
    # OpenCV window to wait on, and headless OpenCV builds raise on that call.

💡 Let's make a function to transform the images. First, we open the image and convert it to grayscale. Then we increase the contrast to improve the output of the edge detector, and run the Canny edge detector on the image. When edges (i.e. the fin) are detected, we "zoom in" by cropping to that part of the image. Finally, we resize the image to 300x200 pixels for consistency across all images. 

In [None]:
#change color scheme --> black and white 
#canny edges
#change resolution (200x300)


def modify_image(df, i, width=300, height=200):
    '''
    Helper function that loads one training image and returns a grayscale,
    edge-cropped copy resized to 300x200.

    Steps: convert to a single grayscale channel, boost the contrast, run
    Canny edge detection on the boosted copy, crop the grayscale image to the
    bounding box of the detected edges, then resize and divide by 255.

    input:
    df: the dataframe whose `image` column holds the image file names
    i: label of the row in `df` whose image is processed
    width, height: the resize dimensions, 300x200 by default

    output:
    2-D float array with `height` rows and `width` columns
    '''

    image_name = df.image[i]
    image = io.imread(train_images_path + image_name)

    # if the image has colour layers, convert to one layer of black-white.
    # skimage loads colour images in RGB channel order, so the RGB conversion
    # code is required for the correct channel weights.
    if image.ndim == 3:
        grey_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    else:
        # already single-channel: use it as is (inverting it would turn
        # grayscale photos into negatives, unlike the colour ones)
        grey_image = image

    # increase contrast to better find edges
    contrast_image = cv2.convertScaleAbs(grey_image, alpha=1.3, beta=0)

    # Canny edge detection (lower hysteresis threshold first, then upper)
    canny_image = cv2.Canny(contrast_image, 100, 175)

    # zoom-in/crop image to the bounding box of the detected edges
    edge_points = np.argwhere(canny_image > 0)
    if edge_points.size:
        y1, x1 = edge_points.min(axis=0)
        y2, x2 = edge_points.max(axis=0)
        # slice ends are exclusive: +1 keeps the last edge row/column and
        # prevents an empty crop when all edges share one row or column
        cropped_image = grey_image[y1:y2 + 1, x1:x2 + 1]
    else:
        # no edges found: keep the full frame
        cropped_image = grey_image

    # resize to (width, height) and scale (normalize) by 255
    dimensions = (width, height)
    resized_image = cv2.resize(cropped_image, dimensions, interpolation=cv2.INTER_AREA) / 255

    return resized_image

In [None]:
# Try the helper on two rows; only the last call's result is shown as the
# cell output.
modify_image(train_data, 38)
modify_image(train_data, 5)

In [None]:
#change color scheme --> black and white 
#change resolution (200x300)

def modify_image_tester(df, i, width=300, height=200):
    '''
    Variant of the image helper without edge detection or cropping:
    converts one training image to grayscale, boosts its contrast and
    resizes it.

    input:
    df: the dataframe whose `image` column holds the image file names
    i: label of the row in `df` whose image is processed
    width, height: the resize dimensions, 300x200 by default

    output:
    2-D float array with `height` rows and `width` columns
    '''

    image_name = df.image[i]
    image = io.imread(train_images_path + image_name)

    # skimage loads colour images in RGB channel order, so the RGB conversion
    # code is required for the correct channel weights.
    if image.ndim == 3:
        grey_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    else:
        # already single-channel: use it as is instead of inverting it
        grey_image = image

    # increase contrast
    contrast_image = cv2.convertScaleAbs(grey_image, alpha=1.3, beta=0)

    # resize to (width, height) and scale (normalize) by 255
    dimensions = (width, height)
    resized_image = cv2.resize(contrast_image, dimensions, interpolation=cv2.INTER_AREA) / 255
    return resized_image

💡 Let's make sure the function is working properly. Great! 

In [None]:
# Show the first 16 processed images in a 4x4 grid, titled by species and
# labelled with the individual id.
fig, ax = plt.subplots(4, 4, figsize=(20, 15))

for i, axi in enumerate(ax.flat):
    image = modify_image_tester(train_data, i)
    # The processed image has a single channel; render it with a gray
    # colormap rather than matplotlib's default false-colour map.
    axi.imshow(image, cmap='gray')
    axi.set(xticks=[], yticks=[], xlabel=train_data.individual_id[i], title=train_data.species[i])
    # No cv2.waitKey() here: the figure is drawn by matplotlib, there is no
    # OpenCV window to wait on, and headless OpenCV builds raise on that call.


---