In [108]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import os
import cv2
import random
import pickle

In [109]:
## Import and Convert images

# Root folder that holds train.csv and the train/ image directory.
# It can be overridden with the WHALE_PROJECT_DIR environment variable so the
# notebook also runs on other machines; the default is the original location.
PROJECT_DIR = os.environ.get(
    "WHALE_PROJECT_DIR",
    "C:/Users/galan/Desktop/Stanford Classes/Quarter 9/Deep Learning/Project",
)

data = []                                         # Filled by create_data() with [image, label] pairs
labels = pd.read_csv(PROJECT_DIR + "/train.csv")
labels["Id"] = labels["Id"].astype('category')
labels["Id_cat"] = labels["Id"].cat.codes         # Convert Id's to Numbers
# Select the numeric Id column by name: a positional index (columns[2]) would
# silently pick a different column if train.csv ever gained or lost a column.
id_label = labels["Id_cat"].tolist()              # Create a list with whale Id's
path = PROJECT_DIR + "/train"                     # Folder with Whale Images
img_size = 200                                    # Size that we will use to reshape images

def create_data(image_dir=None, size=None):
    """Load the labelled training images as square grayscale arrays.

    Every usable image is appended to the module-level ``data`` list as
    ``[image_array, label]``, where ``label`` is the numeric Id taken from the
    row of ``labels`` that names that image file.

    Labels are looked up by file name instead of by a running counter:
    ``os.listdir`` returns entries in arbitrary order, so pairing the i-th
    listed file with the i-th CSV row is only right by coincidence.

    Parameters
    ----------
    image_dir : str, optional
        Folder to read images from. Defaults to the module-level ``path``.
    size : int, optional
        Width and height of the resized images. Defaults to ``img_size``.

    Raises
    ------
    ValueError
        If no file in ``image_dir`` could be loaded and matched to a label.
    """
    image_dir = path if image_dir is None else image_dir
    size = img_size if size is None else size

    # NOTE(review): assumes the first column of train.csv holds the image file
    # names - confirm against the CSV header.
    label_by_file = dict(zip(labels.iloc[:, 0].tolist(), id_label))

    loaded = 0
    skipped = []
    for img in sorted(os.listdir(image_dir)):          # sorted -> deterministic load order
        if img not in label_by_file:                   # stray file with no CSV row
            skipped.append(img)
            continue
        img_array = cv2.imread(os.path.join(image_dir, img), cv2.IMREAD_GRAYSCALE)  # import each image as grayscale
        if img_array is None:                          # cv2.imread returns None instead of raising
            skipped.append(img)
            continue
        new_array = cv2.resize(img_array, (size, size))  # convert images to desired size
        data.append([new_array, label_by_file[img]])     # append image and its own Id label
        loaded += 1

    if loaded == 0:
        raise ValueError("No labelled, readable images found in %r" % (image_dir,))
    if skipped:
        print("Skipped %d unlabelled or unreadable file(s), e.g. %s" % (len(skipped), skipped[:5]))


create_data()
    

In [110]:
## Split data for Training and Test Sets

SEED = 42              # Fixed seed so the split (and the pickles saved from it) can be reproduced
percentage = 0.8       # Decide percentage of Training set

# A dedicated generator keeps the global `random` state untouched.
# The shuffle is still in place, so re-run the loading cell before this one
# to get exactly the same split again.
random.Random(SEED).shuffle(data)   # Shuffle Data

train_number = int(round(percentage*len(data)))   # Number of samples in the Training set
train_data = data[0:train_number]
test_data = data[train_number:]

In [111]:
## Create Training and Test Sets

# Every entry of train_data / test_data is an [image, label] pair:
# position 1 is the label, position 0 the image array.
Y_tr = [pair[1] for pair in train_data]
Y_te = [pair[1] for pair in test_data]

# Stack the images and add a trailing channel axis of size 1 for the NN.
image_shape = (-1, img_size, img_size, 1)
X_train = np.array([pair[0] for pair in train_data]).reshape(image_shape)
X_test = np.array([pair[0] for pair in test_data]).reshape(image_shape)


In [113]:
## Make Label Sets Numpy Arrays of ones and zeros

num_classes = max(id_label) + 1   # one column per numeric Id, 0..max


def _one_hot(codes, width):
    """Return a (len(codes), width) array of zeros with a 1 in each row at that row's code."""
    encoded = np.zeros((len(codes), width))
    for row, code in enumerate(codes):
        encoded[row, code] = 1
    return encoded


Y_train = _one_hot(Y_tr, num_classes)
Y_test = _one_hot(Y_te, num_classes)

## Convert Unique IDs to New Whale labels

# Work on both subsets at once so the per-Id counts cover every image.
Y = np.concatenate((Y_train, Y_test), axis=0)

Suma = Y.sum(axis=0)                           # number of images per Id
for rare_col in np.flatnonzero(Suma < 2):      # Ids seen fewer than twice
    rare_rows = np.flatnonzero(Y[:, rare_col] == 1)
    Y[rare_rows, rare_col] = 0                 # drop the rare Id ...
    Y[rare_rows, 0] = 1                        # ... and move those rows to column 0

## Create the final version of Training and Test Label numpy arrays

# The first train_number rows of Y are the training rows, the rest the test rows.
Y_train, Y_test = Y[:train_number, :], Y[train_number:, :]

In [143]:
## Save Sets into files to save time next time!

def _save_pickle(obj, filename):
    """Pickle ``obj`` into ``filename`` in the current working directory.

    The ``with`` block closes the file even when ``pickle.dump`` raises, which
    the earlier open / dump / close sequence did not guarantee.
    """
    with open(filename, "wb") as pickle_out:
        pickle.dump(obj, pickle_out)


# One (file name, object) pair per artefact; the file names are unchanged.
for _filename, _obj in (
    ("X_train.pickle", X_train),
    ("Y_train.pickle", Y_train),
    ("X_test.pickle", X_test),
    ("Y_test.pickle", Y_test),
    ("labels.pickle", labels),
):
    _save_pickle(_obj, _filename)

In [142]:
## Just Check Results of algorithm

# Print the shape of each set, in the order: train labels, train images,
# test labels, test images.
for checked in (Y_train, X_train, Y_test, X_test):
    print(checked.shape)

# Largest numeric Id in the label list.
print(max(id_label))

(20289, 5005)
(20289, 200, 200, 1)
(5072, 5005)
(5072, 200, 200, 1)
5004


In [125]:
# Count how many entries of id_label carry each numeric Id and show the
# result as an {Id: count} mapping.
unique, counts = np.unique(id_label, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 9664,
 1: 1,
 2: 1,
 3: 10,
 4: 2,
 5: 1,
 6: 1,
 7: 8,
 8: 2,
 9: 1,
 10: 17,
 11: 11,
 12: 2,
 13: 1,
 14: 1,
 15: 6,
 16: 1,
 17: 1,
 18: 2,
 19: 5,
 20: 1,
 21: 2,
 22: 5,
 23: 23,
 24: 1,
 25: 1,
 26: 1,
 27: 2,
 28: 10,
 29: 1,
 30: 1,
 31: 3,
 32: 9,
 33: 1,
 34: 1,
 35: 1,
 36: 1,
 37: 6,
 38: 1,
 39: 1,
 40: 10,
 41: 2,
 42: 2,
 43: 2,
 44: 19,
 45: 2,
 46: 1,
 47: 4,
 48: 8,
 49: 5,
 50: 4,
 51: 1,
 52: 1,
 53: 2,
 54: 1,
 55: 2,
 56: 2,
 57: 8,
 58: 1,
 59: 2,
 60: 2,
 61: 2,
 62: 6,
 63: 20,
 64: 61,
 65: 3,
 66: 1,
 67: 1,
 68: 3,
 69: 1,
 70: 1,
 71: 4,
 72: 4,
 73: 1,
 74: 30,
 75: 2,
 76: 1,
 77: 4,
 78: 1,
 79: 1,
 80: 3,
 81: 2,
 82: 2,
 83: 1,
 84: 6,
 85: 2,
 86: 2,
 87: 2,
 88: 3,
 89: 2,
 90: 2,
 91: 1,
 92: 7,
 93: 2,
 94: 2,
 95: 1,
 96: 1,
 97: 1,
 98: 3,
 99: 2,
 100: 3,
 101: 3,
 102: 2,
 103: 1,
 104: 3,
 105: 1,
 106: 1,
 107: 1,
 108: 1,
 109: 1,
 110: 1,
 111: 3,
 112: 1,
 113: 1,
 114: 3,
 115: 2,
 116: 2,
 117: 4,
 118: 3,
 119: 4,
 120: 4,
 121: 2,

In [145]:
N = 10     # Test a label of your choice

# Total number of rows, across both label arrays, marked with label N.
total_for_label = Y_train[:, N].sum() + Y_test[:, N].sum()
print(total_for_label)

17.0
