# Galaxy Recognition challenge

In [9]:
import os
import glob
from tqdm import tqdm

import numpy as np
import pandas as pd
import xarray as xr

from skimage.transform import resize
from skimage import io

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
!unzip -q /kaggle/input/galaxy-zoo-the-galaxy-challenge/images_training_rev1.zip
!unzip -q /kaggle/input/galaxy-zoo-the-galaxy-challenge/training_solutions_rev1.zip
!unzip -q /kaggle/input/galaxy-zoo-the-galaxy-challenge/images_test_rev1.zip

In [None]:
#!rm training_solutions_rev1.zip
#!rm images_training_rev1.zip
#!rm -r sample_data

# Data Analysis

In [5]:
df = pd.read_csv('/kaggle/working/training_solutions_rev1.csv')

In [6]:
df

In [8]:
# Based on https://www.kaggle.com/helmehelmuto/keras-cnn
# but uses float32 to save memory/space

# Preprocessing geometry shared by the image loaders below.
# ORIG_SHAPE is only used to compute the offsets that centre the crop window
# (assumes every raw image is 424x424 -- TODO confirm against the dataset).
ORIG_SHAPE = (424,424)
# Size of the central window cut out of each raw image before downscaling.
CROP_SIZE = (256,256)
# Final (height, width) each cropped image is resized to.
IMG_SHAPE = (64,64)

def get_image(path, x1,y1, shape, crop_size):
    """Read one image, crop a window out of it and resize the window.

    path      -- image file to read
    x1, y1    -- start index of the window along the first / second array axis
    shape     -- target size handed to skimage's ``resize``
    crop_size -- window extent along the first / second array axis

    Returns the resized window as produced by ``resize``.
    """
    rows = slice(x1, x1 + crop_size[0])
    cols = slice(y1, y1 + crop_size[1])
    window = plt.imread(path)[rows, cols]
    return resize(window, shape)
    
def get_train_data(dataframe, shape=IMG_SHAPE, crop_size=CROP_SIZE):
    """Load the training images and labels described by ``dataframe``.

    dataframe -- frame whose first column holds the numeric image id and whose
                 remaining columns hold the label values
    shape     -- size each cropped image is resized to
    crop_size -- size of the centred window cut out of each raw image

    Returns ``(x_batch, y_batch)``: the stacked float32 images and the float32
    label matrix, in the row order of ``dataframe``.
    """
    # Centre the window using the *argument* crop_size. The previous code used
    # the global CROP_SIZE here, so a non-default crop_size was cut off-centre.
    x1 = (ORIG_SHAPE[0] - crop_size[0]) // 2
    y1 = (ORIG_SHAPE[1] - crop_size[1]) // 2

    sel = dataframe.values
    # ids pass through int first so that "100008.0" becomes the file stem "100008"
    ids = sel[:, 0].astype(int).astype(str)
    y_batch = sel[:, 1:].astype(np.float32)

    # float32 per image keeps the stacked array half the size of float64
    x_batch = np.array([
        get_image(
            '/kaggle/working/images_training_rev1/' + i + '.jpg',
            x1, y1, shape=shape, crop_size=crop_size,
        ).astype(np.float32)
        for i in tqdm(ids)
    ])
    return x_batch, y_batch

%time x, y = get_train_data(df)

In [9]:
# Don't change the numbers in this command!
# NOTE: this rebinds `x` / `y` to the training part of the split, so running the
# cell a second time would split the already-reduced arrays again. Re-run the
# loading cell (get_train_data) before re-running this one.
x, x_test, y, y_test = train_test_split(x, y, train_size=0.89391, random_state=99765749)

In [10]:
# Quick sanity check of the training split's dimensions.
print(f"shape of the data array: {x.shape}")
print(f"shape of the label array: {y.shape}")

In [11]:
# Heat-map of channel index 1 of training sample 4, with a colour bar to show
# the value range after resizing (presumably the green channel -- assumes RGB
# channel order, confirm).
plt.pcolormesh(x[4,:,:,1])
plt.colorbar()

In [12]:
plt.imshow(x[4])
plt.show()

---
## Test metric definition
Before you start coding, we define a scoring function: the *Root Mean Squared Error* (RMSE), $\sqrt{\frac{1}{N}\sum_i (\hat{y}_i - y_i)^2}$, evaluated against the held-out labels `y_test`. You can use this same metric for the optimisation of the network.

In [13]:
def score(y_prediction, y_true=None):
    """Return the Root Mean Squared Error of ``y_prediction``.

    y_prediction -- array-like of predicted values
    y_true       -- array-like of reference values with a broadcast-compatible
                    shape; when omitted, the notebook-level ``y_test`` hold-out
                    labels are used (the original behaviour)

    The mean is taken over every element, so the result is a single scalar.
    """
    if y_true is None:
        # Fall back to the global hold-out labels for backward compatibility.
        y_true = y_test
    diff = np.asarray(y_prediction) - np.asarray(y_true)
    return np.sqrt(np.mean(np.power(diff, 2)))

In [2]:
# Imports first
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Activation, Dropout, Conv2D, MaxPooling2D, Flatten, LeakyReLU, BatchNormalization,GlobalMaxPooling2D
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.losses import mean_squared_error
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l1,l2


In [4]:
# CNN regressor: four L2-regularised 3x3 conv stages (each followed by batch
# normalisation, the first three also by 2x2 max-pooling), a global max-pool
# that collapses the spatial axes, then a dense head with dropout.
# The input shape is hard-coded to 64x64x3 so the model can be built without
# the training array being loaded.
model = Sequential([
    # --- convolutional feature extractor ---
    Conv2D(32, 3, activation='relu', input_shape=(64, 64, 3), kernel_regularizer='l2'),
    BatchNormalization(),
    MaxPooling2D(2),

    Conv2D(64, 3, activation='relu', kernel_regularizer='l2'),
    BatchNormalization(),
    MaxPooling2D(2),

    Conv2D(128, 3, activation='relu', kernel_regularizer='l2'),
    BatchNormalization(),
    MaxPooling2D(2),

    Conv2D(512, 3, activation='relu', kernel_regularizer='l2'),
    BatchNormalization(),

    # One value per feature map instead of flattening the whole grid.
    GlobalMaxPooling2D(),

    # --- dense regression head ---
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.2),
    # 37 outputs (presumably one per label column -- confirm against y.shape);
    # sigmoid keeps each output inside [0, 1].
    Dense(37, activation='sigmoid'),
])

In [5]:
model.summary()

In [15]:
# Configure training: Adam optimiser, mean-squared-error loss, and RMSE tracked
# as a metric (read back later from the history under the keys
# 'root_mean_squared_error' / 'val_root_mean_squared_error').
model.compile(optimizer='adam',loss='mse',metrics=['RootMeanSquaredError'])
# NOTE: as the notebook is written, fit() only runs in a later cell, so this
# checkpoint captures the weights the model has *before* training.
model.save_weights('/kaggle/working/checkpoints/my_checkpoint')

In [16]:
# Train for 7 epochs in mini-batches of 128, validating on the hold-out split
# after every epoch; `history` keeps the per-epoch loss/metric curves.
history = model.fit(x, y, batch_size=128, epochs=7, validation_data=(x_test, y_test))

# Persist the *trained* weights. The checkpoint is otherwise only written before
# fit(), so the later model.load_weights(...) call would restore untrained
# weights and the predictions would come from an untrained network.
model.save_weights('/kaggle/working/checkpoints/my_checkpoint')

In [17]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
for curve in ('loss', 'val_loss'):
    ax.plot(history.history[curve])
ax.legend(['train', 'val'], loc='upper right')
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
plt.show()

In [18]:
# Training vs. validation RMSE per epoch. The title and y-label previously said
# "loss", which mislabelled this figure as a copy of the loss plot.
fig, ax = plt.subplots()
ax.plot(history.history['root_mean_squared_error'])
ax.plot(history.history['val_root_mean_squared_error'])
ax.set_title('model RMSE')
ax.set_ylabel('RMSE')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper right')
plt.show()

In [6]:
# Restore the weights from the checkpoint path that model.save_weights(...)
# wrote earlier in the notebook; `model` must already exist at this point.
model.load_weights('/kaggle/working/checkpoints/my_checkpoint')

In [None]:
# Build the test-set input array with exactly the same preprocessing as the
# training images. The constants and get_image are repeated here (same values,
# same logic) so this cell also works in a prediction-only session where the
# training-data cell has not been run.
ORIG_SHAPE = (424,424)
CROP_SIZE = (256,256)
IMG_SHAPE = (64,64)

def get_image(path, x1,y1, shape, crop_size):
    """Read an image, crop a crop_size window starting at (x1, y1), resize to shape."""
    x = plt.imread(path)
    x = x[x1:x1+crop_size[0], y1:y1+crop_size[1]]
    x = resize(x, shape)
    return x

TEST_IMAGE_DIR = '/kaggle/working/images_test_rev1'

# os.listdir returns entries in arbitrary order and may contain non-image
# files; keep only the .jpg files and sort them so the run is deterministic and
# int(<file stem>) below cannot fail on a stray entry.
image_name_list = sorted(
    f for f in os.listdir(TEST_IMAGE_DIR) if f.lower().endswith('.jpg')
)

# Offsets that centre the crop window inside the raw image.
x1 = (ORIG_SHAPE[0]-CROP_SIZE[0])//2
y1 = (ORIG_SHAPE[1]-CROP_SIZE[1])//2

id_list = []
x_batch = []
for f in tqdm(image_name_list):
    # Use `img`, not `x`: assigning to `x` here would overwrite the global
    # training array `x` and break any later re-training or inspection.
    img = get_image(os.path.join(TEST_IMAGE_DIR, f), x1, y1, shape=IMG_SHAPE, crop_size=CROP_SIZE)
    x_batch.append(img.astype(np.float32))
    # The galaxy id is the file name without its extension.
    id_list.append(int(os.path.splitext(f)[0]))

id_list = np.array(id_list)
x_batch = np.array(x_batch)
# (n_images, height, width, channels); also safe to display when the list is empty
x_batch.shape

In [7]:
# The model has a single input, so pass the array itself. Wrapping it in a list
# makes Keras treat it as a multi-input structure, which is unnecessary here and
# triggers a structure-mismatch warning on newer Keras versions.
y_prediction = model.predict(x_batch)

In [None]:
#gid =id_list.reshape(len(id_list),1)
#print(gid.shape)

In [None]:
#fres = np.concatenate((gid,y_prediction))
# Put the galaxy ids in the first column and the predictions in the remaining
# ones. Stacking puts both into one array with a single common dtype, so the
# ids will not stay integers if the predictions are floats; the dtype shown
# below makes that visible.
fres = np.column_stack((id_list,y_prediction))
fres.dtype

In [None]:

# Reuse the column names of the training solutions so the output has the same
# header ("GalaxyID" followed by the label columns).
result = pd.DataFrame(fres, columns=df.columns)
# Cast the ids in the GalaxyID column to integers.
result = result.astype({"GalaxyID": int}, errors='raise')
# sort_values returns a new frame; the result was previously discarded, which
# left `result` (and therefore the exported CSV) unsorted.
result = result.sort_values(by=['GalaxyID']).reset_index(drop=True)
result.head()

In [None]:
# Write the submission to disk. Without a path, to_csv only returns the CSV text
# as a string (dumping it into the cell output) and no file is created.
result.to_csv('/kaggle/working/submission.csv', index=False)