# Script for generating the training and testing data sets for desirable galaxy types

The script follows after script **02_Reading_ProcessingSDSSdata.ipynb** and uses the data files generated there. It can be run locally on your computer after obtaining the data sets remotely from *lesta*.

**Data**: 11th Nov, 2019 <br>
**Author**: Soumya Shreeram <br>
**Guidance form**:Anand Raichoor <br>
**Script motivated from:** S. Ben Nejma


In [2]:
import astropy.io.fits as fits
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 20})
import numpy as np
from numpy.lib.format import open_memmap
import os, sys
import subprocess
from astropy.convolution import convolve, Box1DKernel
import random

## 1. Defining the input parameters

In [3]:
# setting the right path for the directory with the data
current_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
data_dir = os.path.join(root_dir, "Data_files\\")

# ratio with which the data is separated for training and testing
ratio = 0.7

In [22]:
def setName(data_dir, filename):
    filename = filename+'.npy'
    return os.path.join(data_dir, filename)

def writeOutputToFile(input_name, data_dir, shape_arr, in_dtype):
    """
    Write to a .npy file as a memory-mapped array
    @param input_name :: array name
    @param shape_arr :: shape of the array to be memory-mapped
    
    @return output_arr :: the memory-mapped array
    """
    filename = input_name+'.npy'
    filename = os.path.join(data_dir, filename)
    w1 = open_memmap(filename, dtype=in_dtype, mode='w+', shape=shape_arr)
    return w1


## 2. Preparation of training and testing data sets

In [23]:
def trainIndicies(Y, ratio):
    # lists of the different labels, and aranging indicies
    categories = np.unique(Y).astype(int)
    indicies = np.arange(len(Y))

    # minimum no. of samples to choose/catergory
    min_samples = np.array([int(ratio*len(Y[Y == i])) for i in categories])

    #  list of indicies for every target type, and shuffling them at random
    category_indicies = [indicies[Y == i] for i in categories]
    for i in categories:
        random.shuffle(category_indicies[i])

    # indexes to use for training    
    indexes_train_interm = [category_indicies[i][:min_samples[i]]
                                for i in categories]
    indexes_train = np.array([idx for categories in indexes_train_interm
                              for idx in categories])
    return indexes_train, np.sum(min_samples)

def generateTrainTestFiles(len_train, X, data_dir):
    """
    Function to generate empty memory-mapped files for training and testing data sets
    """
    X_train = writeOutputToFile('X_train', data_dir, (len_train, X.shape[1]), 'float32')
    Y_train = writeOutputToFile('Y_train', data_dir, (len_train,), 'uint8')
    
    X_test =  writeOutputToFile('X_test', data_dir, (X.shape[0]-len_train, X.shape[1]), 'float32')
    Y_test = writeOutputToFile('Y_train', data_dir, (X.shape[0]-len_train,), 'uint8')
    return X_train, Y_train, X_test, Y_test

In [7]:
# loading the (X, Y) == (flux, target-types)data sets
X = np.load('Data_files/X_corrupted.npy', mmap_mode='r')
Y = np.load('Data_files/Y_corrupted.npy', mmap_mode='r')

# generate the indicies to train rather than samples, to reduce computational cost
indexes_train, len_train = trainIndicies(Y, ratio)

In [24]:
# create memory-mapped array for the (X, Y) training sets
X_train, Y_train, X_test, Y_test = generateTrainTestFiles(len_train, X, data_dir)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\1199\\Desktop\\Year 1, Sem 1\\TPVIa\\Data_files\\X_train.npy'

'C:\\Users\\1199\\Desktop\\Year 1, Sem 1\\TPVIa\\Data_files\\filename.npy'