In [1]:
import numpy as np

In [2]:
# Load the galaxy catalogue from the .npy file (path is relative to the working directory)
data = np.load('galaxy_catalogue.npy')

In [8]:
# Show every field of the first catalogue record: name padded to 10 columns,
# value limited to 6 significant characters
first_record = data[0]
for name in data.dtype.names:
  value = first_record[name]
  print('{:10} {:.6}'.format(name, value))

u-g        1.85765
g-r        0.67158
r-i        0.4231
i-z        0.3061
ecc        0.585428
m4_u       2.25195
m4_g       2.33985
m4_r       2.38065
m4_i       2.35974
m4_z       2.39553
petroR50_u 3.09512
petroR50_r 3.81892
petroR50_z 3.82623
petroR90_u 5.17481
petroR90_r 8.26301
petroR90_z 11.4773
class      merger


In [13]:
from sklearn.model_selection import train_test_split

def splitdata_train_test(data, fraction_training, random_state=None):
    """Shuffle ``data`` and split it into a training and a testing subset.

    Parameters
    ----------
    data : array-like exposing ``shape``
        Rows to split; ``data.shape[0]`` is taken as the number of rows.
    fraction_training : float
        Fraction of the rows assigned to the training subset. The resulting
        row count is truncated towards zero with ``int()``.
    random_state : int, RandomState instance or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps the
        earlier behaviour (a different shuffle on every call); pass an integer
        to make the split reproducible across runs.

    Returns
    -------
    The ``(training, testing)`` pair produced by ``train_test_split``.
    """
    # Pass an absolute row count so the training size is exactly
    # int(n_rows * fraction_training).
    train_size = int(data.shape[0] * fraction_training)
    return train_test_split(
        data,
        train_size=train_size,
        shuffle=True,
        random_state=random_state,
    )

In [14]:
# Reload the catalogue from disk
data = np.load('galaxy_catalogue.npy')

# Share of the galaxies that goes into the training set
fraction_training = 0.7

# Partition the catalogue with splitdata_train_test
training, testing = splitdata_train_test(data, fraction_training)

# Report the catalogue size, the requested fraction and the size of each partition
for label, quantity in (
    ('Number data galaxies:', len(data)),
    ('Train fraction:', fraction_training),
    ('Number of galaxies in training set:', len(training)),
    ('Number of galaxies in testing set:', len(testing)),
):
    print(label, quantity)

Number data galaxies: 780
Train fraction: 0.7
Number of galaxies in training set: 546
Number of galaxies in testing set: 234


In [15]:
def generate_features_targets(data):
  """Build the feature matrix and the target labels from catalogue records.

  ``data`` must be indexable by field name. The returned ``features`` array
  has one row per record and 13 columns: the four colour indices, the
  eccentricity, the five ``m4_*`` values, and then the concentrations
  (``petroR50 / petroR90``) in the u, r and z filters. ``targets`` is the
  ``'class'`` field of ``data``.
  """
  # fields copied unchanged into columns 0-9, in this order
  direct_fields = (
    'u-g', 'g-r', 'r-i', 'i-z',
    'ecc',
    'm4_u', 'm4_g', 'm4_r', 'm4_i', 'm4_z',
  )
  # (numerator, denominator) radii whose ratio fills columns 10-12
  radius_pairs = (
    ('petroR50_u', 'petroR90_u'),
    ('petroR50_r', 'petroR90_r'),
    ('petroR50_z', 'petroR90_z'),
  )

  targets = data['class']

  n_columns = len(direct_fields) + len(radius_pairs)
  features = np.empty(shape=(len(data), n_columns))

  for column, field in enumerate(direct_fields):
    features[:, column] = data[field]

  # concentration columns follow directly after the copied fields
  for column, (r50, r90) in enumerate(radius_pairs, start=len(direct_fields)):
    features[:, column] = data[r50] / data[r90]

  return features, targets

In [16]:
# Reload the catalogue from disk
data = np.load('galaxy_catalogue.npy')

# Derive the feature matrix and the class labels for every record
features, targets = generate_features_targets(data)

# Sanity check: report the dimensions of both arrays
for label, array in (("Features shape:", features), ("Targets shape:", targets)):
    print(label, array.shape)

Features shape: (780, 13)
Targets shape: (780,)


In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
from sklearn.tree import DecisionTreeClassifier

def dtc_predict_actual(data):
  """Fit a decision tree on 70% of ``data`` and predict the remaining rows.

  The rows are divided with ``splitdata_train_test`` (training fraction 0.7)
  and turned into features and targets with ``generate_features_targets``.
  A default ``DecisionTreeClassifier`` is fitted on the training part.

  Returns a ``(predictions, test_targets)`` pair: the classifier's
  predictions for the held-out rows and the targets of those same rows.
  """
  # training fraction of 0.7; the remaining rows are held out
  fit_rows, held_out_rows = splitdata_train_test(data, 0.7)

  # features and targets for each subset
  fit_features, fit_targets = generate_features_targets(fit_rows)
  held_out_features, held_out_targets = generate_features_targets(held_out_rows)

  # decision tree with default settings, fitted on the training subset only
  classifier = DecisionTreeClassifier()
  classifier.fit(fit_features, fit_targets)

  # predictions for the held-out rows, paired with their true targets
  held_out_predictions = classifier.predict(held_out_features)
  return held_out_predictions, held_out_targets

In [19]:
# Reload the catalogue from disk
data = np.load('galaxy_catalogue.npy')

# Predicted and true classes for the held-out rows
predicted_class, actual_class = dtc_predict_actual(data)

# Print up to the first 10 results. Iterating over slices (rather than
# indexing 0..9) means a held-out set with fewer than 10 rows prints what
# exists instead of raising IndexError.
print("Some initial results...\n   predicted,  actual")
for i, (predicted, actual) in enumerate(zip(predicted_class[:10], actual_class[:10])):
    print("{}. {}, {}".format(i, predicted, actual))

Some initial results...
   predicted,  actual
0. spiral, spiral
1. merger, spiral
2. merger, spiral
3. spiral, spiral
4. spiral, spiral
5. merger, spiral
6. spiral, spiral
7. spiral, spiral
8. spiral, spiral
9. spiral, merger
