In [1]:
import os
import numpy as np
import pandas as pd
from analysis import *

from pyclustering.cluster import cluster_visualizer_multidim
from pyclustering.cluster.xmeans import xmeans
from pyclustering.cluster.kmeans import *
from pyclustering.cluster.elbow import elbow

from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer

import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import numpy as np

import seaborn as sns

In [2]:
# Read the SDSS MOC table that lives under the current working directory.
sdss_df = import_sdss("%s/Databases/SDSSMOC4/data/sdssmocadr4.tab" % os.getcwd())

# Columns that carry no information for this problem. SMOC_ID is included because it only
# identifies a single SDSS observation: one asteroid can own several SMOC_IDs, so it is
# not a usable per-object identifier.
irrelevant_columns = [
    'OBJ_ID_RUN', 'OBJ_ID_COL', 'OBJ_ID_FIELD', 'OBJ_ID_OBJ',
    'ROWC', 'COLC', 'JD_ZERO', 'RA', 'DEC', 'LAMBDA', 'BETA', 'PHI',
    'VMU', 'VMU_ERROR', 'VNU', 'VNU_ERROR', 'VLAMBDA', 'VBETA',
    'IDFLAG', 'RA_COMPUTED', 'DEC_COMPUTED', 'V_MAG_COMPUTED',
    'R_DIST', 'G_DIST', 'OSC_CAT_ID', 'ARC', 'EPOCH_OSC',
    'A_OSC', 'E_OSC', 'I_OSC', 'LON_OSC', 'AP_OSC', 'M_OSC',
    'PROP_CAT_ID', 'A_PROP', 'E_PROP', 'SIN_I_PROP',
    'V_MAG', 'B_MAG', 'H', 'G', 'A_MAG', 'A_ERR',
    'SMOC_ID', 'PHASE', 'D_COUNTER', 'TOTAL_D_COUNT',
]

# Remove them from the frame in place (same object that import_sdss returned).
sdss_df.drop(columns=irrelevant_columns, inplace=True)

In [3]:
# Column names of the five magnitudes and of their associated errors.
wave_mags = ["U_MAG", "G_MAG", "R_MAG", "I_MAG", "Z_MAG"]
wave_errs = ["U_ERR", "G_ERR", "R_ERR", "I_ERR", "Z_ERR"]

# Ensure data integrity. Every observation MUST have a U, G, R, I, and Z value, and associated error.
sdss_df.dropna(subset=wave_mags + wave_errs, inplace=True)

# Renumber the surviving rows 0..n-1. Dropping rows leaves gaps in the index labels, while
# the clustering further down works on a plain list built from `.values.tolist()`, i.e. on
# row *positions*. Keeping labels identical to positions guarantees that any later lookup of
# clustered rows hits the intended observation regardless of how it indexes the frame.
sdss_df.reset_index(drop=True, inplace=True)

In [4]:
# Per-colour offsets taken from p.18 of "Solar System Objects Observed in the Sloan Digital
# Sky Survey Commissioning Data", Ivezić et al. One value per adjacent band pair, in the
# order U-G, G-R, R-I, I-Z. The displayed frame shows that each *_REFL_COLOR column equals
# the adjacent-band magnitude difference minus the matching offset
# (e.g. 23.51 - 21.55 - 1.32 = 0.64 for the first row).
# NOTE(review): these look like the solar colours from that paper — confirm.
reference_color_offsets = [1.32, 0.45, 0.10, 0.04]

# Adds the U/G/R/I *_MAG_REFL_COLOR columns to sdss_df.
calc_ref_colors(sdss_df, wave_mags, reference_color_offsets)

In [5]:
# Preview only the first rows instead of rendering the entire catalogue frame; this is
# enough to check the retained columns and the newly added *_REFL_COLOR columns.
sdss_df.head(10)

Unnamed: 0,U_MAG,U_ERR,G_MAG,G_ERR,R_MAG,R_ERR,I_MAG,I_ERR,Z_MAG,Z_ERR,AST_NUMBER,PROV_ID,U_MAG_REFL_COLOR,G_MAG_REFL_COLOR,R_MAG_REFL_COLOR,I_MAG_REFL_COLOR
0,23.51,0.64,21.55,0.22,21.14,0.06,20.87,0.07,20.89,0.30,0,-,0.64,-4.000000e-02,1.700000e-01,-0.06
1,21.81,0.20,20.32,0.04,19.77,0.02,19.56,0.03,19.44,0.14,62869,2000 UO84,0.17,1.000000e-01,1.100000e-01,0.08
2,25.02,0.95,23.43,0.33,21.50,0.09,20.68,0.07,20.14,0.15,0,-,0.27,1.480000e+00,7.200000e-01,0.50
3,23.28,0.54,21.48,0.06,20.78,0.05,20.75,0.07,20.93,0.30,0,2004 TG250,0.48,2.500000e-01,-7.000000e-02,-0.22
4,19.69,0.03,17.91,0.02,17.32,0.01,17.10,0.02,17.05,0.03,5212,1989 SS,0.46,1.400000e-01,1.200000e-01,0.01
5,23.10,0.44,21.76,0.08,21.20,0.06,20.99,0.07,21.40,0.43,0,-,0.02,1.100000e-01,1.100000e-01,-0.45
6,21.53,0.12,19.88,0.02,19.23,0.02,19.12,0.03,19.04,0.06,0,2003 WJ38,0.33,2.000000e-01,1.000000e-02,0.04
7,21.83,0.16,19.82,0.02,19.16,0.02,18.94,0.02,19.02,0.06,35549,1998 FT108,0.69,2.100000e-01,1.200000e-01,-0.12
8,22.89,0.37,21.79,0.08,21.08,0.05,20.87,0.07,20.92,0.28,0,-,-0.22,2.600000e-01,1.100000e-01,-0.09
9,22.93,0.42,21.11,0.04,20.40,0.03,20.20,0.04,19.90,0.12,0,2000 WD18,0.50,2.600000e-01,1.000000e-01,0.26


In [None]:
# Flatten the magnitude and error columns into a plain list of rows (one 10-element list
# per observation), which is the input handed to the pyclustering calls below.
# NOTE(review): the *_ERR columns are used as clustering features alongside the magnitudes,
# and the *_REFL_COLOR columns computed earlier are not used — confirm this is intended.
training_data = sdss_df.loc[:, wave_mags + wave_errs].values.tolist()

In [None]:
# Visualize the five magnitudes at once. Per the original note, plot_5d renders 5-D data as
# bubble charts, using hue, size and depth for the extra dimensions. The order of the bands
# decides which magnitude is mapped to which visual channel, so the same data is drawn
# twice: once in U→Z order and once reversed.

# Forward order: U, G, R, I, Z.
forward_bands = wave_mags
plot_5d(*[sdss_df[band] for band in forward_bands],
        *forward_bands, "U_MAG - G_MAG - R_MAG - I_MAG - Z_MAG")

# Reverse order: Z, I, R, G, U.
reverse_bands = wave_mags[::-1]
plot_5d(*[sdss_df[band] for band in reverse_bands],
        *reverse_bands, "Z_MAG - I_MAG - R_MAG - G_MAG - U_MAG")

In [None]:
# Same 5-D plot with a shuffled band-to-axis assignment: G, Z, I, U, R.
mixed_bands = ["G_MAG", "Z_MAG", "I_MAG", "U_MAG", "R_MAG"]
plot_5d(*[sdss_df[band] for band in mixed_bands],
        *mixed_bands,
        "G_MAG - Z_MAG - I_MAG - U_MAG - R_MAG")

In [None]:
# Range of candidate cluster counts (K) handed to the elbow analysis.
kmin = 1
kmax = 20

# Build the elbow analyser over the training rows; nothing is computed until process() runs.
elbow_instance = elbow(training_data, kmin, kmax)

In [None]:
# Run the elbow analysis over the configured K range.
elbow_instance.process()

# Most probable number of clusters according to the elbow analysis.
amount_clusters = elbow_instance.get_amount()

# Total within-cluster error for each K that was tried.
wce = elbow_instance.get_wce()

In [None]:
# Show the cluster count that the elbow analysis selected.
amount_clusters

In [None]:
# Cluster the observations with K-Means, using the cluster count picked by the elbow step.
# NOTE(review): no random seed is set for the k-means++ initializer, so the initial centers
# (and therefore the final clusters) may differ between runs — confirm whether the installed
# pyclustering version supports a `random_state` argument and pin it if so.
center_initializer = kmeans_plusplus_initializer(training_data, amount_clusters)
centers = center_initializer.initialize()

kmeans_instance = kmeans(training_data, centers)
kmeans_instance.process()

In [None]:
# Pull the results out of the fitted K-Means instance for the visualization below.
# `centers` replaces the initial centers computed before the fit.
centers = kmeans_instance.get_centers()
clusters = kmeans_instance.get_clusters()

In [None]:
# Draw the clustered observations in the G, Z, I, U, R band arrangement.
# NOTE(review): `clusters` presumably holds row positions into training_data; confirm that
# draw_5d_clusters looks the Series up by position rather than by index label.
cluster_plot_bands = ["G_MAG", "Z_MAG", "I_MAG", "U_MAG", "R_MAG"]
draw_5d_clusters(*[sdss_df[band] for band in cluster_plot_bands],
                 clusters, *cluster_plot_bands,
                 "G_MAG - Z_MAG - I_MAG - U_MAG - R_MAG")