![title](imagens/a2.jpg)

# Classificação de galáxias, estrelas e quasares.


**Author: Rafael Gallo**

**Projeto de classificação de astros espaciais**

**Algoritmos utilizados: K-NN, Naive Bayes e árvore de decisão**

**Contexto**

Dados do Sloan Digital Sky Survey (SDSS), na versão atual do servidor (DR16), contendo galáxias, estrelas e quasares.


**Conteúdo**

A tabela resulta de uma consulta que une duas tabelas:

- "PhotoObj", que contém dados fotométricos;
- "SpecObj", que contém dados espectrais.

O conjunto tem 16 variáveis (double) e 1 variável adicional (char), 'class'.
A classe de um objeto pode ser prevista a partir das outras 16 variáveis.





# Inspiração
O Sloan Digital Sky Survey criou os mapas tridimensionais mais detalhados do Universo já feitos, com imagens multicoloridas profundas de um terço do céu e espectros de mais de três milhões de objetos astronômicos. Permite conhecer e explorar todas as fases e pesquisas - passadas, presentes e futuras - do SDSS.

In [None]:
## Descrição das variáveis

## objid = Identificador do objeto
## ra = Ascensão reta J2000 (banda r)
## dec = Declinação J2000 (banda r)
## u = melhor ajuste de magnitude deV / Exp (banda u)
## g = melhor ajuste de magnitude deV / Exp (banda g)
## r = melhor ajuste de magnitude deV / Exp (banda r)
## i = melhor ajuste de magnitude deV / Exp (banda i)
## z = melhor ajuste de magnitude deV / Exp (banda z)
## run = Número da execução (run)
## rerun = Número da reexecução (rerun)
## camcol = Coluna da câmera
## field = Número do campo
## specobjid = Identificador do objeto espectroscópico
## class = Classe do objeto (galáxia, estrela ou quasar)
## redshift = Redshift final
## plate = Número da placa
## mjd = MJD da observação
## fiberid = ID da fibra

In [None]:
from platform import python_version

# python_version() reports the interpreter version, not the Jupyter Notebook
# version, so the label names what is actually printed.
print('Versão do Python neste projeto:', python_version())

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# 3.8 removed the old names, so pick whichever spelling this install provides.
darkgrid_style = next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(darkgrid_style)

# sns.set applies the darkgrid style itself, so no separate set_style call is
# needed before it.
sns.set(style="darkgrid", color_codes=True, font_scale=1.5)
color = sns.color_palette()

In [None]:
# Read the SDSS query export (path is relative to the working directory) and
# preview the first rows.
csv_path = "Skyserver_12_30_2019 4_49_58 PM.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Show the last rows of the loaded table.
df.tail()

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column (isnull is the alias of isna).
df.isnull().sum()

In [None]:
# Column totals for the numeric columns only: summing the text column `class`
# would concatenate every label into one very long string (or raise).
df.sum(numeric_only=True)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Pairwise correlation between the numeric columns. The text column `class` is
# excluded explicitly: pandas >= 2.0 no longer drops non-numeric columns
# silently and raises when it cannot convert them to float.
corr = df.select_dtypes(include='number').corr()
corr

In [None]:
# Annotated heatmap of the correlation matrix on a wide canvas.
fig, ax = plt.subplots(figsize=(35, 10))
sns.heatmap(corr, annot=True, cmap='plasma', ax=ax);

# Análise exploratória

In [None]:
# One sub-table per object class, selected by the label stored in `class`.
class_column = df['class']
stars = df.loc[class_column.eq('STAR')]
quasars = df.loc[class_column.eq('QSO')]
galaxies = df.loc[class_column.eq('GALAXY')]

In [None]:
plt.figure(figsize=(15, 8))

# Name the variable through `x=`: recent seaborn releases accept only `data`
# positionally, so a bare Series is no longer read as the x variable.
ax = sns.countplot(x="class", data=df)

In [None]:
plt.figure(figsize=(15, 8))

# Pass the Series as `x=` so the box stays horizontal on every seaborn
# version; a positional Series is treated as `data` by recent releases.
ax = sns.boxplot(x=df["redshift"])

In [None]:
plt.figure(figsize=(15, 8))

# `distplot` is deprecated; a density-scaled histogram with a KDE overlay is
# its documented replacement (cut=3 matches the old KDE extent).
sns.histplot(x=stars["redshift"], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
plt.figure(figsize=(15, 8))

# `distplot` is deprecated; a density-scaled histogram with a KDE overlay is
# its documented replacement (cut=3 matches the old KDE extent).
sns.histplot(x=galaxies["redshift"], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
plt.figure(figsize=(15, 8))

# `distplot` is deprecated; a density-scaled histogram with a KDE overlay is
# its documented replacement (cut=3 matches the old KDE extent).
sns.histplot(x=quasars["redshift"], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
sns.set(style='darkgrid')
color_palette = 'GnBu_d'

# Three stacked panels, one redshift box plot per object class.
fig, axs = plt.subplots(nrows=3, figsize=(13, 9))
plt.subplots_adjust(hspace=0.8)

# A single box needs one colour: take the first colour of the palette instead
# of passing `palette=` without `hue`, which recent seaborn deprecates.
box_color = sns.color_palette(color_palette, 1)[0]

# (subset, panel title) pairs, drawn top to bottom.
panels = [(stars, 'Stars'), (galaxies, 'Galaxies'), (quasars, 'Quasars')]
for ax, (subset, title) in zip(axs, panels):
    # `x=` keeps the box horizontal; a positional Series is read as `data`.
    sns.boxplot(x=subset['redshift'], color=box_color, ax=ax).set_title(title)
plt.show()

In [None]:
# Grid of pairwise plots over the columns of df; one panel per column pair,
# so this cell can be slow on a table with many columns.
sns.pairplot(df)

# Treino e Teste

In [None]:
# Columns that are not used as model inputs further down.
unused_columns = [
    'objid',
    'ra',
    'dec',
    'run',
    'rerun',
    'camcol',
    'field',
    'specobjid',
    'plate',
    'mjd',
    'fiberid',
]

# errors='ignore' keeps the cell re-runnable: because `df` is reassigned,
# a second run would otherwise raise KeyError on the already-dropped columns.
df = df.drop(columns=unused_columns, errors='ignore')
df.head()

In [None]:
# Model inputs: the five magnitudes plus redshift.
x = df[['u','g','r','i','z','redshift']]

# Integer-encode the class labels. sort=True assigns codes in alphabetical
# label order (GALAXY < QSO < STAR for the labels filtered on earlier) instead
# of order of first appearance in the file, so code 0/1/2 lines up with the
# alphabetically sorted axis labels used by the confusion-matrix plots.
# NOTE(review): assumes `class` holds only those three labels; confirm.
y = pd.factorize(df['class'], sort=True)[0]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=0,
)

In [None]:
# (rows, feature columns) of the training features.
x_train.shape

In [None]:
# Number of training labels; should match the row count of x_train.
y_train.shape

In [None]:
# (rows, feature columns) of the held-out test features.
x_test.shape

In [None]:
# Number of held-out test labels; should match the row count of x_test.
y_test.shape

# Pré processamento

In [None]:
from sklearn.preprocessing import RobustScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to the test split. Re-fitting on the test data would
# leak test information and put the two splits on different scales.
Scaler = RobustScaler()
scaler_train = Scaler.fit_transform(x_train)
scaler_test = Scaler.transform(x_test)

In [None]:
# Scaled training features returned by the scaler.
scaler_train

In [None]:
# Scaled test features returned by the scaler.
scaler_test

# Modelo de machine learning

# **Naive bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training split (fit returns the estimator).
naive = GaussianNB().fit(x_train, y_train)

# Accuracy on the training split itself, and predictions for the test split.
naive_scor = naive.score(x_train, y_train)
naive_pred = naive.predict(x_test)
naive_scor

In [None]:
# Naive Bayes predictions for the test split.
naive_pred

# K-NN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-NN is distance-based, so it is trained and evaluated on the scaled
# feature arrays produced in the pre-processing step rather than on the raw
# columns, whose differing ranges would otherwise dominate the distance.
knn = KNeighborsClassifier(n_neighbors = 15, metric = "minkowski", p = 2)
knn.fit(scaler_train, y_train)
knn_pred = knn.predict(scaler_test)

# Accuracy on the training split itself (not a generalisation estimate).
knn_scor = knn.score(scaler_train, y_train)
knn_scor

In [None]:
# K-NN predictions for the test split.
knn_pred

# Árvore de decisão

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# A fixed random_state makes the fitted tree, and every metric derived from
# it, identical across re-runs; the same seed is used for the data split.
de_tree = DecisionTreeClassifier(random_state=0)
de_tree = de_tree.fit(x_train, y_train)

# Accuracy on the training split itself (not a generalisation estimate).
de_score = de_tree.score(x_train, y_train)
de_score

In [None]:
# Decision-tree predictions for the test split.

tree_pred = de_tree.predict(x_test)
tree_pred

In [None]:
from sklearn.metrics import accuracy_score

# Test-split accuracy of each model, computed in the order the models were built.
acuracia_naive_bayes, acuracia_KNN, acuracia_tree = (
    accuracy_score(y_test, predictions)
    for predictions in (naive_pred, knn_pred, tree_pred)
)

# Reported as percentages with three decimals.
print("Acuracia modelo 1 - Naive bayes foi: %.3f" % (100 * acuracia_naive_bayes))
print("Acuracia modelo 2 - K-NN foi: %.3f" % (100 * acuracia_KNN))
print("Acuracia modelo 3 - Arvore de decisão foi: %.3f" % (100 * acuracia_tree))

In [None]:
from sklearn.metrics import confusion_matrix

# One confusion matrix per model: 1 = Naive Bayes, 2 = K-NN, 3 = decision tree.
matrix_confusion_1, matrix_confusion_2, matrix_confusion_3 = [
    confusion_matrix(y_test, predictions)
    for predictions in (naive_pred, knn_pred, tree_pred)
]

In [None]:
# Display names for the three classes; np.unique returns them sorted.
y_true = ['GALAXY', 'QUASAR', "STAR"]
y_pred = ['GALAXY', 'QUASAR', "STAR"]
class_labels = np.unique(y_true)

matrix_confusion = confusion_matrix(y_test, naive_pred)

# Rows are the actual classes, columns the predicted ones.
df_cm = pd.DataFrame(matrix_confusion_1, index=class_labels, columns=class_labels)
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'

plt.figure(figsize=(10, 7))
sns.set(font_scale=1.4)

plt.title("Model 1 - Matrix confusion - Naive bayes")
ax = sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt="", cmap='plasma')

In [None]:
# Display names for the three classes; np.unique returns them sorted.
y_true  = ['GALAXY', 'QUASAR', "STAR"]
y_pred  = ['GALAXY', 'QUASAR', "STAR"]

# This cell is about K-NN, so the matrix is computed from the K-NN
# predictions rather than the Naive Bayes ones.
matrix_confusion =  confusion_matrix(y_test, knn_pred)

# Rows are the actual classes, columns the predicted ones.
df_cm = pd.DataFrame(matrix_confusion_2, columns=np.unique(y_true), index = np.unique(y_true))

df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize = (10,7))
sns.set(font_scale=1.4)

plt.title("Model 2 - Matrix confusion - KNN")
# Trailing semicolon hides the Axes repr in the cell output.
sns.heatmap(df_cm, cmap = 'plasma', annot=True,annot_kws={"size": 16}, fmt = "");

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# 3.8 removed the old names, so pick whichever spelling this install provides.
darkgrid_style = next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(darkgrid_style)
sns.set_style("darkgrid")

In [None]:
# Display names for the three classes; np.unique returns them sorted.
y_true  = ['GALAXY', 'QUASAR', "STAR"]
y_pred  = ['GALAXY', 'QUASAR', "STAR"]

# This cell is about the decision tree, so the matrix is computed from the
# tree predictions rather than the Naive Bayes ones.
matrix_confusion =  confusion_matrix(y_test, tree_pred)

# Rows are the actual classes, columns the predicted ones.
df_cm = pd.DataFrame(matrix_confusion_3, columns=np.unique(y_true), index = np.unique(y_true))

df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize = (10,7))
sns.set(font_scale=1.4)

plt.title("Model 3 - Matrix confusion - Árvore de decisão")
# Trailing semicolon hides the Axes repr in the cell output.
sns.heatmap(df_cm, cmap = 'plasma', annot=True, annot_kws = {"size": 16}, fmt = "");

In [None]:
# recall_score was imported twice; one import per metric is enough.
from sklearn.metrics import f1_score, precision_score, recall_score

# Precision, recall and F1 are macro-averaged: each class contributes equally
# regardless of how many test rows it has. accuracy_score comes from the
# earlier accuracy cell.
print("Precision - Naive bayes = {}".format(precision_score(y_test, naive_pred, average='macro')))
print("Recall - Naive bayes = {}".format(recall_score(y_test, naive_pred, average='macro')))
print("Accuracy - Naive bayes = {}".format(accuracy_score(y_test, naive_pred)))
print("F1 Score - Naive bayes = {}".format(f1_score(y_test, naive_pred, average='macro')))
print("\n")

print("Precision - K-NN = {}".format(precision_score(y_test, knn_pred, average='macro')))
print("Recall - K-NN = {}".format(recall_score(y_test, knn_pred, average='macro')))
print("Accuracy - K-NN = {}".format(accuracy_score(y_test, knn_pred)))
print("F1 Score - K-NN = {}".format(f1_score(y_test, knn_pred, average='macro')))
print("\n")

print("Precision - Arvore de decisão = {}".format(precision_score(y_test, tree_pred, average='macro')))
print("Recall - Arvore de decisão = {}".format(recall_score(y_test, tree_pred, average='macro')))
print("Accuracy - Arvore de decisão = {}".format(accuracy_score(y_test, tree_pred)))
print("F1 Score - Árvore de decisão = {}".format(f1_score(y_test, tree_pred, average='macro')))
print("\n")

In [None]:
from sklearn.metrics import classification_report

# (label, test-split predictions) for each model, in reporting order.
model_reports = [
    ("Naive bayes", naive_pred),
    ("K-NN", knn_pred),
    ("Arvore de decisão", tree_pred),
]

for position, (label, predictions) in enumerate(model_reports):
    # Blank separator between consecutive reports, none after the last one.
    if position > 0:
        print("\n")
    print(label, classification_report(y_test, predictions))