# IMPORT LIBRARIES

In [6]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os

# LOAD DATA AND ANALYSIS

In [7]:
# Load the stellar classification CSV from the Kaggle input directory into `df`.
df= pd.read_csv("/kaggle/input/stellar-classification-dataset-sdss17/star_classification.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the column list with dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Number of rows per label in the "class" column (shows the class imbalance).
df["class"].value_counts()

In [None]:
# Encode the string labels as integers: GALAXY -> 0, STAR -> 1, anything else -> 2.
# The encoding only runs while the column is still non-numeric; without this
# guard, re-running the cell would compare the already-encoded integers with
# the strings and collapse every row into code 2.
class_codes = {"GALAXY": 0, "STAR": 1}
if not pd.api.types.is_numeric_dtype(df["class"]):
    # Labels missing from the mapping come back as NaN from map(); they take
    # the fallback code 2, matching the original "else 2" branch.
    df["class"] = df["class"].map(class_codes).fillna(2).astype(int)

In [None]:
# Exploratory figures: redshift distribution, r-band spread, correlations of
# the photometric bands with redshift, class counts, and redshift vs. r
# magnitude coloured by class. Each figure is drawn on its own explicit Axes.
sns.set(style="whitegrid")

# 1) Histogram of redshift with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['redshift'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Redshift')
ax.set_xlabel('Redshift')
ax.set_ylabel('Frequency')
plt.show()

# 2) Box plot of the r-band magnitude.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=df['r'], ax=ax)
ax.set_title('Boxplot of r-band Magnitude')
ax.set_xlabel('r magnitude')
plt.show()

# 3) Pairwise correlations between the five bands and redshift.
fig, ax = plt.subplots(figsize=(10, 8))
band_columns = ['u', 'g', 'r', 'i', 'z', 'redshift']
corr = df[band_columns].corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', square=True, ax=ax)
ax.set_title('Correlation Matrix of Photometric Bands and Redshift')
plt.show()

# 4) Bar chart of the number of rows per class code.
fig, ax = plt.subplots(figsize=(8, 5))
class_counts = df['class'].value_counts()
class_counts.plot(kind='bar', ax=ax)
ax.set_title('Object Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()

# 5) Redshift against r magnitude, one colour per class code.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=df['redshift'], y=df['r'], hue=df['class'], alpha=0.6, ax=ax)
ax.set_title('Redshift vs r-band Magnitude by Object Class')
ax.set_xlabel('Redshift')
ax.set_ylabel('r magnitude')
ax.legend(title='Class')
plt.show()

# OUTLIERS

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Score every row of `df` with Local Outlier Factor (default settings) and
# collect the row labels whose score falls below the cut-off.
# NOTE(review): `df` is passed in whole, so every column it holds at this
# point takes part in the distance computation -- confirm that is intended.
clf = LocalOutlierFactor()
y_pred = clf.fit_predict(df)  # fitting is what populates negative_outlier_factor_
x_score = clf.negative_outlier_factor_

# Index the scores by df's own row labels instead of a fresh 0..n-1 range.
# The labels collected below are handed to df.drop(), which works on labels,
# so positional numbers are only correct while df still has its default index
# (they are wrong, for example, if this cell is re-run after rows were dropped).
outlier_score = pd.DataFrame({"score": x_score}, index=df.index)
threshold2 = -1.5  # rows scoring below this are treated as outliers
filtre2 = outlier_score["score"] < threshold2
outlier_index = outlier_score[filtre2].index.tolist()

In [None]:
# Report how many rows were flagged. A bare expression that is not the last
# line of a cell is not displayed, so the count is printed explicitly.
print(len(outlier_index))

# Remove the flagged rows. Rebinding `df` (rather than inplace=True) together
# with errors="ignore" keeps this cell safe to re-run once the rows are gone.
df = df.drop(index=outlier_index, errors="ignore")

In [None]:
# Correlation matrix of all remaining columns, computed once and reused for
# both the heatmap and the ranking below.
corr = df.corr()

f, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, cmap="PuBu", annot=True, linewidths=0.5, fmt='.2f', ax=ax)
plt.show()

# Rank the columns by their correlation with the encoded class label.
# Printed explicitly: a bare expression in the middle of a cell is not shown.
print(corr["class"].sort_values())

# Remove the columns that are not used as model features.
# errors="ignore" lets the cell be re-run after the columns are already gone.
df = df.drop(
    columns=['obj_ID', 'alpha', 'delta', 'run_ID', 'rerun_ID', 'cam_col', 'field_ID', 'fiber_ID'],
    errors="ignore",
)

In [None]:
# Number of row labels that the LOF cell collected in `outlier_index`.
len(outlier_index)

# BALANCING

In [2]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Split the frame into the feature matrix and the label vector.
x = df.drop(['class'], axis = 1)
y = df.loc[:,'class'].values

# Oversample with SMOTE and report the class counts before and after.
# NOTE(review): the resampling runs before the train/test split performed
# later in this notebook, so synthetic samples can be derived from rows that
# end up in the test set -- consider splitting first and resampling only the
# training portion.
sm = SMOTE(random_state=42)
print('Org - Shape %s' % Counter(y))
x, y = sm.fit_resample(x, y)
print('Resampled - Shape %s' % Counter(y))

# Pass the labels by keyword: current seaborn treats the first positional
# argument of countplot as `data`, not as the x variable.
sns.countplot(x=y, palette='Set3')
plt.title("Class ",fontsize=10)
plt.show()

NameError: name 'df' is not defined

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature matrix: learn the scaling parameters from `x` and
# apply them in a single step. `scaler` keeps the fitted parameters.
scaler = StandardScaler()
x = scaler.fit_transform(x)

# IMPORT LIBRARIES FOR TRAINING

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report


from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import ROCAUC
from yellowbrick.style import set_palette

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=123,
)

In [None]:
# Fit a random forest on the training split and report its accuracy on the
# held-out split. The fixed seed makes the fitted forest, and therefore the
# reported accuracy, reproducible between runs.
r_forest = RandomForestClassifier(random_state=42)
r_forest.fit(x_train, y_train)
predicted = r_forest.predict(x_test)

# score() already returns a single accuracy value, so no averaging is needed.
score = r_forest.score(x_test, y_test)
rf_score_ = float(score)

print('Accuracy : %.3f' % (rf_score_))

# EVALUATE

In [3]:
# Display names for the integer class codes, listed in code order:
# 0 = GALAXY, 1 = STAR, 2 = every other label (named QSO here).
classes = ['GALAXY','STAR','QSO']

In [4]:
# Confusion-matrix visualizer wrapped around the random forest, labelled with
# the names in `classes`.
r_forest_cm = ConfusionMatrix(r_forest, classes=classes, cmap='GnBu')

# Order matters: fit on the training split, score on the held-out split, then render.
r_forest_cm.fit(x_train, y_train)
r_forest_cm.score(x_test, y_test)
r_forest_cm.show()

NameError: name 'ConfusionMatrix' is not defined

In [5]:
# Text report comparing the forest's test-split predictions with the true
# labels; rows are keyed by the integer class codes.
print(classification_report(y_test, predicted))

NameError: name 'classification_report' is not defined

In [None]:
# Class-prediction-error visualizer wrapped around the random forest, labelled
# with the names in `classes`.
visualizer = ClassPredictionError(r_forest, classes=classes)

# Switch the yellowbrick colour palette before drawing.
set_palette('pastel')

visualizer.fit(x_train, y_train)        # fit on the training split
visualizer.score(x_test, y_test)        # evaluate on the held-out split
visualizer.show()

# UI TESTING

In [None]:
# Reload the raw CSV: the gallery cell below reads the original "class" labels
# and the alpha/delta columns, which earlier cells re-encoded and dropped.
df= pd.read_csv("/kaggle/input/stellar-classification-dataset-sdss17/star_classification.csv")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import requests
from io import BytesIO

# Gallery of SkyServer image cutouts: one row of panels per object class, up
# to five randomly sampled objects per row, each with a red box drawn around
# the image centre and the class name written above the box.
classes = df['class'].unique()

scale = 0.1           # `scale` query parameter of the cutout service
width = 300           # requested cutout width
height = 300          # requested cutout height
n_per_class = 5       # panels (sampled objects) per class row
box_size = 80         # side length of the centre marker box
request_timeout = 10  # seconds; without a timeout a stalled request blocks the cell forever

# squeeze=False keeps `axes` two-dimensional even when there is a single
# class, so the axes[i, j] indexing below always works.
fig, axes = plt.subplots(nrows=len(classes), ncols=n_per_class, figsize=(20, 12), squeeze=False)

for i, cls in enumerate(classes):
    class_rows = df[df['class'] == cls]
    # Sample five rows, or every row when the class has fewer than five.
    subset = class_rows.sample(n=min(n_per_class, len(class_rows)))

    for j, (index, row) in enumerate(subset.iterrows()):
        panel = axes[i, j]
        ra = row['alpha']
        dec = row['delta']

        url = f"http://skyserver.sdss.org/dr16/SkyServerWS/ImgCutout/getjpeg?ra={ra}&dec={dec}&scale={scale}&width={width}&height={height}"

        # Only the download and decode are guarded, so genuine plotting bugs
        # are not silently reported as a missing image.
        try:
            response = requests.get(url, timeout=request_timeout)
            response.raise_for_status()  # treat HTTP error pages as failures
            img = Image.open(BytesIO(response.content))
            img.load()  # force decoding now so a corrupt payload fails here
        except (requests.RequestException, OSError) as e:
            print(f"Lỗi khi lấy ảnh cho ra={ra}, dec={dec}: {e}")
            # Axes-fraction coordinates keep the note centred in the panel.
            panel.text(0.5, 0.5, 'No image', ha='center', va='center', transform=panel.transAxes)
            panel.axis('off')
            continue

        panel.imshow(img)

        rect = plt.Rectangle((width/2 - box_size/2, height/2 - box_size/2), box_size, box_size,
                             edgecolor='red', facecolor='none', linewidth=2)
        panel.add_patch(rect)

        # Place the label outside the box (just above it).
        panel.text(width/2, height/2 - box_size/2 - 10, cls, color='red', fontsize=8, ha='center', va='bottom')

        panel.axis('off')

    # Blank out panels that received no sample (classes with fewer rows than panels).
    for j in range(len(subset), n_per_class):
        axes[i, j].axis('off')

    # Row label, drawn as text left of the first panel: a y-label would be
    # hidden because axis('off') suppresses axis labels.
    axes[i, 0].text(-0.05, 0.5, cls, transform=axes[i, 0].transAxes,
                    ha='right', va='center', fontsize=12)

plt.tight_layout()
plt.show()