# **Stars Classification**

This is a dataset consisting of several features of stars.

Some of them are:

- Absolute Temperature (in K)
- Relative Luminosity (L/Lo)
- Relative Radius (R/Ro)
- Absolute Magnitude (Mv)
- Star Color (white,Red,Blue,Yellow,yellow-orange etc)
- Spectral Class (O,B,A,F,G,K,,M)
- Star Type **(Red Dwarf, Brown Dwarf, White Dwarf, Main Sequence , SuperGiants, HyperGiants)**
- Lo = 3.828 x 10^26 Watts (Avg Luminosity of Sun)
- Ro = 6.9551 x 10^8 m (Avg Radius of Sun)

In [1]:
# import library
import pandas as pd
import numpy as np

In [2]:
# import data
star = pd.read_csv ('Stars.csv')

In [3]:
star.head()

Unnamed: 0,Temperature (K),Luminosity (L/Lo),Radius (R/Ro),Absolute magnitude (Mv),Star type,Star category,Star color,Spectral Class
0,3068,0.0024,0.17,16.12,0,Brown Dwarf,Red,M
1,3042,0.0005,0.1542,16.6,0,Brown Dwarf,Red,M
2,2600,0.0003,0.102,18.7,0,Brown Dwarf,Red,M
3,2800,0.0002,0.16,16.65,0,Brown Dwarf,Red,M
4,1939,0.000138,0.103,20.06,0,Brown Dwarf,Red,M


In [4]:
star.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Temperature (K)          240 non-null    int64  
 1   Luminosity (L/Lo)        240 non-null    float64
 2   Radius (R/Ro)            240 non-null    float64
 3   Absolute magnitude (Mv)  240 non-null    float64
 4   Star type                240 non-null    int64  
 5   Star category            240 non-null    object 
 6   Star color               240 non-null    object 
 7   Spectral Class           240 non-null    object 
dtypes: float64(3), int64(2), object(3)
memory usage: 15.1+ KB


In [5]:
star.describe()

Unnamed: 0,Temperature (K),Luminosity (L/Lo),Radius (R/Ro),Absolute magnitude (Mv),Star type
count,240.0,240.0,240.0,240.0,240.0
mean,10497.4625,107188.361635,237.157781,4.382396,2.5
std,9552.425037,179432.24494,517.155763,10.532512,1.711394
min,1939.0,8e-05,0.0084,-11.92,0.0
25%,3344.25,0.000865,0.10275,-6.2325,1.0
50%,5776.0,0.0705,0.7625,8.313,2.5
75%,15055.5,198050.0,42.75,13.6975,4.0
max,40000.0,849420.0,1948.5,20.06,5.0


In [6]:
# number of categories
star[['Spectral Class']].value_counts()

Spectral Class
M                 111
B                  46
O                  40
A                  19
F                  17
K                   6
G                   1
Name: count, dtype: int64

In [7]:
star.columns = star.columns.str.strip()  # Remove any leading/trailing spaces


In [8]:
# Define the 3-class mapping
spectral_mapping = {
    'O': 0,  # Blue Giants
    'B': 0,  # Blue Giants
    'A': 1,  # Solar Analogs
    'F': 1,  # Solar Analogs
    'G': 1,  # Solar Analogs
    'K': 2,  # Cool Dwarfs
    'M': 2   # Cool Dwarfs
}

# Apply the mapping to the target column
star['Spectral_Group'] = star['Spectral Class'].map(spectral_mapping)

# Verify the transformation
print(star['Spectral_Group'].value_counts())

Spectral_Group
2    117
0     86
1     37
Name: count, dtype: int64


In [9]:
star[['Spectral_Group']].value_counts()

Spectral_Group
2                 117
0                  86
1                  37
Name: count, dtype: int64

In [10]:
# number of categories
star[['Star type']].value_counts()

Star type
0            40
1            40
2            40
3            40
4            40
5            40
Name: count, dtype: int64

In [11]:
star[['Star color']].value_counts()

Star color        
Red                   112
Blue                   55
Blue-white             26
Blue White             10
yellow-white            8
White                   7
Blue white              3
white                   3
Yellowish White         3
Whitish                 2
yellowish               2
Orange                  2
Blue                    1
White-Yellow            1
Orange-Red              1
Yellowish               1
Blue-White              1
Blue white              1
Pale yellow orange      1
Name: count, dtype: int64

In [12]:
# Normalize color names by making lowercase and removing spaces
star['Star color'] = star['Star color'].str.lower().str.strip()

# Define categories
color_mapping = {
    'red': 0,
    'orange': 2,  # Group similar shades
    'yellow': 2,
    'white': 3,  'white ': 3,
    'blue': 4, 'blue ': 4,
}

# Apply encoding
star['Star color'] = star['Star color'].replace(color_mapping)


In [13]:
# number of categories
star[['Star color']].value_counts()

Star color        
0                     112
4                      56
blue-white             27
blue white             14
3                      10
yellow-white            8
yellowish               3
yellowish white         3
2                       2
whitish                 2
orange-red              1
pale yellow orange      1
white-yellow            1
Name: count, dtype: int64

In [14]:
from sklearn.preprocessing import StandardScaler


# Define the target and features
y = star['Spectral_Group']
X = star[['Temperature (K)', 'Luminosity (L/Lo)', 'Radius (R/Ro)', 'Absolute magnitude (Mv)']]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the features and transform them
X_scaled = scaler.fit_transform(X)

# Convert the scaled features back into a DataFrame for better readability
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Show the scaled features
print(X_scaled_df)


     Temperature (K)  Luminosity (L/Lo)  Radius (R/Ro)  \
0          -0.779382          -0.598624      -0.459210   
1          -0.782110          -0.598624      -0.459241   
2          -0.828477          -0.598624      -0.459342   
3          -0.807496          -0.598624      -0.459229   
4          -0.897819          -0.598624      -0.459340   
..               ...                ...            ...   
235         2.983743           1.494720       2.167974   
236         2.133913           4.059319       1.854068   
237        -0.175029           2.403157       2.297800   
238        -0.132438           1.662878       1.695177   
239         2.872754           1.048345       2.995370   

     Absolute magnitude (Mv)  
0                   1.116745  
1                   1.162414  
2                   1.362213  
3                   1.167171  
4                   1.491607  
..                       ...  
235                -1.361718  
236                -1.428317  
237                -1.43

In [15]:
# split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 290)

In [16]:
print(star.columns)

Index(['Temperature (K)', 'Luminosity (L/Lo)', 'Radius (R/Ro)',
       'Absolute magnitude (Mv)', 'Star type', 'Star category', 'Star color',
       'Spectral Class', 'Spectral_Group'],
      dtype='object')


In [17]:
from sklearn.utils.class_weight import compute_class_weight

# Calculate class weights
classes = star['Spectral_Group'].unique()  # Get unique classes
class_weights = compute_class_weight(
    class_weight='balanced',  # Automatically balances weights
    classes=classes,          # List of unique classes
    y=star['Spectral_Group']  # Target column
)

# Convert to a dictionary for model input
class_weight_dict = dict(zip(classes, class_weights))

print("Class Weights:", class_weight_dict)

Class Weights: {2: 0.6837606837606838, 0: 0.9302325581395349, 1: 2.1621621621621623}


In [18]:
from xgboost import XGBClassifier

# Train XGBoost with class weights
model = XGBClassifier(
    objective='multi:softmax',  # For multi-class classification
    num_class=len(classes),     # Number of classes
    use_label_encoder=False,    # Disable label encoder
    eval_metric='mlogloss',     # Multi-class log loss
    scale_pos_weight=None       # Not used for multi-class
)

# Fit the model with class weights
model.fit(X_train, y_train, sample_weight=star['Spectral Class'].map(class_weight_dict))

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# predict
y_pred = model.predict(X_test)

In [None]:
y_pred

In [None]:
# import function
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
star.describe()

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-validation accuracy: {scores.mean():.2f} ± {scores.std():.2f}')


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')
print(f'Stratified CV Accuracy: {scores.mean():.2f} ± {scores.std():.2f}')


transform the  .ipynb file into a .pkl file (model)

In [None]:
import pickle

# Save the trained model
with open('star_classification_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the fitted scaler
with open('star_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Model and scaler exported successfully.")
