In [1]:
import pandas as pd
import numpy as np


data = pd.read_csv('Stars.csv')
data.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.0024,0.17,16.12,Red,M,0
1,3042,0.0005,0.1542,16.6,Red,M,0
2,2600,0.0003,0.102,18.7,Red,M,0
3,2800,0.0002,0.16,16.65,Red,M,0
4,1939,0.000138,0.103,20.06,Red,M,0


### Preprocessing / Data cleaning

In [2]:
data.isnull().sum()

Temperature       0
L                 0
R                 0
A_M               0
Color             0
Spectral_Class    0
Type              0
dtype: int64

In [3]:
Color = list(data['Color'].unique())
Color

['Red',
 'Blue White',
 'White',
 'Yellowish White',
 'Blue white',
 'Pale yellow orange',
 'Blue',
 'Blue-white',
 'Whitish',
 'yellow-white',
 'Orange',
 'White-Yellow',
 'white',
 'yellowish',
 'Yellowish',
 'Orange-Red',
 'Blue-White']

In [4]:
Spe_Class = list(data['Spectral_Class'].unique())
Spe_Class

['M', 'B', 'A', 'F', 'O', 'K', 'G']

In [5]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

encoder = LabelEncoder()
data['Color'] = encoder.fit_transform(data['Color'])
data['Spectral_Class'] = encoder.fit_transform(data['Spectral_Class'])


In [6]:
X = data.drop('Type', axis = 1)
y = data['Type']

In [7]:
y.value_counts()

Type
0    40
1    40
2    40
3    40
4    40
5    40
Name: count, dtype: int64

In [8]:
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.25, random_state = 42)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

cohen_kappa = cohen_kappa_score(y_test, y_pred)

metrics = []
metrics.append(["Logistic Regression", accuracy, cohen_kappa])

# Print metrics in a table
metrics_df = pd.DataFrame(metrics, columns=['Classifier', 'Accuracy', 'Cohen Kappa'])
print(metrics_df)

            Classifier  Accuracy  Cohen Kappa
0  Logistic Regression  0.983333     0.979933


In [10]:
# import joblib

# # Save the model to a file
# model_filename = "Stars_model.joblib"
# joblib.dump(model, model_filename)


In [11]:
# # Load the model back
# loaded_model = joblib.load(model_filename)

In [12]:
# y_pre = loaded_model.predict(X_test)
# print(accuracy_score(y_test, y_pre))

In [13]:
data.columns

Index(['Temperature', 'L', 'R', 'A_M', 'Color', 'Spectral_Class', 'Type'], dtype='object')