### Importing libraries

In [73]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import re

### Importing the dataset

In [74]:
path ='/Users/maryrosejofelvillacampa/Desktop/datasets/astronomical_data.csv'
df = pd.read_csv(path)
df = pd.DataFrame(df)
df

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3042,0.0005,0.1542,16.6,0,Red,M
1,2600,0.0003,0.102,18.7,0,Red,M
2,2800,0.0002,,16.65,0,Red,M
3,1939,0.000138,0.103,20.06,0,Red,M
4,2840,,0.11,16.98,0,Red,M
...,...,...,...,...,...,...,...
234,38940,374830,1356,-9.93,5,Blue,O
235,30839,834042,1194,-10.63,5,Blue,O
236,8829,537493,1423,-10.73,5,White,A
237,9235,404940,1112,-11.23,5,White,A


### Data exploration

In [75]:
df[["Temperature (K)", "Luminosity(L/Lo)", "Radius(R/Ro)", "Absolute magnitude(Mv)", "Star type"]] = \
df[["Temperature (K)", "Luminosity(L/Lo)", "Radius(R/Ro)", "Absolute magnitude(Mv)", "Star type"]].apply(pd.to_numeric, errors='coerce')
df["Star color"] = pd.Categorical(df["Star color"])
df["Spectral Class"] = pd.Categorical(df["Spectral Class"])

df["Star color"] = df["Star color"].replace("\s", np.nan, regex=True)
df["Spectral Class"] = df["Spectral Class"].replace("\s", np.nan, regex=True)

shape = df.shape
print(f"There are {shape[0]} rows by {shape[1]} columns.")

dupes = df.duplicated().sum()
if dupes == 0:
    print("There are no duplicated rows in the dataset.")
else:
    print(f"There are {dupes} duplicated rows in the dataset")

print("The number of empty cells are listed below per feature.")
nulls = df.isnull().sum()
print(nulls)


There are 239 rows by 7 columns.
There are no duplicated rows in the dataset.
The number of empty cells are listed below per feature.
Temperature (K)           2
Luminosity(L/Lo)          6
Radius(R/Ro)              7
Absolute magnitude(Mv)    5
Star type                 0
Star color                6
Spectral Class            2
dtype: int64


### Data preprocessing

#### Data cleaning

##### Temperature (K)

In [76]:
mean_temp = df["Temperature (K)"].mean()
df["Temperature (K)"] = df["Temperature (K)"].fillna(mean_temp)

null_temp = df["Temperature (K)"].isnull().sum()
if null_temp == 0:
    print("There are zero null values in the Temperature column")
else:
    print(f"There are {null_temp} empty cells in the column.")

There are zero null values in the Temperature column


##### Luminosity (L/Lo)

In [77]:
mean_lum = df["Luminosity(L/Lo)"].mean()
df["Luminosity(L/Lo)"] = df["Luminosity(L/Lo)"].fillna(mean_lum)

null_lum = df["Luminosity(L/Lo)"].isnull().sum()
if null_lum == 0:
    print("There are zero null values in the Luminosity column")
else:
    print(f"There are {null_temp} empty cells in the column.")

There are zero null values in the Luminosity column


##### Absolute magnitude (Mv)

In [78]:
mean_mv = df["Absolute magnitude(Mv)"].mean()
df["Absolute magnitude(Mv)"] = df["Absolute magnitude(Mv)"].fillna(mean_mv)

null_mv = df["Absolute magnitude(Mv)"].isnull().sum()
if null_mv == 0:
    print("There are zero null values in the Absolute magnitude column.")
else:
    print(f"There are {null_mv} empty cells in the column.")

There are zero null values in the Absolute magnitude column.


##### Radius (R/Ro)

In [79]:
mean_rad = df["Radius(R/Ro)"].mean()
df["Radius(R/Ro)"] = df["Radius(R/Ro)"].fillna(mean_rad)

null_rad = df["Radius(R/Ro)"].isnull().sum()
if null_rad == 0:
    print("There are zero null values in the Radius column.")
else:
    print(f"There are {null_rad} empty cells in the column.")

There are zero null values in the Radius column.


##### Spectral Class

In [80]:
df = df.dropna(subset=["Spectral Class"])

In [81]:
nulls = df.isnull().sum()
print(nulls)

Temperature (K)           0
Luminosity(L/Lo)          0
Radius(R/Ro)              0
Absolute magnitude(Mv)    0
Star type                 0
Star color                6
Spectral Class            0
dtype: int64


In [86]:
print(df["Spectral Class"].unique())
print(df["Spectral Class"].value_counts())

['M', 'B', 'A', 'F', 'O', 'K', 'G']
Categories (8, object): [' ', 'A', 'B', 'F', 'G', 'K', 'M', 'O']
Spectral Class
M    109
B     46
O     39
A     19
F     17
K      6
G      1
       0
Name: count, dtype: int64


In [88]:

df["Spectral Class"] = df["Spectral Class"].replace({
    "M": 0,
    "K": 1,
    "G": 2,
    "F": 3,
    "A": 4,
    "B": 5,
    "O": 6,
})
df["Spectral Class"] = pd.to_numeric(df["Spectral Class"])
df

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3042.0,0.000500,0.154200,16.60,0,3,0
1,2600.0,0.000300,0.102000,18.70,0,3,0
2,2800.0,0.000200,238.393907,16.65,0,3,0
3,1939.0,0.000138,0.103000,20.06,0,3,0
4,2840.0,107266.981895,0.110000,16.98,0,3,0
...,...,...,...,...,...,...,...
234,38940.0,374830.000000,1356.000000,-9.93,5,1,5
235,30839.0,834042.000000,1194.000000,-10.63,5,1,5
236,8829.0,537493.000000,1423.000000,-10.73,5,4,4
237,9235.0,404940.000000,1112.000000,-11.23,5,4,4


##### Star color

In [82]:
df["Star color"] = df["Star color"].replace("\s", np.nan, regex=True) #replacing empty values (white spaces) in cells with NaN

features = ['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)', 'Absolute magnitude(Mv)', 'Spectral Class']

df_missing = df[df["Star color"].isnull()]
df_not_missing = df[df["Star color"].notnull()]

df_not_missing["Star color"] = pd.Categorical(df_not_missing["Star color"]).codes

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(df_not_missing[features], df_not_missing["Star color"])

df_missing["Star color"] = knn.predict(df_missing[features])

df['Star color'] = pd.Categorical(df['Star color'])
df['Star color'] = pd.Categorical(df['Star color'], categories=pd.Categorical(df_not_missing['Star color']).categories)

dfn = pd.concat([df_not_missing, df_missing]).sort_index()

df = pd.DataFrame(dfn)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Star color"] = df["Star color"].replace("\s", np.nan, regex=True) #replacing empty values (white spaces) in cells with NaN
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_not_missing["Star color"] = pd.Categorical(df_not_missing["Star color"]).codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3042.0,0.000500,0.154200,16.60,0,3,M
1,2600.0,0.000300,0.102000,18.70,0,3,M
2,2800.0,0.000200,238.393907,16.65,0,3,M
3,1939.0,0.000138,0.103000,20.06,0,3,M
4,2840.0,107266.981895,0.110000,16.98,0,3,M
...,...,...,...,...,...,...,...
234,38940.0,374830.000000,1356.000000,-9.93,5,1,O
235,30839.0,834042.000000,1194.000000,-10.63,5,1,O
236,8829.0,537493.000000,1423.000000,-10.73,5,4,A
237,9235.0,404940.000000,1112.000000,-11.23,5,4,A


In [83]:
# df["Star color"] = pd.Categorical(df["Star color"])

# df["Star color"] = df["Star color"].replace({
#     0: "Blue",
#     1: "Blue-white",
#     2: "Red",
#     3: "White",
#     4: "Yellow-white",
# })

# df

In [84]:
nulls = df.isnull().sum()
print(nulls)

Temperature (K)           0
Luminosity(L/Lo)          0
Radius(R/Ro)              0
Absolute magnitude(Mv)    0
Star type                 0
Star color                0
Spectral Class            0
dtype: int64


#### Defining features and data splitting

### Data modeling