# Imports
---

In [1]:
import numpy as np 
import pandas as pd

# Cargar Data
___

In [2]:
# Supply explicit column names so the resulting labels contain no spaces
names = [
    'Temperature',
    'Luminosity',
    'Radius',
    'AbsoluteMagnitude',
    'StarType',
    'StarColor',
    'SpectralClass',
]

# header=0 marks the first row of the file as the header row; `names` replaces it
df_data = pd.read_csv('./dataset.csv', names=names, header=0)
df_data

Unnamed: 0,Temperature,Luminosity,Radius,AbsoluteMagnitude,StarType,StarColor,SpectralClass
0,3068,0.002400,0.1700,16.12,0,Red,M
1,3042,0.000500,0.1542,16.60,0,Red,M
2,2600,0.000300,0.1020,18.70,0,Red,M
3,2800,0.000200,0.1600,16.65,0,Red,M
4,1939,0.000138,0.1030,20.06,0,Red,M
...,...,...,...,...,...,...,...
235,38940,374830.000000,1356.0000,-9.93,5,Blue,O
236,30839,834042.000000,1194.0000,-10.63,5,Blue,O
237,8829,537493.000000,1423.0000,-10.73,5,White,A
238,9235,404940.000000,1112.0000,-11.23,5,White,A


# Análisis exploratorio
___

> Comprobar conformidad en los datos (i.e., Dtype) y conocer features

In [3]:
# Print the index range, column names, non-null counts and dtypes of df_data
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Temperature        240 non-null    int64  
 1   Luminosity         240 non-null    float64
 2   Radius             240 non-null    float64
 3   AbsoluteMagnitude  240 non-null    float64
 4   StarType           240 non-null    int64  
 5   StarColor          240 non-null    object 
 6   SpectralClass      240 non-null    object 
dtypes: float64(3), int64(2), object(2)
memory usage: 13.2+ KB


> Conocer las categorías a clasificar

In [4]:
# Distinct values present in the StarType column (the classes to predict)
df_data['StarType'].unique()

array([0, 1, 2, 3, 4, 5])

> Resumen info de variables numéricas

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_data.describe()

Unnamed: 0,Temperature,Luminosity,Radius,AbsoluteMagnitude,StarType
count,240.0,240.0,240.0,240.0,240.0
mean,10497.4625,107188.361635,237.157781,4.382396,2.5
std,9552.425037,179432.24494,517.155763,10.532512,1.711394
min,1939.0,8e-05,0.0084,-11.92,0.0
25%,3344.25,0.000865,0.10275,-6.2325,1.0
50%,5776.0,0.0705,0.7625,8.313,2.5
75%,15055.5,198050.0,42.75,13.6975,4.0
max,40000.0,849420.0,1948.5,20.06,5.0


# Limpieza de datos
___

> Ajustar tipos de datos (Asegurar conformidad)

- Cambio tipos de datos en features incoherentes

> Eliminar Nulos

- Compruebo Nulos

In [6]:
# Count the missing values in each column
df_data.isna().sum()

Temperature          0
Luminosity           0
Radius               0
AbsoluteMagnitude    0
StarType             0
StarColor            0
SpectralClass        0
dtype: int64

- Elimino Nulos (no hace falta: ninguna columna tiene valores nulos)

> Eliminar registros duplicados

- Eliminar duplicados

In [7]:
# Remove rows that are exact duplicates of an earlier row (all columns compared).
# inplace=True mutates df_data directly instead of returning a new frame.
df_data.drop_duplicates(inplace = True)

- Comprobar si algún registro fue eliminado

In [8]:
# Re-inspect the frame: the entry count shows whether drop_duplicates removed any rows
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Temperature        240 non-null    int64  
 1   Luminosity         240 non-null    float64
 2   Radius             240 non-null    float64
 3   AbsoluteMagnitude  240 non-null    float64
 4   StarType           240 non-null    int64  
 5   StarColor          240 non-null    object 
 6   SpectralClass      240 non-null    object 
dtypes: float64(3), int64(2), object(2)
memory usage: 15.0+ KB


> Normalizar variables categóricas

- Comprobar si categorías de cada variable categórica están normalizadas

In [10]:
# Sorted list of the distinct raw StarColor labels
sorted(df_data['StarColor'].unique())

['Blue',
 'Blue ',
 'Blue White',
 'Blue white',
 'Blue white ',
 'Blue-White',
 'Blue-white',
 'Orange',
 'Orange-Red',
 'Pale yellow orange',
 'Red',
 'White',
 'White-Yellow',
 'Whitish',
 'Yellowish',
 'Yellowish White',
 'white',
 'yellow-white',
 'yellowish']

In [11]:
# Sorted list of the distinct SpectralClass labels
sorted(df_data['SpectralClass'].unique())

['A', 'B', 'F', 'G', 'K', 'M', 'O']

- Normalizar variables categóricas (quito espacios en los extremos, separo palabras con '-' y paso todo a minúscula)

In [21]:
# Strip leading and trailing whitespace from every label
df_data['StarColor'] = df_data['StarColor'].str.strip()

In [22]:
# Switch the word separator from spaces to hyphens.
# regex=False pins a literal replacement, so the result does not depend on
# the pandas version's default for `regex`.
df_data['StarColor'] = df_data['StarColor'].str.replace(' ', '-', regex=False)

In [24]:
# Lowercase every label so values that differ only by case collapse together
df_data['StarColor'] = df_data['StarColor'].str.lower()

In [25]:
# Confirm that the labels were normalized
sorted(df_data['StarColor'].unique())

['blue',
 'blue-white',
 'orange',
 'orange-red',
 'pale-yellow-orange',
 'red',
 'white',
 'white-yellow',
 'whitish',
 'yellow-white',
 'yellowish',
 'yellowish-white']

# Exportar dataset clean
___

In [28]:
# Write the cleaned frame to a comma-separated file.
# NOTE(review): to_csv defaults to index=True, so the row index is written as an
# unnamed first column; pass index=False if downstream readers should not receive
# it — confirm against whoever loads dataset_clean.csv.
df_data.to_csv("dataset_clean.csv", sep=',')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=dbb742f7-e9e6-418d-9e02-197ef81a0ca5' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>