In [1]:
import pandas as pd

# Load the dataset
file_path = 'starfield.csv'  # Replace with your actual file path


In [2]:
# Columns that are needed
required_columns = ['hip', 'dist', 'x0', 'y0', 'z0', 'absmag', 'mag', 'vx', 'vy', 'vz', 'spect']

# Read the CSV file using pandas
df = pd.read_csv(file_path, usecols=required_columns)

In [3]:

# Remove rows with missing values in 'X0', 'Y0', 'Z0', or 'SPECT'
df.dropna(subset=['x0', 'y0', 'z0', 'spect'], inplace=True)



In [4]:
# Convert velocities from km/s to parsecs per year
velocity_columns = ['vx', 'vy', 'vz']

    # Assuming df is a pandas DataFrame with the velocity columns in km/s
for col in velocity_columns:
    df[col] = df[col].apply(lambda x: "{:.10f}".format(x * 1.02269e-6))

# Now the 'vx', 'vy', 'vz' columns will have values as strings formatted to 10 decimal places without using scientific notation.
df.head() 

Unnamed: 0,hip,dist,x0,y0,z0,mag,absmag,vx,vy,vz,spect
0,,0.0,5e-06,0.0,0.0,-26.7,4.85,,,,G2 V
1,,509.1956,506.856,0.003,-48.754,9.239,0.705,-2.03556e-05,5.55382e-05,-2.60684e-05,G8 IV
2,1.0,219.7802,219.741,0.003,4.177,9.1,2.39,3.78e-08,-5.5409e-06,-2.0024e-06,F3 V
13,,817.1342,696.482,0.041,427.341,9.013,-0.548,4.38018e-05,3.67054e-05,4.1981e-06,K2
20,2.0,37.3164,35.176,0.003,-12.456,9.27,6.411,6.5196e-06,3.25287e-05,-2.5097e-06,K3 V


In [5]:
# Extract the first character of the spectral type (OBAFGKM)
df['spect'] = df['spect'].str.extract(r'([OBAFGKM])')


In [20]:
# Save the cleaned and processed data to a new CSV file
output_file_path = 'cleaned_stardataset.csv'
df.to_csv(output_file_path, index=False)

In [6]:
# Now, calculate the specifications
specifications = df.describe().loc[['mean', 'max', 'min']]

# Additionally, to provide more insights, let's include standard deviation to understand the spread of values
specifications.loc['std'] = df.describe().loc['std']

specifications

Unnamed: 0,hip,dist,x0,y0,z0,mag,absmag
mean,30580.730321,452.009663,-52.676267,230.176513,-30.380964,9.179449,1.659476
max,120404.0,82083.9594,23668.645,30790.305,8146.862,13.057,14.265
min,1.0,0.0,-6569.188,-631.323,-78270.936,-26.7,-9.614
std,17674.438359,659.451862,314.080298,344.678502,604.203284,1.12973,1.844378
