In [3]:
import pandas as pd

# Load the dataset
file_path = 'extra_star.csv'  # Replace with your actual file path


In [5]:
# Columns that are needed
required_columns = ['hip', 'dist', 'x0', 'y0', 'z0', 'absmag', 'mag', 'vx', 'vy', 'vz', 'spect']

# Read the CSV file using pandas
df = pd.read_csv(file_path, usecols=required_columns)

In [6]:
# Convert velocities from km/s to parsecs per year
velocity_columns = ['vx', 'vy', 'vz']

    # Assuming df is a pandas DataFrame with the velocity columns in km/s
for col in velocity_columns:
    df[col] = df[col].apply(lambda x: "{:.10f}".format(x * 1.02269e-6))

# Now the 'vx', 'vy', 'vz' columns will have values as strings formatted to 10 decimal places without using scientific notation.
df.head() 

Unnamed: 0,hip,dist,x0,y0,z0,absmag,mag,vx,vy,vz,spect
0,0,0.0,5e-06,0.0,0.0,4.85,-26.7,0.0,0.0,0.0,G
1,0,509.1956,506.856,0.003,-48.754,0.705,9.239,-2.03556e-05,5.55382e-05,-2.60684e-05,G
2,1,219.7802,219.741,0.003,4.177,2.39,9.1,3.78e-08,-5.5409e-06,-2.0024e-06,F
3,0,817.1342,696.482,0.041,427.341,-0.548,9.013,4.38018e-05,3.67054e-05,4.1981e-06,K
4,2,37.3164,35.176,0.003,-12.456,6.411,9.27,6.5196e-06,3.25287e-05,-2.5097e-06,K


In [7]:
# Extract the first character of the spectral type (OBAFGKM)
df['spect'] = df['spect'].str.extract(r'([OBAFGKM])')


In [26]:
# Save the cleaned and processed data to a new CSV file
output_file_path = 'ex_star.csv'
df.to_csv(output_file_path, index=False)

In [4]:
import pandas as pd
import numpy as np

In [28]:
# Load the CSV file
df = pd.read_csv('exoplanet.csv', skiprows = 96)

# Check and remove duplicate planet names in 'pl_name'
df_cleaned = df.drop_duplicates(subset=['pl_name'])

In [37]:
# If the spectral type is NaN, use 'F' as default
df_cleaned['color'] = df_cleaned['st_spectype'].astype(str).str[0].fillna('F')

# Assuming 'hostname' or 'pl_name' might give an indication of the star system which might include HIP numbers
# hip_numbers = df['hostname']  # If this column contains Hipparcos numbers

# For position, converting RA, Dec, and Distance to XYZ is a bit complex and requires astrophysical calculations
# This is just to illustrate extracting the columns; converting RA/Dec/Distance to XYZ is not covered here
ra = df_cleaned['ra']
dec = df_cleaned['dec']
distance = df_cleaned['sy_dist']
color = df_cleaned['color']
# Since velocity isn't directly mentioned, we cannot extract it without knowing the exact column or having the data

# Get the maximum and minimum values of the column
max_value = df_cleaned['pl_bmasse'].max()
min_value = df_cleaned['pl_bmasse'].min()

# Scale the values between 0 and 1
df_cleaned['pl_bmasse_scaled'] = (df_cleaned['pl_bmasse'] - min_value) / (max_value - min_value)
size = df_cleaned['pl_bmasse_scaled']

# Example of what you might do with the extracted data, showing the head of the dataframe
print(color.head())
# print(hip_numbers.head())  # Uncomment if 'hostname' or 'pl_name' is used for HIP
print(ra.head())
print(dec.head())
print(distance.head())
print(size.head())

0     n
3     n
6     K
9     K
17    n
Name: color, dtype: object
0     185.178779
3     229.274595
6     352.824150
9     242.602101
17    295.465642
Name: ra, dtype: float64
0     17.793252
3     71.823943
6     39.235837
9     43.816362
17    50.516824
Name: dec, dtype: float64
0      93.1846
3     125.3210
6      75.4392
9      17.9323
17     21.1397
Name: sy_dist, dtype: float64
0     0.213742
3     0.184250
6     0.044487
9     0.057997
17    0.021899
Name: pl_bmasse_scaled, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['color'] = df_cleaned['st_spectype'].astype(str).str[0].fillna('F')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['pl_bmasse_scaled'] = (df_cleaned['pl_bmasse'] - min_value) / (max_value - min_value)


In [38]:
# Convert RA, Dec, and distance to Cartesian coordinates (X, Y, Z) in parsecs
# The equations for conversion are as follows:
# X = distance * cos(Dec) * cos(RA)
# Y = distance * cos(Dec) * sin(RA)
# Z = distance * sin(Dec)
# Note: RA and Dec need to be converted from degrees to radians for the trigonometric functions

def ra_dec_dist_to_xyz(ra, dec, distance):
    # Convert from degrees to radians
    ra_rad = np.radians(ra)
    dec_rad = np.radians(dec)
    
    # Calculate X, Y, Z
    X = distance * np.cos(dec_rad) * np.cos(ra_rad)
    Y = distance * np.cos(dec_rad) * np.sin(ra_rad)
    Z = distance * np.sin(dec_rad)
    
    return X, Y, Z

In [43]:
# Apply the conversion to each row in the DataFrame
df_cleaned['X'], df_cleaned['Y'], df_cleaned['Z'] = ra_dec_dist_to_xyz(df_cleaned['ra'], df_cleaned['dec'], df_cleaned['sy_dist'])

df_cleaned[['pl_name','ra', 'dec', 'sy_dist', 'X', 'Y', 'Z', 'pl_bmasse', 'color']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['X'], df_cleaned['Y'], df_cleaned['Z'] = ra_dec_dist_to_xyz(df_cleaned['ra'], df_cleaned['dec'], df_cleaned['sy_dist'])


Unnamed: 0,pl_name,ra,dec,sy_dist,X,Y,Z,pl_bmasse,color
0,11 Com b,185.178779,17.793252,93.1846,-88.364958,-8.008843,28.475644,5434.70000,n
3,11 UMi b,229.274595,71.823943,125.3210,-25.505212,-29.625963,119.067794,4684.81420,n
6,14 And b,352.824150,39.235837,75.4392,57.973692,-7.298956,47.716341,1131.15130,K
9,14 Her b,242.602101,43.816362,17.9323,-5.954231,-11.487907,12.415415,1474.67000,K
17,16 Cyg B b,295.465642,50.516824,21.1397,5.779530,-12.135760,16.315860,556.83537,n
...,...,...,...,...,...,...,...,...,...
35860,ups And b,24.198353,41.403815,13.4054,9.171439,4.121491,8.865820,216.12440,F
35867,ups And c,24.198353,41.403815,13.4054,9.171439,4.121491,8.865820,4443.24113,F
35873,ups And d,24.198353,41.403815,13.4054,9.171439,4.121491,8.865820,1303.09647,n
35879,ups Leo b,174.237219,-0.823564,52.5973,-52.326075,5.280750,-0.756003,162.09249,G


In [1]:
# Selecting only the required columns along with 'pl_name'
df_final = df_cleaned[['pl_name','ra', 'dec', 'sy_dist', 'X', 'Y', 'Z', 'pl_bmasse', 'color']]
# Save the final cleaned data to a new CSV file
df_final.to_csv('ex_planet.csv', index=False)

NameError: name 'df_cleaned' is not defined

In [5]:
# Load the CSV file
df = pd.read_csv('exx.csv', skiprows = 9)

df

Unnamed: 0,hostname,hip_name,sy_pnum
0,11 Com,HIP 60202,1
1,11 Com,HIP 60202,1
2,11 Com,HIP 60202,1
3,11 UMi,HIP 74793,1
4,11 UMi,HIP 74793,1
...,...,...,...
35891,ups And,HIP 7513,3
35892,ups Leo,,1
35893,xi Aql,HIP 97938,1
35894,xi Aql,HIP 97938,1


In [6]:
# Drop rows with NaN hip_name values
df.dropna(subset=['hip_name'], inplace=True)

# Extract numbers from the hip_name column
df['hip_name'] = df['hip_name'].str.extract(r'(\d+)')

In [9]:
# Sort the DataFrame by 'hip_name'
df.sort_values(by='hip_name', inplace=True)

In [10]:
df

Unnamed: 0,hostname,hip_name,sy_pnum
574,GJ 3138,10037,3
573,GJ 3138,10037,3
572,GJ 3138,10037,3
2040,HD 13189,10085,1
2041,HD 13189,10085,1
...,...,...,...
2595,HD 192310,99825,2
2599,HD 192699,99894,1
2600,HD 192699,99894,1
2598,HD 192699,99894,1


In [24]:
# Fill NaN values in st_spectype column with 'F'
df['st_spectype'].fillna('F', inplace=True)

In [25]:
df

Unnamed: 0,pl_name,hip_name,st_spectype
574,GJ 3138 d,10037,M0
573,GJ 3138 c,10037,M0
572,GJ 3138 b,10037,M0
2040,HD 13189 b,10085,F
2041,HD 13189 b,10085,F
...,...,...,...
2597,HD 192310 c,99825,F
2598,HD 192699 b,99894,F
2599,HD 192699 b,99894,F
2600,HD 192699 b,99894,F


In [11]:
# Remove duplicates and keep the first occurrence of each hip_name
df.drop_duplicates(subset=['hip_name'], keep='first', inplace=True)

In [12]:
df

Unnamed: 0,hostname,hip_name,sy_pnum
574,GJ 3138,10037,3
2040,HD 13189,10085,1
2603,HD 195019,100970,1
33914,TOI-262,10117,1
5662,KELT-9,101252,1
...,...,...,...
797,GJ 9689,99699,1
2587,HD 192263,99711,1
3785,HIP 99770,99770,1
2596,HD 192310,99825,2


In [13]:
df

Unnamed: 0,hostname,hip_name,sy_pnum
574,GJ 3138,10037,3
2040,HD 13189,10085,1
2603,HD 195019,100970,1
33914,TOI-262,10117,1
5662,KELT-9,101252,1
...,...,...,...
797,GJ 9689,99699,1
2587,HD 192263,99711,1
3785,HIP 99770,99770,1
2596,HD 192310,99825,2


In [15]:
# Keep only the first letter of st_spectype column
df['st_spectype'] = df['st_spectype'].str[0]

In [16]:
df

Unnamed: 0,pl_name,hip_name,st_spectype
574,GJ 3138 d,10037,M
2040,HD 13189 b,10085,F
2609,HD 195019 b,100970,F
33914,TOI-262 b,10117,F
5658,KELT-9 b,101252,F
...,...,...,...
797,GJ 9689 b,99699,M
2589,HD 192263 b,99711,K
3785,HIP 99770 b,99770,A
2594,HD 192310 b,99825,F


In [14]:
# Save the final DataFrame to a new CSV file
df.to_csv('final_exx.csv', index=False)

In [29]:
import json

# Load the original JSON file
with open('starsGOinfo.json', 'r') as f:
    data = json.load(f)

# Get the list of stars
stars = data["stars"]

# Split the stars into two equal parts
half_length = len(stars) // 2
part1 = {"stars": stars[:half_length]}
part2 = {"stars": stars[half_length:]}

# Write the first part to a new JSON file
with open('part1.json', 'w') as f:
    json.dump(part1, f)

# Write the second part to another JSON file
with open('part2.json', 'w') as f:
    json.dump(part2, f)



In [30]:
# Load part 1 JSON file
with open('part1.json', 'r') as f:
    part1_data = json.load(f)

# Load part 2 JSON file
with open('part2.json', 'r') as f:
    part2_data = json.load(f)

# Count the number of stars in each part
part1_star_count = len(part1_data["stars"])
part2_star_count = len(part2_data["stars"])

print("Number of stars in part 1:", part1_star_count)
print("Number of stars in part 2:", part2_star_count)

Number of stars in part 1: 53762
Number of stars in part 2: 53762


In [32]:
import json

# Load the original JSON file
with open('starsGOinfo.json', 'r') as f:
    data = json.load(f)

# Write the data to a new JSON file with reduced overhead
with open('reduced_starsGOinfo.json', 'w') as f:
    json.dump(data, f, separators=(',', ':'))
