In [4]:
import pandas as pd
import numpy as np

# HW2 Review

## DataFrames, Loops, and Conditionals for Astronomical Data Analysis

### Overview
This homework will build on the skills learned in Sessions 1 and 2, focusing on using pandas DataFrames and control structures (for loops, if statements, while loops) to analyze astronomical data. You'll work with a dataset of stars and perform various analyses.

### Part 1: Setting Up Your Data

**Task 1.1: Create a Star Catalog DataFrame**

Create a pandas DataFrame containing information about at least 8 stars with the following columns:
- `name` (string): Star name
- `distance_ly` (float): Distance in light-years
- `spectral_type` (string): O, B, A, F, G, K, or M
- `temperature_K` (integer): Surface temperature in Kelvin
- `mass_solar` (float): Mass in solar masses
- `luminosity_solar` (float): Luminosity relative to the Sun
- `has_planets` (boolean): Whether the star has known planets

In [32]:
# This dictionary includes mock data
# Defining the dict according to the instruction

stars_data = {
    'name': ['star1', 'star2', 'star3', 'star4', 'star5'],
    'distance_ly': [582, 39, 4.2, 101.5, 21.3],
    'spectral_type': ["A", "M", "G", "K", "M"],
    'temperature_K': [9600, 2200, 7550, 5000, 3000],
    'mass_solar': [2, 0.3, 1, 0.9, 0.4],
    'luminosity_solar': [40, 0.5, 1, 1.1, 0.7],
    'has_planets': [True, True, True, True, False]
}


In [33]:
# Defining the Pandas DataFrame

stars_df = pd.DataFrame(stars_data)

In [34]:
# Displaying the DataFrame

stars_df

Unnamed: 0,name,distance_ly,spectral_type,temperature_K,mass_solar,luminosity_solar,has_planets
0,star1,582.0,A,9600,2.0,40.0,True
1,star2,39.0,M,2200,0.3,0.5,True
2,star3,4.2,G,7550,1.0,1.0,True
3,star4,101.5,K,5000,0.9,1.1,True
4,star5,21.3,M,3000,0.4,0.7,False


**Task 1.2: Add Calculated Columns**

Add two new columns to your DataFrame:
- `distance_parsecs`: Convert the distance from light-years to parsecs (1 ly ≈ 0.307 parsecs)
- `absolute_magnitude`: Calculate using the formula:
  M = 4.83 - 2.5 * log10(luminosity_solar)

Display the resulting DataFrame.

In [35]:
# Creating a new column

stars_df['distance_parsecs'] = stars_df.distance_ly * 0.307

In [36]:
# Creating a new column

stars_df['absolute_magnitude'] = 4.83 - 2.5*(np.log10(stars_df.luminosity_solar))

In [11]:
stars_df

Unnamed: 0,name,distance_ly,spectral_type,temperature_K,mass_solar,luminosity_solar,has_planets,distance_parsecs,absolute_magnitude
0,star1,582.0,A,9600,2.0,40.0,True,178.674,0.82485
1,star2,39.0,M,2200,0.3,0.5,True,11.973,5.582575
2,star3,4.2,G,7550,1.0,1.0,True,1.2894,4.83
3,star4,101.5,K,5000,0.9,1.1,True,31.1605,4.726518
4,star5,21.3,M,3000,0.4,0.7,False,6.5391,5.217255


### Part 2: Loop-Based Analysis

**Task 2.1: Star Classification Loop**

Write a for loop that iterates through each star in your DataFrame and classifies it based on temperature:
- O stars: > 30,000 K
- B stars: 10,000 - 30,000 K
- A stars: 7,500 - 10,000 K
- F stars: 6,000 - 7,500 K
- G stars: 5,200 - 6,000 K
- K stars: 3,700 - 5,200 K
- M stars: < 3,700 K

Compare your classification with the spectral_type column and print whether they match.

In [37]:
# I define a new empty list
# In the loop, at each iteration, if the temprature meets 
# a category temperature range, I add the spectral type to 
# class_new

class_new = []

for i in range(len(stars_df)):
    if stars_df.temperature_K[i]>30000:
        class_new.append('O')
    elif (stars_df.temperature_K[i]<30000) and (stars_df.temperature_K[i]>10000):
        class_new.append('B')
    elif (stars_df.temperature_K[i]<10000) and (stars_df.temperature_K[i]>7500):
        class_new.append('A')
    elif (stars_df.temperature_K[i]<7500) and (stars_df.temperature_K[i]>6000):
        class_new.append('F')
    elif (stars_df.temperature_K[i]<6000) and (stars_df.temperature_K[i]>5200):
        class_new.append('G')
    elif (stars_df.temperature_K[i]<5200) and (stars_df.temperature_K[i]>3700):
        class_new.append('K')
    else:
        class_new.append('M')

In [38]:
# class_new is added as a new column to the stars_df DataFrame
stars_df['type_new'] = class_new

In [39]:
# Displaying two columns of the DataFrame to compare
stars_df[['spectral_type', 'type_new']]

Unnamed: 0,spectral_type,type_new
0,A,A
1,M,M
2,G,A
3,K,K
4,M,M


**Task 2.2: Star Statistics**

Using loops and conditionals, determine and print:
- The number of stars in each spectral class
- The average mass for each spectral class
- The star(s) with the highest luminosity
- All stars within 20 light-years that have planets

In [40]:
# I define counter as an array of shape (7, )
# each elemnt of counter is a counter for each spectral class
# In the loop I check each star's spectral class and 
# increase its counter by one
# The spectral types are:
# O, B, A, F, G, K, or M

counter = np.zeros((7))

for i in range(len(stars_df)):
    if stars_df.type_new[i]== 'O':
        counter[0] += 1
    elif stars_df.type_new[i] == 'B':
        counter[1] += 1
    elif stars_df.type_new[i] == 'A':
        counter[2] += 1
    elif stars_df.type_new[i] == 'F':
        counter[3] += 1
    elif stars_df.type_new[i] == 'G':
        counter[4] += 1
    elif stars_df.type_new[i] == 'K':
        counter[5] += 1
    else:
        counter[6] += 1
        
print("Number of stars for spectral types O, B, A, F, G, K, and M are: %i, %i, %i, %i, %i, %i, %i"%(counter[0],
                                                                                                    counter[1],
                                                                                                    counter[2],
                                                                                                   counter[3],
                                                                                                   counter[4],
                                                                                                   counter[5],
                                                                                                   counter[6]))

Number of stars for spectral types O, B, A, F, G, K, and M are: 0, 0, 2, 0, 0, 1, 2


In [41]:
# Another way to see how many unique elements there are 
# in a specific column

stars_df.groupby('spectral_type').size()

spectral_type
A    1
G    1
K    1
M    2
dtype: int64

In [43]:
# I define counter and sum_mass as an array of shape (7, )
# In the loop, I check each star's spectral class and 
# increase its counter by one and add its mass to sum_mass
# Finally, I divide sum_mass by counter to get 
# average mass per spectral class

# The spectral types are:
# O, B, A, F, G, K, or M

sum_mass = np.zeros((7))
counter = np.zeros((7))

for i in range(len(stars_df)):
    if stars_df.type_new[i]== 'O':
        counter[0] += 1
        sum_mass[0] += stars_df.mass_solar[i]
    elif stars_df.type_new[i] == 'B':
        counter[1] += 1
        sum_mass[1] += stars_df.mass_solar[i]
    elif stars_df.type_new[i] == 'A':
        counter[2] += 1
        sum_mass[2] += stars_df.mass_solar[i]
    elif stars_df.type_new[i] == 'F':
        counter[3] += 1
        sum_mass[3] += stars_df.mass_solar[i]
    elif stars_df.type_new[i] == 'G':
        counter[4] += 1
        sum_mass[4] += stars_df.mass_solar[i]
    elif stars_df.type_new[i] == 'K':
        counter[5] += 1
        sum_mass[5] += stars_df.mass_solar[i]
    else:
        counter[6] += 1
        sum_mass[6] += stars_df.mass_solar[i]
        
print("Average mass of stars for spectral types O, B, A, F, G, K, and M are: %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f solar mass"%(sum_mass[0]/counter[0],
                                                                                                    sum_mass[1]/counter[1],
                                                                                                    sum_mass[2]/counter[2],
                                                                                                   sum_mass[3]/counter[3],
                                                                                                   sum_mass[4]/counter[4],
                                                                                                   sum_mass[5]/counter[5],
                                                                                                   sum_mass[6]/counter[6]))

Average mass of stars for spectral types O, B, A, F, G, K, and M are: nan, nan, 1.50, nan, nan, 0.90, 0.35 solar mass


  print("Average mass of stars for spectral types O, B, A, F, G, K, and M are: %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f solar mass"%(sum_mass[0]/counter[0],
  sum_mass[1]/counter[1],
  sum_mass[3]/counter[3],
  sum_mass[4]/counter[4],


In [44]:
# Another way of getting the mean of all columns of the stars_df
# per unique element in one column
stars_df.groupby('spectral_type').mean()

  stars_df.groupby('spectral_type').mean()


Unnamed: 0_level_0,distance_ly,temperature_K,mass_solar,luminosity_solar,has_planets,distance_parsecs,absolute_magnitude
spectral_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,582.0,9600.0,2.0,40.0,1.0,178.674,0.82485
G,4.2,7550.0,1.0,1.0,1.0,1.2894,4.83
K,101.5,5000.0,0.9,1.1,1.0,31.1605,4.726518
M,30.15,2600.0,0.35,0.6,0.5,9.25605,5.399915


In [45]:
# Find the star(s) with the highest luminosity

#np.argmax or np.argmin gives us the index of the maximum or minimum value 
# in an array
# We then use that index to find the values of the row in which the maximum of 
# one column exists in

stars_df.name[np.argmax(stars_df.luminosity_solar)]

'star1'

In [46]:
# The same as above using iloc

stars_df.iloc[np.argmax(stars_df.luminosity_solar)]

name                    star1
distance_ly             582.0
spectral_type               A
temperature_K            9600
mass_solar                2.0
luminosity_solar         40.0
has_planets              True
distance_parsecs      178.674
absolute_magnitude    0.82485
type_new                    A
Name: 0, dtype: object

In [47]:
# Find all stars within 20 light-years that have planets

for i in range(len(stars_df)):
    if (stars_df.distance_ly[i]<20) and (stars_df.has_planets[i]):
        print(stars_df.iloc[i])

name                   star3
distance_ly              4.2
spectral_type              G
temperature_K           7550
mass_solar               1.0
luminosity_solar         1.0
has_planets             True
distance_parsecs      1.2894
absolute_magnitude      4.83
type_new                   A
Name: 2, dtype: object
