# Data pipelines - Exoplanets Exercise

### Objective: perform aggregations on the planets dataset.

In [1]:
import seaborn as sns
# Load seaborn's 'planets' dataset into a DataFrame bound to `planets`;
# every later cell reads (and some modify) this same object.
planets = sns.load_dataset('planets')
# Preview the first rows to see the available columns.
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [2]:
import pandas as pd

# Record how many rows the frame holds before any cleaning.
row_count = len(planets)
print("Number of rows before removing null values:", row_count)

# Drop, in place, every row that has a null in any column, then recount.
# NOTE: because `planets` itself is modified, re-running this cell reports
# the already-cleaned size as the "before" figure.
planets.dropna(inplace=True)
row_count = len(planets)

print("Number of rows after removing null values:", row_count)

Number of rows before removing null values: 1035
Number of rows after removing null values: 498


**Mean** of mass column

In [3]:
# Mean by hand: total of the column divided by the number of rows in the frame.
mass_total = planets['mass'].sum()
row_total = planets.shape[0]
print(f"Calculating mean of mass directly (sum/count): mean = {mass_total/row_total}")

# Same statistic from pandas, for comparison.
print(f"Using inbuilt functions: mean = {planets['mass'].mean()}")

Calculating mean of mass directly (sum/count): mean = 2.5093200401606426
Using inbuilt functions: mean = 2.5093200401606426


**Median** of year column

In [4]:
# Median by hand: sort the values and take the middle of the ordered list.
# With an odd number of values there is a single middle element; with an even
# number the median is the average of the two middle elements. Indexing both
# (n-1)//2 and n//2 covers both cases, since the two indices coincide when n
# is odd.
sorted_years = sorted(planets['year'])
mid_idx = (len(sorted_years) - 1) // 2   # lower-middle position
upper_idx = len(sorted_years) // 2       # upper-middle position (== mid_idx for odd n)
median = (sorted_years[mid_idx] + sorted_years[upper_idx]) / 2

print(f"Calculating median of year directly: median = {median}")
print(f"Using inbuilt functions: median = {planets['year'].median()}")

Calculating median of year directly: median = 2009
Using inbuilt functions: median = 2009.0


**Mode** of number column

In [5]:
# Hand-rolled mode: tally every value, then keep the value(s) seen most often.

column_values = planets['number']

# Frequency table mapping each distinct value to its number of occurrences
value_counts = {}
for value in column_values:
    value_counts[value] = value_counts.get(value, 0) + 1

# Every value whose tally equals the highest tally is a mode (ties are all kept)
max_count = max(value_counts.values())
modes = [value for value, tally in value_counts.items() if tally == max_count]

# Report the hand-computed mode(s) next to the pandas result
print(f"Calculating modes of number directly: modes = {modes}")

print(f"Using inbuilt functions: modes = {column_values.mode().values}")

Calculating modes of number directly: modes = [1]
Using inbuilt functions: modes = [1]


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the mean of mass and the 50% row of year can be checked against
# the hand-computed values in the earlier cells.
planets.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [7]:
# Average orbital period within each detection method.
planets.groupby('method')['orbital_period'].mean()

method
Radial Velocity    837.454878
Transit              2.703390
Name: orbital_period, dtype: float64

In [8]:
# Number of rows per detection method, i.e. how many samples stand behind
# each per-method average.
planets['method'].value_counts()

Radial Velocity    497
Transit              1
Name: method, dtype: int64

Radial velocity gives the more reliable average, because it is computed over 497 planets, whereas the transit average rests on a single planet (the only transit detection left after removing rows with null values).

In [9]:
# Floor each year to the first year of its decade (e.g. 2009 -> 2000).
decades = planets['year'].floordiv(10).mul(10)
# Store the result on the frame so later cells can group by it.
planets['decade'] = decades
planets

Unnamed: 0,method,number,orbital_period,mass,distance,year,decade
0,Radial Velocity,1,269.30000,7.100,77.40,2006,2000
1,Radial Velocity,1,874.77400,2.210,56.95,2008,2000
2,Radial Velocity,1,763.00000,2.600,19.84,2011,2010
3,Radial Velocity,1,326.03000,19.400,110.62,2007,2000
4,Radial Velocity,1,516.22000,10.500,119.47,2009,2000
...,...,...,...,...,...,...,...
640,Radial Velocity,1,111.70000,2.100,14.90,2009,2000
641,Radial Velocity,1,5.05050,1.068,44.46,2013,2010
642,Radial Velocity,1,311.28800,1.940,17.24,1999,1990
649,Transit,1,2.70339,1.470,178.00,2013,2010


In [10]:
# Total the `number` column within every (decade, method) pair.
# NOTE(review): this sums `number` rather than counting rows — confirm that
# is the intended definition of "count" for this exercise.
counts = (
    planets
    .groupby(['decade', 'method'])['number']
    .sum()
)

print("Counts of discovered planets by decade and method:")
counts

Counts of discovered planets by decade and method:


decade  method         
1980    Radial Velocity      1
1990    Radial Velocity     49
2000    Radial Velocity    439
2010    Radial Velocity    374
        Transit              1
Name: number, dtype: int64