In [1]:
# Instalar todas las librerías necesarias para el notebook
!pip install pandas selenium matplotlib openpyxl




[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\sigli\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By 
import matplotlib.pyplot as plt

In [3]:
base_url = "https://nextspaceflight.com/launches/past/"
last_page = 2  # 239 el total
def scrape_details(browser, url): 
    browser.get(url)
    details_grid = browser.find_elements(By.CLASS_NAME, "mdl-grid.a")[1]  
    details = details_grid.text 
    browser.back()
    return details

In [4]:
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")  # Ejecutar en modo headless (sin abrir una ventana del navegador)
browser = webdriver.Chrome(options=options) # generamos un navegador automatizado

launch_data = []
for page in range(1, last_page + 1):  
    browser.get(f"{base_url}?page={page}") # obtenemos el html (hace la petición), sin selenium sería la librería de requests
    cards = browser.find_elements(By.CLASS_NAME, "mdl-card")
    for card in cards:
        company = card.find_element(By.CLASS_NAME, "mdl-card__title-text").text # type() -> string
        rocket_and_payload = card.find_element(By.CLASS_NAME, "header-style").text # type() -> string
        date_and_location = card.find_element(By.CLASS_NAME, "mdl-card__supporting-text").text # type() -> string
        details_url = card.find_element(By.CLASS_NAME, "mdc-button").get_attribute("href") # type() -> string
        details = scrape_details(browser, details_url)
        launch_dict = {'company': company, 
                    'rocket_and_payload': rocket_and_payload, 
                    'date_and_location': date_and_location, 
                    'details_url': details_url,
                       'details': details}
        launch_data.append(launch_dict)
    
browser.quit()

df = pd.DataFrame(launch_data, columns=['company', 'rocket_and_payload', 'date_and_location', 'details_url', 'details'])



In [5]:
#Diccionario para extraer los datos de details
def parse_details(details_text):
    details_dict = {}
    for line in details_text.split('\n'):
        if ':' in line:
            key, value = line.split(':', 1)
            details_dict[key.strip()] = value.strip()
    return details_dict

In [6]:
#Creamos columnas nuev columnas nuevas automáticamente
details_expanded = df['details'].apply(parse_details).apply(pd.Series)
df = pd.concat([df, details_expanded], axis=1)


In [7]:
# Reemplazar NaN, None o 0 por el promedio de cada columna relevante
import re
import numpy as np
def extraer_numero(payload):
    if isinstance(payload, str):
        match = re.search(r'(\d+[\,\.]?\d*)', payload.replace(',', '.'))
        if match:
            try:
                return float(match.group(1))
            except:
                return np.nan
    if payload is None:
        return np.nan
    return payload if isinstance(payload, (int, float)) else np.nan

# Eliminar columnas duplicadas si existen
df = df.loc[:,~df.columns.duplicated()]

#de aqui en adelantev es IA GitHub Copilot

# Extraer y limpiar columnas numéricas
for col in ['Rocket Height', 'Fairing diameter', 'Fairing Height', 'Payload to LEO', 'Payload to GTO']:
    if col in df.columns:
        df[col+'_num'] = df[col].apply(extraer_numero)

# Mostrar antes de reemplazar

for col in ['Rocket Height_num', 'Fairing diameter_num', 'Fairing Height_num', 'Payload to LEO_num', 'Payload to GTO_num']:
    if col in df.columns:
        print(col, df[col].head())

# Reemplazar 0, NaN o None por el promedio de la columna para todas excepto Fairing diameter y Fairing Height
for col in ['Rocket Height_num', 'Payload to LEO_num', 'Payload to GTO_num']:
    if col in df.columns:
        vals = df[col].replace(0, np.nan)
        mean = vals.mean(skipna=True)
        df[col] = vals.fillna(mean)

# Para Fairing diameter y Fairing Height: solo reemplazar NaN por el promedio (no los ceros)
for col in ['Fairing diameter_num', 'Fairing Height_num']:
    if col in df.columns:
        vals = df[col]
        mean = vals.mean(skipna=True)
        df[col] = vals.fillna(mean)

# Mostrar después de reemplazar
print('Después de reemplazar NaN:')
for col in ['Rocket Height_num', 'Fairing diameter_num', 'Fairing Height_num', 'Payload to LEO_num', 'Payload to GTO_num']:
    if col in df.columns:
        print(col, df[col].head())

# Asignar columnas limpias a nombres estándar para el cálculo
df['LEO_kg'] = df['Payload to LEO_num'] if 'Payload to LEO_num' in df.columns else 0
df['GTO_kg'] = df['Payload to GTO_num'] if 'Payload to GTO_num' in df.columns else 0

df['Payload_total_kg'] = df['LEO_kg'] + df['GTO_kg']

# Calcular solo el volumen total del cohete usando las columnas correctas
def volumen_total(row):
    diam = row['Fairing diameter_num'] if 'Fairing diameter_num' in row and not np.isnan(row['Fairing diameter_num']) else 0
    altura_cono = row['Fairing Height_num'] if 'Fairing Height_num' in row and not np.isnan(row['Fairing Height_num']) else 0
    altura_total = row['Rocket Height_num'] if 'Rocket Height_num' in row and not np.isnan(row['Rocket Height_num']) else 0
    altura_cil = altura_total - altura_cono if altura_total > altura_cono else 0
    if diam > 0 and (altura_cil > 0 or altura_cono > 0):
        vol_cil = np.pi * (diam/2)**2 * altura_cil
        vol_cono = (1/3) * np.pi * (diam/2)**2 * altura_cono
        return vol_cil + vol_cono
    return 0

df['Volumen_total_m3'] = df.apply(volumen_total, axis=1)

# Eliminar columnas intermedias de volumen y auxiliares si existen
df.drop(columns=["details", "Payload to LEO", "Payload to GTO", "Volumen_cilindro_m3", "Volumen_cono_m3", 'Rocket Height_num', 'Fairing diameter_num', 'Fairing Height_num', 'Payload to LEO_num', 'Payload to GTO_num'], inplace=True, errors='ignore')
df


Rocket Height_num 0    70.00
1    51.38
2    70.00
3    58.00
4    31.00
Name: Rocket Height_num, dtype: float64
Fairing Height_num 0    13.00
1    15.59
2    13.00
3    12.40
4      NaN
Name: Fairing Height_num, dtype: float64
Payload to LEO_num 0    22.8
1     7.5
2    22.8
3    12.0
4     1.5
Name: Payload to LEO_num, dtype: float64
Payload to GTO_num 0    8.3
1    0.0
2    8.3
3    7.0
4    NaN
Name: Payload to GTO_num, dtype: float64
Después de reemplazar NaN:
Rocket Height_num 0    70.00
1    51.38
2    70.00
3    58.00
4    31.00
Name: Rocket Height_num, dtype: float64
Fairing Height_num 0    13.000000
1    15.590000
2    13.000000
3    12.400000
4    10.959038
Name: Fairing Height_num, dtype: float64
Payload to LEO_num 0    22.8
1     7.5
2    22.8
3    12.0
4     1.5
Name: Payload to LEO_num, dtype: float64
Payload to GTO_num 0    8.300000
1    7.260077
2    8.300000
3    7.000000
4    7.260077
Name: Payload to GTO_num, dtype: float64


Unnamed: 0,company,rocket_and_payload,date_and_location,details_url,Status,Price,Liftoff Thrust,Stages,Strap-ons,Rocket Height,Fairing Diameter,Fairing Height,LEO_kg,GTO_kg,Payload_total_kg,Volumen_total_m3
0,SpaceX,Falcon 9 Block 5 | Nusantara Lima,"Thu Sep 11, 2025 22:56 CLST\nSLC-40, Cape Cana...",https://nextspaceflight.com/launches/details/6907,Active,$69.75 million,"7,607 kN",2,0.0,70.0 m,5.2 m,13.0 m,22.8,8.3,31.1,0
1,Roscosmos,Soyuz 2.1a | Progress MS-32,"Thu Sep 11, 2025 12:54 CLST\nSite 31/6, Baikon...",https://nextspaceflight.com/launches/details/7710,Active,$17.42 million,"4,550 kN",3,4.0,51.38 m,4.11 m,15.59 m,7.5,7.260077,14.760077,0
2,SpaceX,Falcon 9 Block 5 | Tranche 1 Transport Layer B,"Wed Sep 10, 2025 11:12 CLST\nSLC-4E, Vandenber...",https://nextspaceflight.com/launches/details/7219,Active,$69.75 million,"7,607 kN",2,0.0,70.0 m,5.2 m,13.0 m,22.8,8.3,31.1,0
3,CASC,Long March 7A | Yaogan 45,"Mon Sep 8, 2025 23:00 CLST\nLC-201, Wenchang S...",https://nextspaceflight.com/launches/details/7943,Active,,"7,128 kN",3,4.0,58.0 m,4.2 m,12.4 m,12.0,7.0,19.0,0
4,Chinarocket,Jielong 3 | Geely Constellation Group 05,"Mon Sep 8, 2025 16:48 CLST\nOriental Spaceport...",https://nextspaceflight.com/launches/details/7942,Active,,,4,0.0,31.0 m,3.35 m,,1.5,7.260077,8.760077,0
5,SpaceX,Falcon 9 Block 5 | Starlink Group 17-9,"Sat Sep 6, 2025 2:06 PM CLT\nSLC-4E, Vandenber...",https://nextspaceflight.com/launches/details/7935,Active,$69.75 million,"7,607 kN",2,0.0,70.0 m,5.2 m,13.0 m,22.8,8.3,31.1,0
6,CASC,Long March 6A | Yaogan 40 Group 03,"Sat Sep 6, 2025 12:34 PM CLT\nLC-9A, Taiyuan S...",https://nextspaceflight.com/launches/details/7940,Active,,"7,230 kN",2,4.0,50.0 m,4.2 m,5.7 m,5.0,7.260077,12.260077,0
7,SpaceX,Falcon 9 Block 5 | Starlink Group 10-57,"Fri Sep 5, 2025 8:32 AM CLT\nLC-39A, Kennedy S...",https://nextspaceflight.com/launches/details/7934,Active,$69.75 million,"7,607 kN",2,0.0,70.0 m,5.2 m,13.0 m,22.8,8.3,31.1,0
8,Galactic Energy,Ceres 1 | 3 satellites,"Fri Sep 5, 2025 7:39 AM CLT\nSite 95A, Jiuquan...",https://nextspaceflight.com/launches/details/7939,Active,$4.38 million,588 kN,4,0.0,19.0 m,1.4 m,2.5 m,400.0,7.260077,407.260077,0
9,CASC,Long March 3C/YZ-1 | Shiyan 29,"Thu Sep 4, 2025 10:34 PM CLT\nLC-2, Xichang Sa...",https://nextspaceflight.com/launches/details/7938,Active,$20.0 million,"5,923 kN",4,2.0,55.64 m,4.2 m,9.78 m,7.5,3.5,11.0,0
