# HELIX - Helium Exoplanet Literature Investigation Xtractor

## Searching the arxiv for helium papers

## First, install the arxiv package.

In [None]:
#installing arxiv
!pip install arxiv==1.3.0


## Before running the code, go to http://exoplanet.eu/catalog/all_fields/, download the catalogue as a CSV file, and save it as exoplanets.csv. This provides the up-to-date parameters for all exoplanets.

## The code below searches arXiv for papers related to helium. It also removes papers about unrelated objects (for example, brown dwarfs or white dwarfs). Run it to find the papers and store them in a file called step_1_helium_5.csv.
## Do not forget to change the starting date.

In [4]:
# This code queries arXiv for papers published on or after the start date and stores the resulting list in step_1_helium_5.csv

import arxiv
import datetime
import re
import csv

# Read exoplanet names from the CSV file
# Every primary and alternate planet name is collected, stripped of
# surrounding whitespace, into one set used later for title/abstract matching.
exoplanet_names = set()
# newline='' lets the csv module handle quoted fields with embedded newlines.
with open('exoplanets.csv', 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # A short row leaves the missing field as None; treat it as empty.
        primary_name = row['name'] or ''
        alternate_names = (row['alternate_names'] or '').split(',')
        for name in [primary_name, *alternate_names]:
            name = name.strip()
            # Planets without alternate names produce an empty string here.
            # Keeping it would make the substring test `name in text` succeed
            # for every paper, so blank entries are dropped.
            if name:
                exoplanet_names.add(name)

# Define the search query
# Restricted to the astro-ph.EP category; matches papers whose title or
# abstract contains "helium", "He", "1083" or "10833".
query = 'cat:astro-ph.EP AND (ti:"helium" OR ti:"He" OR abs:"helium" OR abs:"He" OR ti:"1083" OR ti:"10833" OR abs:"1083" OR abs:"10833")'
start_date = datetime.date(2023, 7, 25) # searched until 2023 Nov 24

# Search for papers
# The sort order is stated explicitly (newest submission first) because the
# early exit in the loop below depends on it.
search = arxiv.Search(
    query=query,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
)
search_results = search.results()

# Filter search results by publication date and keywords
excluded_keywords = ("brown dwarf", "white dwarf")
filtered_results = []
for paper in search_results:
    # Results arrive newest first, so the first paper older than start_date
    # means every remaining one is older too. Stopping here avoids paging
    # through the whole history of matches on the arXiv API.
    if paper.published.date() < start_date:
        break
    # Collapse whitespace before matching: abstracts may contain hard line
    # breaks, which would otherwise hide a keyword split across two lines.
    # Title and abstract are checked separately so that a keyword cannot be
    # assembled across the boundary between them.
    normalized_fields = [
        ' '.join(field.lower().split())
        for field in (paper.title, paper.summary)
    ]
    if any(
        keyword in field
        for keyword in excluded_keywords
        for field in normalized_fields
    ):
        continue
    filtered_results.append(paper)

# Write the results to a CSV file
# Each catalogue name is paired once with its space-free variant (e.g. the
# name with all spaces removed) instead of rebuilding it for every paper.
# Blank names are skipped: an empty string is a substring of every text and
# would be reported as "found" in every paper.
name_variants = [
    (name, name.replace(" ", "")) for name in exoplanet_names if name
]

with open('step_1_helium_5.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['arXiv ID', 'URL', 'Exoplanet Names Found'])

    for paper in filtered_results:
        # Take everything after '/abs/' so that old-style identifiers, which
        # contain a slash (category/number), keep their category prefix.
        arxiv_id = paper.entry_id.split('/abs/')[-1]
        url = f'https://arxiv.org/abs/{arxiv_id}'

        # Find exoplanet names in the title and the abstract.
        # Whitespace is collapsed first because abstracts may contain hard
        # line breaks that would split a multi-word planet name. The two
        # fields stay separate so a name cannot match across their boundary.
        searchable_fields = [
            ' '.join(field.split()) for field in (paper.title, paper.summary)
        ]
        # Sorted so the output is identical from run to run (set iteration
        # order of strings is not stable between interpreter sessions).
        exoplanet_names_found = sorted(
            name
            for name, compact in name_variants
            if any(name in field or compact in field for field in searchable_fields)
        )

        # Write the result to the CSV file
        writer.writerow([arxiv_id, url, ', '.join(exoplanet_names_found)])


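## The code above produces many false positives that are not helium studies. They therefore have to be checked manually, and the planets with actual helium observations shortlisted in the file step_2_helium_shortlisted.csv. The code below expects this file to have a header row, the paper URL in the first column, and the comma-separated planet names in the second column. If you change the file name, please change it in the code below as well.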

## The code below reads the step_2_helium_shortlisted.csv file and arranges the entries by planet (alphabetically) in the file step_3_helium_planets_update.csv.

In [5]:
import csv

# read the data from the exoplanets file and store it in a dictionary
# Maps planet name (first column) -> (headers, values), where both lists
# start at the third column of the catalogue.
exoplanet_data = {}
# newline='' lets the csv module handle quoted fields with embedded newlines.
with open('exoplanets.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    # get the header row; an empty file gives an empty header instead of
    # raising StopIteration
    header = next(reader, [])
    # the headers are the same for every planet, so slice them only once
    planet_headers = header[2:]
    # iterate over the rows in the file
    for row in reader:
        # csv.reader yields [] for blank lines; there is no name to index
        if not row:
            continue
        # use the planet name as the key in the dictionary; stripped because
        # the names it is later compared against are stripped as well
        planet_name = row[0].strip()
        # store the headers and data for the planet
        exoplanet_data[planet_name] = (planet_headers, row[2:])

# create a dictionary to store the planets and corresponding urls
# Maps planet name -> list of values from the first column (the URL) of every
# shortlisted row that mentions the planet.
planet_urls = {}

# read the data from the original file
with open('step_2_helium_shortlisted.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    # skip the header row (tolerating a completely empty file)
    next(reader, None)
    # iterate over the rows in the file
    for row in reader:
        # blank or truncated lines have no planet column to read
        if len(row) < 2:
            continue
        url = row[0]
        # split the planets on commas and trim the surrounding whitespace
        for planet in (p.strip() for p in row[1].split(',')):
            # an empty cell or a trailing comma yields an empty name, which
            # would otherwise become a nameless planet in the output
            if not planet:
                continue
            # create the planet's url list on first sight
            urls = planet_urls.setdefault(planet, [])
            # avoid listing the same url twice for one planet
            if url not in urls:
                urls.append(url)

# create a new file to write the data to
# Output columns: planet name, its urls, eight measurement columns left blank
# for manual entry, then the catalogue columns taken from exoplanet_data.
measurement_headers = ['excess_abs', 'excess_abs_min', 'excess_abs_max', 'upp_lim', 'mass_loss', 'mass_loss_min', 'mass_loss_max', 'mass_loss_upp']
# every catalogue entry carries the same header list, so take it from the
# first one; fall back to no catalogue columns when the catalogue is empty
# (resolved before the output file is opened, so nothing is truncated on error)
first_entry = next(iter(exoplanet_data.values()), None)
catalog_headers = first_entry[0] if first_entry is not None else []

with open('step_3_helium_planets_update.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # write the header row
    planet_headers = ['Planets', 'URL'] + measurement_headers + catalog_headers
    writer.writerow(planet_headers)
    blank_measurements = [''] * len(measurement_headers)
    # planets missing from the catalogue get one blank cell per catalogue
    # column, so every row has the same width as the header
    blank_catalog = [''] * len(catalog_headers)
    # iterate over the planets in alphabetical order
    for planet in sorted(planet_urls):
        entry = exoplanet_data.get(planet)
        planet_data = entry[1] if entry is not None else blank_catalog
        # write the row for the planet
        writer.writerow([planet, ', '.join(planet_urls[planet]), *blank_measurements, *planet_data])


## After this, manually add the excess absorption, its minimum and maximum limits, or the upper limits to the step_3_helium_planets_update.csv file. This is a crucial step. Then manually copy these entries to new rows in helium_planets_for_plots.csv or, if the planets already exist there, manually update the specific columns.

## We will use helium_planets_for_plots.csv to create a separate CSV file for calculations, keeping only its first 10 columns, and store it as helium_planets.csv.

In [None]:
import pandas as pd

# Source table (maintained by hand) and the trimmed copy derived from it
input_file = 'helium_planets_for_plots.csv'
output_file = 'helium_planets.csv'

# Load the full table
df = pd.read_csv(input_file)

# Keep every row but only the ten leading columns (positions 0-9)
leading_positions = slice(0, 10)
selected_columns = df.iloc[:, leading_positions]

# Save the trimmed table without the pandas row index
selected_columns.to_csv(output_file, index=False)