In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#URL of the webpage containing the data
url = "https://exoplanetarchive.ipac.caltech.edu/data/ExoData/0000/0000522/data/UID_0000522_RVC_001.tbl"

#Send a GET request to the URL
response = requests.get(url)

In [3]:
#Check if the request was successful 
if response.status_code == 200:
    #Parse the HTML content using  BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    #Extract the data from the webpage
    data = soup.get_text()
    
    #Split the data into rows
    rows = data.split("\n")
    
    #Ignore the first 19 rows (Metadata)
    relevant_rows = rows[22:]
    
    #Create a DataFrame from the relevant data
    df = pd.DataFrame([row.split() for row in relevant_rows])
    
    #Save the DataFrame to a local CSV file
    df.to_csv("data.csv", index = False)
    print("Data saved successfully.")
    
else:
    print("Failed to retrieve data from the webpage")

Data saved successfully.


Now we have succesfully got a csv file in a semi-decent form, however we want to remove the first row comepletly, and do some parsing of a csv or txt file to make into a table form 

In [17]:
# Read the CSV file into a DataFrame, skipping the first row
df = pd.read_csv('data.csv', skiprows=1, header=None)

# Split the data in each row by commas and expand it into separate columns, while assigning column names
df = df[0].str.split(',', expand=True).rename(columns={0: 'Column1', 1: 'Column2', 2: 'Column3'})

# Drop any empty rows
df = df.dropna()

# Convert the columns to numeric types if needed
df['Column1'] = pd.to_numeric(df['Column1'])
df['Column2'] = pd.to_numeric(df['Column2'])
df['Column3'] = pd.to_numeric(df['Column3'])

# Reset the index
df = df.reset_index(drop=True)

# Optional: Save the structured data to a new CSV file
df.to_csv('structured_data.csv', index=False)


AttributeError: Can only use .str accessor with string values!

In [88]:
import pandas as pd
import numpy as np

# Read the CSV file into a DataFrame, skipping the first row
df = pd.read_csv('data.csv', skiprows=1, header=None)

# Check if the first column contains string values, if not, convert it to string
if not df[0].dtype == object:
    df[0] = df[0].astype(str)

# Split the data in each row by commas and expand it into separate columns, while assigning column names
df = df[0].str.split(',', n=3, expand=True)

# Filter out empty rows
df = df[~df.apply(lambda row: row.str.strip().eq('').all(), axis=1)]

# Filter out rows with the wrong number of columns
expected_num_columns = 3
df = df[df.apply(lambda row: len(row) == expected_num_columns, axis=1)]

# Ensure that all columns have numeric data type
df = df.apply(pd.to_numeric, errors='coerce')

# Reset the index
df = df.reset_index(drop=True)

# Optional: Save the structured data to a new CSV file
df.to_csv('structured_data.csv', index=False)


In [30]:
import csv
import pandas as pd

# Read the CSV file and store non-empty rows in a list
rows = []
with open('data.csv', 'r') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        if any(row):
            rows.append(row)

# Convert the list of rows to a DataFrame
df = pd.DataFrame(rows)

# Ensure that the DataFrame has three columns
if df.shape[1] != 3:
    raise ValueError("Invalid number of columns in the DataFrame.")

# Rename the columns
df.columns = ['Column1', 'Column2', 'Column3']

# Convert the columns to numeric types if needed
df['Column1'] = pd.to_numeric(df['Column1'])
df['Column2'] = pd.to_numeric(df['Column2'])
df['Column3'] = pd.to_numeric(df['Column3'])

# Reset the index
df = df.reset_index(drop=True)

# Save the structured data to a new CSV file
df.to_csv('structured_data1.csv', index=False)


In [35]:
import csv
import pandas as pd

# Read the CSV file and store non-empty rows in a list
rows = []
with open('data.csv', 'r') as file:
    csv_reader = csv.reader(file)
    skip_first_row = True  # Flag to skip the first row
    for row in csv_reader:
        if skip_first_row:
            skip_first_row = False
            continue
        if any(row):
            rows.append(row)

# Convert the list of rows to a DataFrame
df = pd.DataFrame(rows)

# Ensure that the DataFrame has three columns
if df.shape[1] != 3:
    raise ValueError("Invalid number of columns in the DataFrame.")

# Rename the columns
df.columns = ['Column1', 'Column2', 'Column3']

# Convert the columns to numeric types if needed
df['Column1'] = pd.to_numeric(df['Column1'])
df['Column2'] = pd.to_numeric(df['Column2'])
df['Column3'] = pd.to_numeric(df['Column3'])

# Reset the index
df = df.reset_index(drop=True)

# Save the structured data to a new CSV file
df.to_csv('structured_data3.csv', index=False)


In [4]:
import csv
import pandas as pd

# Read the CSV file and store non-empty rows in a list
rows = []
with open('data.csv', 'r') as file:
    csv_reader = csv.reader(file)
    skip_first_row = True  # Flag to skip the first row
    for row in csv_reader:
        if skip_first_row:
            skip_first_row = False
            continue
        if any(row):
            rows.append(row)

# Convert the list of rows to a DataFrame
df = pd.DataFrame(rows)

# Ensure that the DataFrame has three columns
if df.shape[1] != 3:
    raise ValueError("Invalid number of columns in the DataFrame.")

# Rename the columns
df.columns = ['JD double days', 'RV double m/s', 'RV uncertainty double m/s']

# Convert the columns to numeric types if needed
df['JD double days'] = pd.to_numeric(df['JD double days'])
df['RV double m/s'] = pd.to_numeric(df['RV double m/s'])
df['RV uncertainty double m/s'] = pd.to_numeric(df['RV uncertainty double m/s'])

# Reset the index
df = df.reset_index(drop=True)

# Save the structured data to a new CSV file
df.to_csv('structured_data.csv', index=False)
