# Meteor Project - Download & Parse the data

In this first Jupyter Notebook we are going to download, read in, parse and create dataframe with all meteor data we will work on in the next couple of weeks

## Download the meteor data

In [1]:
import os
import glob
import requests
import zipfile

import numpy as np
import pandas as pd
from tqdm import tqdm

def download_and_extract_zip(url):
    # Create a folder named .temp if it doesn't exist
    temp_folder = "temp"
    if not os.path.exists(temp_folder):
        os.makedirs(temp_folder)

    try:
        # Send a GET request to the URL to download the zip file
        response = requests.get(url)
        response.raise_for_status()

        # Extract the filename from the URL
        filename = os.path.join(temp_folder, os.path.basename(url))

        # Save the downloaded zip file to the temporary folder
        with open(filename, "wb") as file:
            file.write(response.content)

        # Extract the contents of the zip file to the temporary folder
        with zipfile.ZipFile(filename, "r") as zip_ref:
            zip_ref.extractall(temp_folder)

        os.remove(filename)
        
    except Exception as e:
        print(str(e))

# Prepare the download links and download all files
url_list_cams = \
    [f"https://ceres.ta3.sk/iaumdcdb/dataDBs/video_offline/iaumdcCAMSv3_201{n}.csv.zip"
     for n in range(7)]
     
url_list_sonotaco = \
    [f"https://ceres.ta3.sk/iaumdcdb/dataDBs/video_offline/iaumdcSNMv3_S{str(n).zfill(2)}.csv.zip"
     for n in np.arange(8, 23, 1)]

url_list = url_list_cams + url_list_sonotaco

for k in tqdm(url_list):
    download_and_extract_zip(k)

100%|███████████████████████████████████████████| 22/22 [00:14<00:00,  1.56it/s]


## Read in the data and clean up

In [2]:
# Get all csv files
meteor_data_filepaths = glob.glob("temp/*.csv")

In [3]:
# Read in all files and combine them in one single dataframe
def combine_csv_files(filepaths):
    # Initialize an empty DataFrame to store the combined data
    combined_df = pd.DataFrame()

    for filepath in filepaths:
        try:
            # Read each CSV file and append it to the combined DataFrame
            df = pd.read_csv(filepath, delimiter=";")
            combined_df = pd.concat([combined_df, df], ignore_index=True)
        except Exception as e:
            print(f"Error reading file {filepath}: {str(e)}")

    return combined_df

df = combine_csv_files(meteor_data_filepaths)

In [4]:
# Danger zone: we want to check which columns contain NaN values. Now since Jupyter-Lab limits the
# output print, we are going to ... remove this limit. But note: if you want to display now a huge
# dataframe you are going to have a bad time
from IPython.display import display
print(f"Initial number of max columns: {pd.options.display.max_columns}")
print(f"Initial number of max rows: {pd.options.display.max_rows}")

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Initial number of max columns: 20
Initial number of max rows: 60


In [5]:
# Check if any values in each column are NaN
any_nans = df.isna().any()
all_nans = df.isna().all()

# Create a new DataFrame to store the results
nan_df = pd.DataFrame({'Any NaN values': any_nans, 'All NaN values': all_nans})

print(nan_df)

# Link to all values: https://ceres.ta3.sk/iaumdcdb//public/docs/parametersdescription.txt

            Any NaN values  All NaN values
IID                  False           False
DB                   False           False
IC                   False           False
Ano                   True           False
Yr                   False           False
Mn                   False           False
Dayy                 False           False
delta_Dayy            True            True
LS                    True           False
delta_LS              True           False
mv                   False           False
delta_mv              True            True
HB                   False           False
delta_HB              True           False
HM                    True           False
delta_HM              True            True
HE                   False           False
delta_HE              True           False
RA                   False           False
delta_RA             False           False
DECL                 False           False
delta_DECL           False           False
Vi         

In [6]:
# Restore the "rows universe"
pd.set_option('display.max_rows', 10)

In [7]:
# We drop some columns we do not need or can re-compute manually
df.drop(columns=["IID",
                 "DB",
                 "IC",
                 "Ano",
                 "delta_Dayy",
                 "LS", "delta_LS",
                 "HB", "delta_HB",
                 "HM", "delta_HM",
                 "HE", "delta_HE",
                 "delta_RA",
                 "delta_DECL",
                 "Vi", "delta_Vi",
                 "delta_Vg",
                 "delta_Vh",
                 "delta_cZ",
                 "delta_mv",
                 "Qm",
                 "Qa",
                 "cZ",
                 "delta_q",
                 "delta_e",
                 "delta_a1",
                 "delta_a",
                 "delta_Qa",
                 "delta_i",
                 "delta_arg",
                 "delta_nod",
                 "delta_pi",
                 "sh",
                 "Mas", "delta_Mas",
                 "lgM", "delta_lgM",
                 "cor",
                 "crh",
                 "mr", "delta_mr",
                 "Hrf", "delta_Hrf",
                 "LpA", "delta_LpA",
                 "dur"],
       inplace=True)

In [8]:
# Our final dataframe:
df

Unnamed: 0,Yr,Mn,Dayy,mv,RA,DECL,Vg,Vh,q,e,a1,a,i,arg,nod,pi
0,2022,1,1.36141,1.3,65.589,38.787,12.424,37.891,0.889,0.6306,0.4155,2.407,5.7151,221.2093,280.6015,
1,2022,1,1.36365,1.9,313.728,-22.221,12.562,38.041,0.880,0.6458,0.4026,2.484,1.5999,317.0225,100.5960,
2,2022,1,1.37797,1.3,305.452,25.050,14.408,38.657,0.923,0.6775,0.3494,2.862,15.1098,148.0460,280.6173,
3,2022,1,1.39083,0.3,232.850,60.526,39.530,42.131,0.982,0.9676,0.0330,30.309,63.6963,184.5843,280.6299,
4,2022,1,1.39433,-0.1,93.278,24.747,19.045,37.882,0.685,0.7147,0.4162,2.402,0.7777,254.1440,280.6460,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895544,2012,12,31.87218,-0.4,188.337,-8.629,72.131,42.027,0.983,0.9577,0.0430,23.247,172.0427,357.7996,100.4206,
895545,2012,12,31.87269,-1.0,229.675,19.802,58.424,43.313,0.698,1.0564,-0.0807,-12.390,106.3832,115.8545,280.4203,
895546,2012,12,31.87308,-2.3,164.473,13.363,65.079,41.898,0.395,0.9782,0.0551,18.138,164.7169,282.1166,280.4203,
895547,2012,12,31.87690,0.4,175.746,12.941,69.996,43.410,0.715,1.0645,-0.0902,-11.083,160.3604,241.9048,280.4243,


In [9]:
# The longitude of perihelion (here: pi), contains NaN values ... around 50 % are not avaiable. But
# instead of dropping all rows are create 2 separate dataframes for the future. We still can use
# some parameters
df_orbit_compl = df[~df["pi"].isna()].copy()
df_orbit_error = df[df["pi"].isna()].copy()

In [10]:
# Print the resulting number of rows
print(len(df_orbit_compl))
print(len(df_orbit_error))

471582
423967


In [11]:
# Cross check if we still have NaN values in our "healthy" dataframe
print(any(df_orbit_compl.isna().all()))

False


## Store the data for future analysis

In [12]:
# Create a folder for the data
folder = "meteor_data"
if not os.path.exists(folder):
    os.makedirs(folder)
    
# Store the dataframes
df_orbit_compl.to_csv("meteor_data/meteor_compl.csv")
df_orbit_error.to_csv("meteor_data/meteor_error.csv")