In [25]:
import glob # Lets you use glob searching for directories and files
import os
import pathlib
import re # Regular expression library

import pandas as pd

In [26]:
# Collect every spfit file from the unzipped asteroid data folder.
# An spfit file holds normalized reflectance values sampled every 10 nanometers of wavelength.
# NOTE(review): core_path is an absolute, machine-specific path and is not used by the glob below.
core_path = "/Users/rishabhranjan/GoogleDrive/Professional/Projects/asteroid-spectra"
spectra_filepaths = glob.glob("data/lvl0/smass2/*spfit*")
spectra_filepaths.sort()  # glob returns matches in no guaranteed order; sort for a deterministic list

In [27]:
# Split the spectra file paths into two groups and put each group in its own dataframe:
#   * "a<digits>.spfit..."  -> asteroid with a designation number (DesNr stored as int)
#   * "au<label>.spfit..."  -> asteroid without a designation number (DesNr stored as the label string)
# The group is decided from the file name itself rather than by slicing off a fixed
# number of trailing entries, so the split stays correct if files are added or removed.
# Dots are escaped so that ".spfit" is matched literally, and only the base name is
# inspected so the result does not depend on the path separator.
des_name_pattern = re.compile(r"^a(\d+)\.spfit")
non_name_pattern = re.compile(r"^au(.+?)\.spfit")

des_records = []
non_records = []
for file_path in spectra_filepaths:
    file_name = os.path.basename(file_path)
    des_match = des_name_pattern.match(file_name)
    non_match = non_name_pattern.match(file_name)
    if des_match:
        des_records.append((file_path, int(des_match.group(1))))
    elif non_match:
        non_records.append((file_path, non_match.group(1)))
    else:
        # Fail loudly instead of silently mis-filing an unexpected file.
        raise ValueError(f"Unrecognised spfit file name: {file_path}")

des_file_paths = [file_path for file_path, _ in des_records]
non_file_paths = [file_path for file_path, _ in non_records]

des_file_paths_df = pd.DataFrame(des_records, columns=["FilePath", "DesNr"])
non_file_paths_df = pd.DataFrame(non_records, columns=["FilePath", "DesNr"])
print(des_file_paths_df)

                                FilePath  DesNr
0     data/lvl0/smass2/a000001.spfit.[2]      1
1     data/lvl0/smass2/a000002.spfit.[2]      2
2     data/lvl0/smass2/a000003.spfit.[2]      3
3     data/lvl0/smass2/a000004.spfit.[2]      4
4     data/lvl0/smass2/a000005.spfit.[2]      5
...                                  ...    ...
1328  data/lvl0/smass2/a011785.spfit.[2]  11785
1329  data/lvl0/smass2/a011906.spfit.[2]  11906
1330  data/lvl0/smass2/a012281.spfit.[2]  12281
1331  data/lvl0/smass2/a017480.spfit.[2]  17480
1332  data/lvl0/smass2/a018514.spfit.[2]  18514

[1333 rows x 2 columns]


In [28]:
# Load the asteroid classification table into a pandas dataframe.
taxonomy_columns = ["Name", "TholenClass", "Bus_Class", "unknown1", "unknown2"]
asteroid_class_df = pd.read_csv(
    "data/lvl0/Bus.Taxonomy.txt",
    sep="\t",
    skiprows=21,  # the first 21 lines are skipped -- presumably a free-text header; TODO confirm
    names=taxonomy_columns,
)
# Strip leading and trailing spaces from every name.
asteroid_class_df["Name"] = asteroid_class_df["Name"].apply(lambda name: name.strip())

# The table is cut at row position 1403: the rows before it go to des_ast_class_df,
# the remaining rows go to non_ast_class_df. Copies are taken so that each part can
# be modified later without touching asteroid_class_df.
# NOTE(review): 1403 is tied to the current contents of Bus.Taxonomy.txt -- re-check if the file changes.
split_position = 1403
des_ast_class_df = asteroid_class_df.iloc[:split_position].copy()
non_ast_class_df = asteroid_class_df.iloc[split_position:].copy()
print(des_ast_class_df)

                 Name TholenClass Bus_Class unknown1 unknown2
0             1 Ceres           G         C      NaN      NaN
1            2 Pallas           B         B      NaN      NaN
2              3 Juno           S        Sk      NaN      NaN
3             4 Vesta           V         V      NaN      NaN
4           5 Astraea           S         S      NaN      NaN
...               ...         ...       ...      ...      ...
1398  17480 1991 PE10         NaN         S      NaN      NaN
1399    17511 1992 QN         NaN         X      NaN      NaN
1400  18514 1996 TE11         NaN        Xc      NaN      NaN
1401   19356 1997 GH3         NaN         S      NaN      NaN
1402   20255 1998 FX2         NaN        Sq      NaN      NaN

[1403 rows x 5 columns]


In [29]:
# Attach the spectrum file path to each classified asteroid by merging on "DesNr".
# Both merges use the default inner join, so rows without a partner on the other side are dropped.

# Numbered asteroids: the key is the integer in front of the first space of the name.
des_ast_class_df.loc[:, "DesNr"] = des_ast_class_df["Name"].apply(
    lambda name: int(name.partition(" ")[0])
)
des_ast_class_join_df = pd.merge(des_ast_class_df, des_file_paths_df, on="DesNr")

# Asteroids without a number: the key is the name with all spaces removed.
non_ast_class_df.loc[:, "DesNr"] = non_ast_class_df["Name"].apply(
    lambda name: "".join(name.split(" "))
)
non_ast_class_join_df = pd.merge(non_ast_class_df, non_file_paths_df, on="DesNr")
print(non_ast_class_join_df)

       Name TholenClass Bus_Class unknown1 unknown2    DesNr  \
0  1995 BM2         NaN        Sq      NaN      NaN  1995BM2   
1  1995 WQ5         NaN        Ch      NaN      NaN  1995WQ5   
2   1996 PW         NaN        Ld      NaN      NaN   1996PW   
3   1996 UK         NaN        Sq      NaN      NaN   1996UK   
4   1996 VC         NaN         S      NaN      NaN   1996VC   
5  1997 CZ5         NaN         S      NaN      NaN  1997CZ5   
6  1997 RD1         NaN        Sq      NaN      NaN  1997RD1   
7   1998 WS         NaN        Sr      NaN      NaN   1998WS   

                               FilePath  
0  data/lvl0/smass2/au1995BM2.spfit.[2]  
1  data/lvl0/smass2/au1995WQ5.spfit.[2]  
2   data/lvl0/smass2/au1996PW.spfit.[2]  
3   data/lvl0/smass2/au1996UK.spfit.[2]  
4   data/lvl0/smass2/au1996VC.spfit.[2]  
5  data/lvl0/smass2/au1997CZ5.spfit.[2]  
6  data/lvl0/smass2/au1997RD1.spfit.[2]  
7   data/lvl0/smass2/au1998WS.spfit.[2]  


In [30]:
# Each asteroid class now sits alongside its spectrum file path, so stack the two
# joined dataframes into a single one and tidy it up.
joined_frames = [des_ast_class_join_df, non_ast_class_join_df]
asteroids_df = pd.concat(joined_frames, axis=0)
print(asteroids_df["FilePath"])  # index still restarts at 0 for the second frame here

asteroids_df = (
    asteroids_df
    .reset_index(drop=True)  # one continuous 0..n-1 index across both frames
    .drop(columns=["TholenClass", "unknown1", "unknown2"])  # columns without meaningful values
    .dropna(subset="Bus_Class")  # discard rows that have no Bus class
)
print(asteroids_df["FilePath"])

0      data/lvl0/smass2/a000001.spfit.[2]
1      data/lvl0/smass2/a000002.spfit.[2]
2      data/lvl0/smass2/a000003.spfit.[2]
3      data/lvl0/smass2/a000004.spfit.[2]
4      data/lvl0/smass2/a000005.spfit.[2]
                     ...                 
3     data/lvl0/smass2/au1996UK.spfit.[2]
4     data/lvl0/smass2/au1996VC.spfit.[2]
5    data/lvl0/smass2/au1997CZ5.spfit.[2]
6    data/lvl0/smass2/au1997RD1.spfit.[2]
7     data/lvl0/smass2/au1998WS.spfit.[2]
Name: FilePath, Length: 1341, dtype: object
0         data/lvl0/smass2/a000001.spfit.[2]
1         data/lvl0/smass2/a000002.spfit.[2]
2         data/lvl0/smass2/a000003.spfit.[2]
3         data/lvl0/smass2/a000004.spfit.[2]
4         data/lvl0/smass2/a000005.spfit.[2]
                        ...                 
1336     data/lvl0/smass2/au1996UK.spfit.[2]
1337     data/lvl0/smass2/au1996VC.spfit.[2]
1338    data/lvl0/smass2/au1997CZ5.spfit.[2]
1339    data/lvl0/smass2/au1997RD1.spfit.[2]
1340     data/lvl0/smass2/au1998WS.spfit.[2]

In [31]:
# Read every spectrum file into its own dataframe and store it in a new "SpectrumDF" column.
spectrum_columns = ["Wavelength_in_micron", "Reflectance_norm550nm"]


def _read_spectrum(file_path):
    """Parse one tab-separated spectrum file into a dataframe with the two spectrum columns."""
    return pd.read_csv(file_path, sep="\t", names=spectrum_columns)


asteroids_df.loc[:, "SpectrumDF"] = asteroids_df["FilePath"].apply(_read_spectrum)
# DesNr holds both integers and strings at this point; cast everything to str.
asteroids_df["DesNr"] = asteroids_df["DesNr"].astype(str)
print(asteroids_df["SpectrumDF"].iloc[0])


    Wavelength_in_micron  Reflectance_norm550nm
0                   0.44                 0.9281
1                   0.45                 0.9388
2                   0.46                 0.9488
3                   0.47                 0.9572
4                   0.48                 0.9643
5                   0.49                 0.9716
6                   0.50                 0.9788
7                   0.51                 0.9859
8                   0.52                 0.9923
9                   0.53                 0.9955
10                  0.54                 0.9969
11                  0.55                 1.0000
12                  0.56                 1.0040
13                  0.57                 1.0056
14                  0.58                 1.0037
15                  0.59                 1.0036
16                  0.60                 1.0044
17                  0.61                 1.0071
18                  0.62                 1.0107
19                  0.63                

In [32]:
# Save the merged dataframe by pickling it into the data folder.
# to_pickle does not create missing folders, so make sure the target directory
# exists first (e.g. on a fresh checkout where data/lvl1 has never been created).
merged_pickle_path = pathlib.Path("data/lvl1/asteroids_merged.pkl")
merged_pickle_path.parent.mkdir(parents=True, exist_ok=True)
asteroids_df.to_pickle(merged_pickle_path)