In [31]:
import os
import glob
import pathlib
import re
import pandas as pd

In [32]:
# The project root is whatever directory the kernel is currently running in.
core_path = os.getcwd()

# Gather every file below data/lvl0/smass2 whose name contains "spfit",
# then order the paths alphabetically.
spfit_pattern = os.path.join(core_path, "data/lvl0/", "smass2/*spfit*")
spectra_filepaths = glob.glob(spfit_pattern)
spectra_filepaths.sort()

In [33]:
# Preview the first eight collected spectrum file paths.
spectra_filepaths[:8]

['e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\a000001.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\a000002.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\a000003.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\a000004.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\a000005.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\a000006.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\a000007.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\a000010.spfit.[2]']

In [34]:
# Preview the last eight collected spectrum file paths.
spectra_filepaths[-8:]

['e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\au1995BM2.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\au1995WQ5.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\au1996PW.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\au1996UK.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\au1996VC.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\au1997CZ5.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\au1997RD1.spfit.[2]',
 'e:\\Projects\\Asteroid_Spectra_Project\\data/lvl0/smass2\\au1998WS.spfit.[2]']

In [35]:
# Split the spectrum files into numbered ("designated") and unnumbered
# ("non-designated") asteroids. The split is driven by the file-name prefix
# instead of a fixed count: unnumbered files are named "au<designation>.spfit...",
# numbered ones "a<digits>.spfit...". The sorted order of each group is preserved.
def _is_non_designated(filepath):
    """Return True when the file name (without directories) starts with "au"."""
    return os.path.basename(filepath).startswith("au")


des_file_paths = [path for path in spectra_filepaths if not _is_non_designated(path)]
non_file_paths = [path for path in spectra_filepaths if _is_non_designated(path)]

# One-column tables holding the file paths of each group.
des_file_paths_df = pd.DataFrame(des_file_paths, columns=["FilePath"])
non_file_paths_df = pd.DataFrame(non_file_paths, columns=["FilePath"])

# Sanity check: show the first path of each group.
print(des_file_paths_df.FilePath.iloc[0])
print(non_file_paths_df.FilePath.iloc[0])

e:\Projects\Asteroid_Spectra_Project\data/lvl0/smass2\a000001.spfit.[2]
e:\Projects\Asteroid_Spectra_Project\data/lvl0/smass2\au1995BM2.spfit.[2]


In [36]:
def _designation_from_path(filepath, prefix):
    """Extract the designation from a spectrum file path.

    The designation is the text between ``prefix`` and ".spfit" in the file
    name. Only the file name is inspected, so the result does not depend on
    the path separator of the operating system.

    Parameters
    ----------
    filepath : str
        Path of a spectrum file, e.g. ".../smass2/a000001.spfit.[2]".
    prefix : str
        Leading characters of the file name: "a" for numbered asteroids,
        "au" for unnumbered ones.

    Returns
    -------
    str
        The captured designation, e.g. "000001" or "1995BM2".

    Raises
    ------
    ValueError
        If the file name does not have the form "<prefix>...spfit".
    """
    filename = os.path.basename(filepath)
    match = re.match(re.escape(prefix) + r"(.*?)\.spfit", filename)
    if match is None:
        raise ValueError(
            f"Cannot extract a designation with prefix {prefix!r} from {filepath!r}"
        )
    return match.group(1)


# For all rows, set the column DesNr to the designation parsed from the file name.
# Numbered asteroids: the zero-padded number is converted to int ("000001" -> 1).
des_file_paths_df.loc[:, "DesNr"] = des_file_paths_df["FilePath"].apply(
    lambda path: int(_designation_from_path(path, "a"))
)
# Unnumbered asteroids: the designation is kept as a string ("1995BM2").
non_file_paths_df.loc[:, "DesNr"] = non_file_paths_df["FilePath"].apply(
    lambda path: _designation_from_path(path, "au")
)

In [37]:
# Show the first rows of the non-designated file table (FilePath and DesNr).
non_file_paths_df.head()

Unnamed: 0,FilePath,DesNr
0,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,1995BM2
1,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,1995WQ5
2,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,1996PW
3,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,1996UK
4,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,1996VC


In [38]:
# Show the first rows of the designated file table (FilePath and DesNr).
des_file_paths_df.head()

Unnamed: 0,FilePath,DesNr
0,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,1
1,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,2
2,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,3
3,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,4
4,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,5


In [39]:
# Load the Bus taxonomy classification table.
taxonomy_path = os.path.join(core_path, "data/lvl0/", "Bus.Taxonomy.txt")

# Five column names are supplied although only three are meaningful: with three
# names the parser reports tokenization errors because it counts five fields.
taxonomy_columns = ["Name", "Tholen Class", "Bus Class", "unknown1", "unknown2"]

asteroid_class_df = pd.read_csv(
    taxonomy_path,
    skiprows=21,   # the first 21 lines of the file are skipped (no table data)
    sep="\t",      # fields are separated by tabs
    names=taxonomy_columns,
)

asteroid_class_df.head(5)

Unnamed: 0,Name,Tholen Class,Bus Class,unknown1,unknown2
0,1 Ceres,G,C,,
1,2 Pallas,B,B,,
2,3 Juno,S,Sk,,
3,4 Vesta,V,V,,
4,5 Astraea,S,S,,


In [40]:
# Remove leading/trailing whitespace from the asteroid names.
# The vectorised string accessor is used so that a missing name stays missing
# instead of raising inside a Python-level lambda.
asteroid_class_df["Name"] = asteroid_class_df["Name"].str.strip()

# Inspect the last eight rows of the table.
asteroid_class_df.tail(8)

Unnamed: 0,Name,Tholen Class,Bus Class,unknown1,unknown2
1439,1998 VR,,Sk,,
1440,1998 VO33,,V,,
1441,1998 WM,,Sq,,
1442,1998 WS,,Sr,,
1443,1998 WZ6,,V,,
1444,1999 EE5,,S,,
1445,1999 FA,,S,,
1446,1999 FB,,Q,,


In [41]:
# Separate the classification table into designated and non-designated asteroids.
# NOTE(review): the split position is hard-coded for the taxonomy file that was
# loaded here; confirm it whenever Bus.Taxonomy.txt changes.
DESIGNATED_ROW_COUNT = 1403

des_ast_class_df = asteroid_class_df.iloc[:DESIGNATED_ROW_COUNT].copy()
non_ast_class_df = asteroid_class_df.iloc[DESIGNATED_ROW_COUNT:].copy()


In [42]:
# Designated asteroids: the leading token of the name (before the first space)
# is parsed as an integer so it can be linked to the numbered spfit files.
des_ast_class_df["DesNr"] = des_ast_class_df["Name"].apply(
    lambda name: int(name.partition(" ")[0])
)

# Attach the spectrum file path of each designated asteroid via DesNr.
des_ast_class_join_df = pd.merge(des_ast_class_df, des_file_paths_df, on="DesNr")

# Non-designated asteroids: drop every space from the name ("1995 BM2" -> "1995BM2")
# so that it has the same form as the designation taken from the file names.
non_ast_class_df["DesNr"] = non_ast_class_df["Name"].apply(
    lambda name: "".join(name.split(" "))
)

# Attach the spectrum file path of each non-designated asteroid via DesNr.
non_ast_class_join_df = pd.merge(non_ast_class_df, non_file_paths_df, on="DesNr")
non_ast_class_join_df.head()


Unnamed: 0,Name,Tholen Class,Bus Class,unknown1,unknown2,DesNr,FilePath
0,1995 BM2,,Sq,,,1995BM2,e:\Projects\Asteroid_Spectra_Project\data/lvl0...
1,1995 WQ5,,Ch,,,1995WQ5,e:\Projects\Asteroid_Spectra_Project\data/lvl0...
2,1996 PW,,Ld,,,1996PW,e:\Projects\Asteroid_Spectra_Project\data/lvl0...
3,1996 UK,,Sq,,,1996UK,e:\Projects\Asteroid_Spectra_Project\data/lvl0...
4,1996 VC,,S,,,1996VC,e:\Projects\Asteroid_Spectra_Project\data/lvl0...


In [43]:
# Stack the designated and non-designated tables into one frame with a fresh
# 0..n-1 index, keep only the columns that are still needed, and discard rows
# that have no Bus classification.
asteroids_df = (
    pd.concat(
        [des_ast_class_join_df, non_ast_class_join_df],
        axis=0,
        ignore_index=True,
    )
    .drop(columns=["Tholen Class", "unknown1", "unknown2"])
    .dropna(subset=["Bus Class"])
)
asteroids_df.head()

Unnamed: 0,Name,Bus Class,DesNr,FilePath
0,1 Ceres,C,1,e:\Projects\Asteroid_Spectra_Project\data/lvl0...
1,2 Pallas,B,2,e:\Projects\Asteroid_Spectra_Project\data/lvl0...
2,3 Juno,Sk,3,e:\Projects\Asteroid_Spectra_Project\data/lvl0...
3,4 Vesta,V,4,e:\Projects\Asteroid_Spectra_Project\data/lvl0...
4,5 Astraea,S,5,e:\Projects\Asteroid_Spectra_Project\data/lvl0...


In [44]:
def _read_spectrum(spectrum_path):
    """Read one tab-separated spectrum file into a two-column DataFrame."""
    return pd.read_csv(
        spectrum_path,
        sep="\t",
        names=["Wavelength_in_micron", "Reflectance_norm550nm"],
    )


# Load every spectrum and keep the resulting DataFrame in the row of its asteroid.
asteroids_df["SpectrumDF"] = asteroids_df["FilePath"].apply(_read_spectrum)

# Renumber the rows 0..n-1.
asteroids_df = asteroids_df.reset_index(drop=True)

# Store all designations as strings so the column holds a single value type.
asteroids_df.loc[:, "DesNr"] = asteroids_df["DesNr"].astype(str)

In [45]:
# Show the first rows of the merged table, now including the SpectrumDF column.
asteroids_df.head()

Unnamed: 0,Name,Bus Class,DesNr,FilePath,SpectrumDF
0,1 Ceres,C,1,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,Wavelength_in_micron Reflectance_norm550n...
1,2 Pallas,B,2,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,Wavelength_in_micron Reflectance_norm550n...
2,3 Juno,Sk,3,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,Wavelength_in_micron Reflectance_norm550n...
3,4 Vesta,V,4,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,Wavelength_in_micron Reflectance_norm550n...
4,5 Astraea,S,5,e:\Projects\Asteroid_Spectra_Project\data/lvl0...,Wavelength_in_micron Reflectance_norm550n...


In [46]:
# Make sure the level-1 output directory exists (no error if it already does).
lvl1_dir = os.path.join(core_path, "data/lvl1")
pathlib.Path(lvl1_dir).mkdir(parents=True, exist_ok=True)

# Write the merged table as a pickle (protocol 4).
# NOTE(review): the extension is ".pk1" (digit one), possibly meant to be ".pkl";
# it is kept unchanged because other code may read this exact file name.
merged_pickle_path = os.path.join(core_path, "data/lvl1/", "asteroids_merged.pk1")
asteroids_df.to_pickle(merged_pickle_path, protocol=4)