In [2]:
import pandas as pd
from astropy.coordinates import SkyCoord
from astropy import units as u
from astropy.coordinates import Angle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


KeyboardInterrupt



In [None]:
# Load the master catalog, then wrap its RADEG/DECDEG columns (taken as
# degrees) in a SkyCoord for the positional cross-match.
catalog_data = pd.read_csv('master_catalog_jan_2023.csv')
catalog_coords = SkyCoord(
    catalog_data['RADEG'],
    catalog_data['DECDEG'],
    unit='deg',
)

# Load the table that supplies the i and g values.
ig_data = pd.read_csv('all_data.csv')


In [None]:

# Build sky coordinates for ig_data: the RA column is parsed as hour angle,
# the Dec column as degrees.
ra_angles = Angle(ig_data['RA'], unit=u.hourangle)
dec_angles = Angle(ig_data['Dec'], unit=u.deg)
ig_coords = SkyCoord(ra_angles, dec_angles)

In [None]:
# Largest on-sky separation at which a nearest-neighbour pair is still treated
# as the same source.
# NOTE(review): 1 arcsec is a placeholder tolerance — tune it to the
# astrometric accuracy of the two catalogs before trusting the results.
MAX_MATCH_SEP = 1.0 * u.arcsec

# For every catalog object, find the nearest ig_data object on the sky.
# match_to_catalog_sky ALWAYS returns a neighbour, however distant, so the
# separation (d2d) has to be checked before the photometry can be trusted.
idx, d2d, _ = catalog_coords.match_to_catalog_sky(ig_coords)

# add the i and g values of the nearest neighbour to the catalog data, and
# keep the match distance alongside them for later inspection
catalog_data['i'] = ig_data.iloc[idx]['i'].values
catalog_data['g'] = ig_data.iloc[idx]['g'].values
catalog_data['match_sep_arcsec'] = d2d.to_value(u.arcsec)

# Keep only objects whose nearest neighbour lies within the tolerance;
# anything farther away would carry photometry from an unrelated source.
matched_data = catalog_data[d2d <= MAX_MATCH_SEP]
if matched_data.empty:
    raise ValueError(
        f'No catalog object has an ig_data counterpart within {MAX_MATCH_SEP}; '
        'check the coordinate units or relax MAX_MATCH_SEP.'
    )

# extract features and target variable
X = matched_data[['RADEG', 'DECDEG', 'i', 'g']]
y = matched_data['CLASS']

# split into training and testing datasets (80/20, seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Fit a 100-tree random forest (fixed seed) on the training split.
rfc = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split and report how often the predictions match.
y_pred = rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# CLEAN DATA

Collapse runs of multiple spaces in observations.txt into single spaces

In [None]:
import re

# Rewrite observations.txt in place, squeezing every run of spaces to one.
with open('observations.txt', 'r+') as f:
    # Clean all lines up front so the file can be rewritten in a single pass.
    squeezed = [re.sub(' +', ' ', raw) for raw in f.readlines()]

    # Rewind and empty the file, then write the cleaned lines back.
    f.seek(0)
    f.truncate()
    f.writelines(squeezed)


Unzip catalog data

In [None]:
# NOTE: this cell is disabled (every line is commented out). It is the
# sequential version of the unzip step; the next cell performs the same
# per-file work through a thread pool.
#
# import os
# import gzip
#
# # Define the directory path
# dir_path = 'CATALOGS/'
#
# # Loop over all file names in the directory
# for filename in os.listdir(dir_path):
#     # Check if the filename ends with '.gz'
#     if filename.endswith('.gz'):
#         # Open the gzip file for reading and the uncompressed file for writing
#         with gzip.open(os.path.join(dir_path, filename), 'rb') as f_in, \
#              open(os.path.join(dir_path, filename[:-3]), 'wb') as f_out:
#             # Copy the contents of the gzip file to the uncompressed file
#             f_out.write(f_in.read())
#
#         # Remove the gzip file
#         os.remove(os.path.join(dir_path, filename))

In [None]:
import os
import gzip
import concurrent.futures

# Define the directory path
dir_path = 'CATALOGS/'

def process_file(filename, directory=None):
    """Decompress one ``.gz`` file next to itself, then delete the archive.

    Names that do not end in ``.gz`` are ignored.

    Parameters
    ----------
    filename : str
        Name of the entry (not a full path) inside ``directory``.
    directory : str, optional
        Directory containing the file. Defaults to the module-level
        ``dir_path`` so existing ``process_file(filename)`` calls keep working.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If the archive cannot be read, the output cannot be written, or the
        archive cannot be removed. The archive is only removed after the
        decompressed copy has been written completely.
    """
    # Local import: the cell that defines this function does not import shutil.
    import shutil

    if not filename.endswith('.gz'):
        return

    if directory is None:
        directory = dir_path

    archive_path = os.path.join(directory, filename)
    # Drop the trailing '.gz' to get the decompressed file's name.
    output_path = os.path.join(directory, filename[:-3])

    # Stream in chunks instead of f_in.read(), which would hold the whole
    # decompressed file in memory at once.
    with gzip.open(archive_path, 'rb') as f_in, open(output_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    # Remove the gzip file
    os.remove(archive_path)

# Run process_file over every entry in dir_path using a thread pool.
with concurrent.futures.ThreadPoolExecutor() as executor:
    filenames = os.listdir(dir_path)

    # Submit the tasks to the executor
    futures = [executor.submit(process_file, filename) for filename in filenames]

    # Wait for all tasks to complete
    concurrent.futures.wait(futures)

    # wait() only blocks; it never re-raises what a worker raised. Collect the
    # exceptions explicitly so a failed decompression is not silently ignored.
    failures = {
        filename: future.exception()
        for filename, future in zip(filenames, futures)
        if future.exception() is not None
    }

if failures:
    details = '; '.join(f'{name}: {exc!r}' for name, exc in failures.items())
    raise RuntimeError(f'Failed to decompress {len(failures)} file(s): {details}')


Strip leading whitespace and the leading '#' from each line, and collapse runs of multiple spaces into a single space

In [None]:
import os
import re
import shutil

# Define the directory path
dir_path = 'CATALOGS/'

# Only files named exactly 'mXXXp.ascd' (XXX = three digits) are cleaned.
# fullmatch anchors both ends, so leftovers such as 'm001p.ascd.tmp' are
# skipped, and unrelated names (e.g. '.DS_Store') are skipped instead of
# crashing an int() conversion of their characters.
ascd_name = re.compile(r'm\d{3}p\.ascd')

# Loop over all file names in the directory
for filename in os.listdir(dir_path):
    if not ascd_name.fullmatch(filename):
        continue

    source_path = os.path.join(dir_path, filename)
    tmp_path = source_path + '.tmp'

    # Write the cleaned text to a temporary sibling file so the original stays
    # intact until the whole file has been processed.
    with open(source_path, 'r') as f_in, open(tmp_path, 'w') as f_out:
        for line in f_in:
            # Remove leading whitespace and the first '#' character. Because
            # \s also matches '\n', a line holding only whitespace and/or '#'
            # becomes an empty string, i.e. it is dropped from the output.
            new_line = re.sub(r'^\s*#?\s*', '', line)
            # Collapse runs of spaces into a single space.
            new_line = re.sub(' +', ' ', new_line)
            f_out.write(new_line)

    # Replace the original file with the temporary file
    shutil.move(tmp_path, source_path)

In [None]:
import os
import pandas as pd

# Define the directory path
dir_path = 'test_files/'

# Define the columns to remove
cols_to_remove = ['iccd', 'xg', 'yg', 'dg', 'ig', 'xi', 'yi', 'di', 'ii', 'ia', 'field']

# Loop over all file names in the directory
for filename in os.listdir(dir_path):
    # Loose name check: anything starting with 'm' and ending with 'p.ascd'
    # (this is broader than the strict 'mXXXp.ascd' pattern).
    if not (filename.startswith('m') and filename.endswith('p.ascd')):
        continue

    # Load the whitespace-delimited file into a DataFrame. sep=r'\s+' is the
    # documented equivalent of the deprecated delim_whitespace=True.
    df = pd.read_csv(os.path.join(dir_path, filename), sep=r'\s+')

    # Remove the specified columns (raises KeyError if any of them is missing)
    df = df.drop(columns=cols_to_remove)

    # Save the result next to the input under a 'c' prefix. The prefixed name
    # no longer starts with 'm', so a re-run does not pick it up again.
    new_filename = 'c' + filename
    df.to_csv(os.path.join(dir_path, new_filename), sep=' ', index=False)

In [8]:
import os
import pandas as pd

# Define the directory path
dir_path = 'CATALOGS/'

# Directory that receives the cleaned files; created up front so to_csv does
# not fail when it does not exist yet.
out_dir = 'very_clean/'
os.makedirs(out_dir, exist_ok=True)

# Rows are kept only when both 'g' and 'i' are at or below this value.
# NOTE(review): presumably a faint-magnitude cut — confirm the intended limit.
G_I_UPPER_LIMIT = 23

# Columns removed from the output.
cols_to_remove = ['iccd', 'xg', 'yg', 'dg', 'ig', 'xi', 'yi', 'di', 'ii', 'ia', 'field']

# Loop over all file names in the directory
for filename in os.listdir(dir_path):
    # Loose name check: anything starting with 'm' and ending with 'p.ascd'.
    if not (filename.startswith('m') and filename.endswith('p.ascd')):
        continue

    # Read the whitespace-delimited file into a DataFrame. sep=r'\s+' is the
    # documented equivalent of the deprecated delim_whitespace=True.
    df = pd.read_csv(os.path.join(dir_path, filename), sep=r'\s+')

    # Keep rows where 'ig' and 'ii' are both non-zero (negative values are
    # kept) and where 'g' and 'i' do not exceed the upper limit.
    keep = (
        (df['ig'] != 0)
        & (df['ii'] != 0)
        & (df['g'] <= G_I_UPPER_LIMIT)
        & (df['i'] <= G_I_UPPER_LIMIT)
    )

    # Filter, then drop the unwanted columns as a new frame; an inplace drop on
    # the filtered slice can trigger pandas' SettingWithCopyWarning.
    df = df[keep].drop(columns=cols_to_remove)

    # Write the modified DataFrame to a new file with a 'c' prefix
    df.to_csv(os.path.join(out_dir, 'c' + filename), index=False, sep=' ')
