# Notebook to assist in development of extract_fits_headers

In [6]:
# First import required libraries
import os
import sys
import pandas as pd
from astropy.io import fits
#import a wrning suppression library to keep the notebook clean
import warnings
warnings.filterwarnings('ignore')


## Function: `extract_fits_headers`

### Description
The `extract_fits_headers` function is designed for systematic extraction of headers from FITS (Flexible Image Transport System) files located within a specified directory and its subdirectories. This function provides an efficient means to collate header information from multiple FITS files, facilitating data organization and analysis.

### Parameters
- `directory` (`str`): Path to the directory containing FITS files. The function will recursively search this directory and its subdirectories for FITS files.

### Returns
- `pandas.DataFrame`: A DataFrame object where each row corresponds to the header of a FITS file. The DataFrame includes all header information and an additional column specifying the file path of each FITS file.

### Functionality
1. **Recursive Directory Traversal**: The function traverses the specified directory and its subdirectories, identifying files with a `.fits` extension.
2. **Header Extraction and Aggregation**: For each identified FITS file, the function attempts to open the file and extract its header information. This data is then aggregated into a list.
3. **Error Handling**: If a file cannot be read, the function captures the exception, logs an error message indicating the problematic file, and continues processing the remaining files.
4. **Dataframe Creation**: The aggregated list of headers is converted into a pandas DataFrame, with each row representing a FITS file's header and its corresponding file path.

### Usage
This function is particularly useful in astronomical data analysis and research settings, where managing and analyzing headers of multiple FITS files is a common requirement. The output DataFrame serves as a consolidated database of header information, streamlining subsequent data processing and analysis tasks.


In [7]:
def extract_fits_headers(directory):
    """
    Extracts headers from all FITS files in a specified directory and its subdirectories.
    
    Parameters:
    directory (str): The path of the directory containing FITS files.
    
    Returns:
    pandas.DataFrame: A DataFrame where each row represents the header of a FITS file,
                      including the file path as an additional column.
    """
    headers = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.fits'):
                file_path = os.path.join(root, file)
                try:
                    with fits.open(file_path) as hdul:
                        header = hdul[0].header
                        headers.append(dict(header, file_path=file_path))
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")
    return pd.DataFrame(headers)

## Testing

Supply the path to directory containing mutliple FITS files. The test directory has no hierachy but contains Lights, Daarkd, Flats and Bias frames.

In [8]:
path = '/home/steve/Desktop/AstroData/NGC 7822'
#Call function
headers_df = extract_fits_headers(path)

In [9]:
#Display data frame containing header infromation
headers_df

Unnamed: 0,SIMPLE,BITPIX,NAXIS,NAXIS1,NAXIS2,BZERO,EXTEND,IMAGETYP,EXPOSURE,EXPTIME,...,PRESSURE,AMBTEMP,WINDDIR,WINDSPD,ROWORDER,EQUINOX,SWCREATE,file_path,DATE-AVG,DEWPOINT
0,True,16,2,9576,6388,32768,True,LIGHT,600.000000,600.000000,...,0.0,0.00,0.0,0.000,TOP-DOWN,2000.0,N.I.N.A. 2.3.0.2002,/home/steve/Desktop/AstroData/NGC 7822/PANEL_1...,,
1,True,16,2,9576,6388,32768,True,FLAT,0.490000,0.490000,...,,,,,TOP-DOWN,2000.0,N.I.N.A. 2.3.0.2001,/home/steve/Desktop/AstroData/NGC 7822/2023-08...,,
2,True,16,2,9576,6388,32768,True,FLAT,0.510000,0.510000,...,,,,,TOP-DOWN,2000.0,N.I.N.A. 2.3.0.2001,/home/steve/Desktop/AstroData/NGC 7822/2023-08...,,
3,True,16,2,9576,6388,32768,True,FLAT,0.150000,0.150000,...,,,,,TOP-DOWN,2000.0,N.I.N.A. 2.3.0.2001,/home/steve/Desktop/AstroData/NGC 7822/2023-08...,,
4,True,16,2,9576,6388,32768,True,FLAT,0.180000,0.180000,...,,,,,TOP-DOWN,2000.0,N.I.N.A. 2.3.0.2001,/home/steve/Desktop/AstroData/NGC 7822/2023-08...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1255,True,16,2,9576,6388,32768,True,LIGHT,60.000000,60.000000,...,1006.0,8.19,278.0,9.648,TOP-DOWN,2000.0,N.I.N.A. 2.3.1.9001 (x64),/home/steve/Desktop/AstroData/NGC 7822/PANEL_1...,2023-10-13T23:12:45.226,6.520504
1256,True,16,2,9576,6388,32768,True,LIGHT,60.000000,60.000000,...,1006.0,8.35,273.0,6.444,TOP-DOWN,2000.0,N.I.N.A. 2.3.1.9001 (x64),/home/steve/Desktop/AstroData/NGC 7822/PANEL_1...,2023-10-13T22:47:26.015,6.678304
1257,True,16,2,9576,6388,32768,True,DARK,600.000000,600.000000,...,,,,,TOP-DOWN,2000.0,N.I.N.A. 2.0.3.2005,/home/steve/Desktop/AstroData/NGC 7822/2022-11...,,
1258,True,16,2,9576,6388,32768,True,BIAS,0.000032,0.000032,...,,,,,TOP-DOWN,2000.0,N.I.N.A. 3.0.0.1016,/home/steve/Desktop/AstroData/NGC 7822/2023-04...,,


### Inspect individual dataframe entry

In [17]:
headers_df.iloc[0]

SIMPLE                                                    True
BITPIX                                                      16
NAXIS                                                        2
NAXIS1                                                    9576
NAXIS2                                                    6388
BZERO                                                    32768
EXTEND                                                    True
IMAGETYP                                                 LIGHT
EXPOSURE                                                 600.0
EXPTIME                                                  600.0
DATE-LOC                               2023-09-04T03:58:46.954
DATE-OBS                               2023-09-04T02:58:46.954
XBINNING                                                     1
YBINNING                                                     1
GAIN                                                       100
OFFSET                                                 

## Check dataframe information

In [18]:
headers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260 entries, 0 to 1259
Data columns (total 59 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SIMPLE     1260 non-null   bool   
 1   BITPIX     1260 non-null   int64  
 2   NAXIS      1260 non-null   int64  
 3   NAXIS1     1260 non-null   int64  
 4   NAXIS2     1260 non-null   int64  
 5   BZERO      1260 non-null   int64  
 6   EXTEND     1260 non-null   bool   
 7   IMAGETYP   1260 non-null   object 
 8   EXPOSURE   1260 non-null   float64
 9   EXPTIME    1260 non-null   float64
 10  DATE-LOC   1260 non-null   object 
 11  DATE-OBS   1260 non-null   object 
 12  XBINNING   1260 non-null   int64  
 13  YBINNING   1260 non-null   int64  
 14  GAIN       1260 non-null   int64  
 15  OFFSET     1260 non-null   int64  
 16  EGAIN      1260 non-null   float64
 17  XPIXSZ     1260 non-null   float64
 18  YPIXSZ     1260 non-null   float64
 19  INSTRUME   1260 non-null   object 
 20  SET-TEMP

### All looks fine. Now test the function with a structured directory

In [21]:
path = '/mnt/HDD_8TB/Preselected/Flaming Star Nebula Mosaic started 30th January 2023'
#Call function
headers_df1 = extract_fits_headers(path)

### Check dataframe information

In [22]:
headers_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526 entries, 0 to 525
Data columns (total 58 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SIMPLE     526 non-null    bool   
 1   BITPIX     526 non-null    int64  
 2   NAXIS      526 non-null    int64  
 3   NAXIS1     526 non-null    int64  
 4   NAXIS2     526 non-null    int64  
 5   BZERO      526 non-null    int64  
 6   EXTEND     526 non-null    bool   
 7   IMAGETYP   526 non-null    object 
 8   EXPOSURE   526 non-null    float64
 9   EXPTIME    526 non-null    float64
 10  DATE-LOC   526 non-null    object 
 11  DATE-OBS   526 non-null    object 
 12  XBINNING   526 non-null    int64  
 13  YBINNING   526 non-null    int64  
 14  GAIN       526 non-null    int64  
 15  OFFSET     526 non-null    int64  
 16  EGAIN      526 non-null    float64
 17  XPIXSZ     526 non-null    float64
 18  YPIXSZ     526 non-null    float64
 19  INSTRUME   526 non-null    object 
 20  SET-TEMP  

### There looks to be one less column here. The column missing is DATE_AVE. The heaser information is generated by N.I.N.A and might have changed between these images being taken. It is not an issue.

### Inspect individual dataframe entry

In [37]:
headers_df1.iloc[0]

SIMPLE                                                    True
BITPIX                                                      16
NAXIS                                                        2
NAXIS1                                                    9576
NAXIS2                                                    6388
BZERO                                                    32768
EXTEND                                                    True
IMAGETYP                                                 LIGHT
EXPOSURE                                                 600.0
EXPTIME                                                  600.0
DATE-LOC                               2023-02-19T21:23:11.424
DATE-OBS                               2023-02-19T21:23:11.424
XBINNING                                                     1
YBINNING                                                     1
GAIN                                                       100
OFFSET                                                 

### Check last data frame entry

In [39]:
headers_df1.iloc[-1]

SIMPLE                                                    True
BITPIX                                                      16
NAXIS                                                        2
NAXIS1                                                    9576
NAXIS2                                                    6388
BZERO                                                    32768
EXTEND                                                    True
IMAGETYP                                                 LIGHT
EXPOSURE                                                 600.0
EXPTIME                                                  600.0
DATE-LOC                               2023-04-19T22:38:11.020
DATE-OBS                               2023-04-19T21:38:11.020
XBINNING                                                     1
YBINNING                                                     1
GAIN                                                       100
OFFSET                                                 

### All looks fine. The function will work with both hierachical and flat folders