In [None]:
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

def quicklook_data(file_paths):
    """Run the quicklook pipeline on each CSV path and write one PDF per file.

    Each file is read with pandas, its TIMESTAMP_START / TIMESTAMP_END columns
    are parsed as ``%Y%m%d%H%M`` datetimes, TIMESTAMP_START becomes the index,
    TIMESTAMP_END is dropped, and the remaining table is handed to
    ``plot_quicklook_pdf`` together with the file's name and directory.

    Failures are reported with ``print`` and never propagate, so one bad file
    does not stop the remaining files from being processed.

    Args:
        file_paths: Iterable of CSV file paths to process.
    """
    # Text tokens that pandas should read as missing values.
    missing_markers = ['-9999', '-9999.0', '-9999.00', '-9999.000', '-9999.0000']
    stamp_format = '%Y%m%d%H%M'

    for file_path in file_paths:
        try:
            table = pd.read_csv(file_path, na_values=missing_markers)

            # Parse both timestamp columns, start first, then end.
            for stamp_col in ('TIMESTAMP_START', 'TIMESTAMP_END'):
                table[stamp_col] = pd.to_datetime(table[stamp_col], format=stamp_format)

            # Index by start time for the time series plots; the end time is
            # not plotted, so it is removed from the table.
            table = table.set_index('TIMESTAMP_START').drop(columns=['TIMESTAMP_END'])

            # The PDF is written next to the input file.
            folder, csv_name = os.path.split(file_path)
            plot_quicklook_pdf(table, csv_name, folder)

        except FileNotFoundError:
            print(f"File not found: {file_path}")
        except Exception as e:
            print(f"An error occurred while processing {file_path}: {e}")

def _save_column_grid(pdf, df, num_rows, figsize, draw_column):
    """Draw one subplot per DataFrame column in a 3-wide grid and add it to *pdf*.

    Args:
        pdf: Open ``PdfPages`` object that receives the page.
        df: DataFrame whose columns are plotted, one subplot per column.
        num_rows: Number of subplot rows in the grid.
        figsize: ``(width, height)`` of the figure in inches.
        draw_column: Callable taking a column label; it draws that column on
            the current pyplot axes and sets the axis labels.
    """
    fig = plt.figure(figsize=figsize)
    try:
        for i, col in enumerate(df.columns):
            plt.subplot(num_rows, 3, i + 1)
            draw_column(col)
            plt.title(f'{col}')
            plt.grid(True)
        plt.tight_layout()
        pdf.savefig(fig)  # Save figure to PDF
    finally:
        # Close the figure even when plotting or saving fails, so an error
        # does not leave it open.
        plt.close(fig)


def plot_quicklook_pdf(df, file_name, save_dir):
    """Plots quicklook analysis for a DataFrame and saves plots to a multipage PDF.

    The PDF is written to ``<save_dir>/<file_name minus extension>_quicklook.pdf``
    and contains, in order:

    1. a panel of line plots, one per column, against the DataFrame index;
    2. a panel of histograms with KDE overlays, one per column;
    3. a pair plot of the first five columns (only when there is more than
       one column).

    Args:
        df: DataFrame to plot; its index is used as the x-axis of the line plots.
        file_name: Name of the source file; used for the PDF name and the
            pair plot title.
        save_dir: Directory in which the PDF is created.

    Returns:
        None. If ``df`` has no columns, a notice is printed and no PDF is
        written.
    """
    num_cols = len(df.columns)
    if num_cols == 0:
        # Nothing to draw: avoid building a zero-row subplot grid and leaving
        # a contentless PDF on disk.
        print(f"No data columns to plot in {file_name}; skipping quicklook PDF.")
        return

    base_name = os.path.splitext(file_name)[0]  # Strip the file extension
    num_rows = (num_cols + 2) // 3  # Ceiling division: rows needed for 3 columns
    pdf_path = os.path.join(save_dir, f'{base_name}_quicklook.pdf')

    def draw_time_series(col):
        plt.plot(df.index, df[col])
        plt.xlabel('Time')
        plt.ylabel(col)

    def draw_histogram(col):
        sns.histplot(df[col], kde=True)
        # Set after histplot so these labels are the ones that end up on the axes.
        plt.xlabel(col)
        plt.ylabel('Frequency')

    with PdfPages(pdf_path) as pdf:
        # Time series plots in a 3-column panel
        _save_column_grid(pdf, df, num_rows, (15, 2 * num_rows), draw_time_series)

        # Histograms in a 3-column panel
        _save_column_grid(pdf, df, num_rows, (18, 5 * num_rows), draw_histogram)

        # Pair plot, limited to the first 5 columns to avoid excessive
        # computation; slicing with :5 also covers frames with 2-4 columns.
        if num_cols > 1:
            fig = sns.pairplot(df.iloc[:, :5]).fig
            try:
                fig.suptitle(f'Pair Plot - {file_name}', y=1.02)
                # The title sits above the figure's top edge (y > 1), so the
                # page needs a tight bounding box or the title is cropped.
                pdf.savefig(fig, bbox_inches='tight')
            finally:
                plt.close(fig)

# Example usage: generate a quicklook PDF for every CSV matching the pattern.
file_pattern = '/Users/bhupendra/projects/crocus/data/flux_data/data/AmeriFlux/US-CU1_HH_*.csv'
# Sorted so files are processed in a stable, lexicographic order.
file_paths = sorted(glob.glob(file_pattern))
if file_paths:
    quicklook_data(file_paths)
else:
    # glob returns an empty list when nothing matches; say so explicitly
    # instead of finishing silently with no output.
    print(f"No files matched pattern: {file_pattern}")

In [None]:
# 