In [None]:
# ------------------------------------------------------------------
# Step 0 – Purpose
# ------------------------------------------------------------------
# Inspect all files in a given folder to:
# - Identify file types
# - Attempt to load them with pandas
# - Print file name, shape, column names, and header preview

In [None]:
# ------------------------------------------------------------------
# Step 1 – Imports
# ------------------------------------------------------------------
import os
import pandas as pd

In [None]:
# ------------------------------------------------------------------
# Step 2 – Define Path and Inspection Function
# ------------------------------------------------------------------
# Folder holding the raw input files. Set the DIAGNOSTIC_DATA_DIR environment
# variable to point the notebook at another location (e.g. on a different
# machine); when it is unset or empty, the original hard-coded folder is used.
data_dir = os.environ.get("DIAGNOSTIC_DATA_DIR") or (
    "/Users/rosstaylor/Downloads/Research Project/Code Folder/diagnostic-modality-demand/diagnostic-modality-demand/data/raw"
)

def inspect_file(filepath):
    """Print a short preview of one tabular file.

    The loader is chosen from the end of ``filepath``, compared
    case-insensitively: ``.csv``, ``.xlsx``/``.xls`` and ``.parquet`` are
    supported. At most five rows are kept, and the file name, the loaded
    object's type, the preview shape, the column names and the preview rows
    are printed.

    Parameters
    ----------
    filepath : str
        Path of the file to inspect.

    Returns
    -------
    None
        All results are reported with ``print``. An unsupported extension or
        a failure while reading is printed as a message, not raised.
    """
    preview_rows = 5
    print(f"\nFile: {os.path.basename(filepath)}")

    # Lower-case once so "DATA.CSV" is treated the same as "data.csv".
    lowered_path = filepath.lower()

    try:
        if lowered_path.endswith('.csv'):
            df = pd.read_csv(filepath, nrows=preview_rows)
        elif lowered_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(filepath, nrows=preview_rows)
        elif lowered_path.endswith('.parquet'):
            # read_parquet is called without a row limit, so the whole file
            # is loaded; trim it afterwards so the "first 5 rows" shape
            # printed below is accurate for parquet as well.
            df = pd.read_parquet(filepath).head(preview_rows)
        else:
            print("Unsupported or non-tabular file type")
            return

        print(f"Type: {type(df).__name__}")
        print(f"Shape (first 5 rows): {df.shape}")
        print(f"Column names: {list(df.columns)}")
        print("Head preview:")
        print(df.head())

    except Exception as e:
        # Report the failure instead of raising, so a caller looping over
        # many files can carry on with the next one.
        print(f"Error reading file: {e}")


In [None]:
# ------------------------------------------------------------------
# Step 3 – Load and Inspect Files
# ------------------------------------------------------------------
# os.listdir returns entries in arbitrary order; sort them so the files are
# inspected (and their previews printed) in the same order on every run.
for filename in sorted(os.listdir(data_dir)):
    filepath = os.path.join(data_dir, filename)
    # Only regular files are inspected; sub-directories are skipped.
    if os.path.isfile(filepath):
        inspect_file(filepath)

In [None]:
# ------------------------------------------------------------------
# Step 4 – Build a Per-File Metadata Summary Table
# ------------------------------------------------------------------
# Collect one record per supported file: name, extension, size on disk and,
# when the file can be read, its column names and column count. If reading
# fails, the record carries the error text instead.
summary = []

# os.listdir returns entries in arbitrary order; sort them so the summary
# rows come out in the same order on every run.
for filename in sorted(os.listdir(data_dir)):
    filepath = os.path.join(data_dir, filename)
    if not os.path.isfile(filepath):
        # Sub-directories are not summarised.
        continue

    file_info = {
        "filename": filename,
        "filetype": os.path.splitext(filename)[-1],
        "size_MB": round(os.path.getsize(filepath) / (1024 * 1024), 2)
    }

    # Match extensions case-insensitively so e.g. "DATA.CSV" is not skipped.
    lowered_name = filename.lower()
    try:
        if lowered_name.endswith(".csv"):
            df = pd.read_csv(filepath, nrows=5)
        elif lowered_name.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(filepath, nrows=5)
        elif lowered_name.endswith(".parquet"):
            # No row limit is passed here, so the whole parquet file is
            # loaded even though only its column names are used below.
            df = pd.read_parquet(filepath)
        else:
            # NOTE(review): files with any other extension are left out of
            # the summary entirely (no row is appended) — confirm that this
            # is intended rather than listing them without column info.
            continue
        file_info["columns"] = list(df.columns)
        file_info["n_columns"] = len(df.columns)
    except Exception as e:
        # Keep the file in the summary and record why it could not be read.
        file_info["error"] = str(e)
    summary.append(file_info)

# Convert to DataFrame and preview
summary_df = pd.DataFrame(summary)
print(summary_df.head())