In [None]:
## OpenFDA Drug Event - Interactive EDA Dashboard (SAMPLE VERSION)

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, HTML

# --- Configuration ---
COMBINED_DATA_DIR = "../../data_SAMPLE/openFDA_drug_event/combined_data/"

print(f"Loading data from: {COMBINED_DATA_DIR}")

# --- Load Combined Data ---
all_dataframes = {}
available_df_names = []

if os.path.exists(COMBINED_DATA_DIR):
    combined_files = glob.glob(os.path.join(COMBINED_DATA_DIR, "combined_*.csv.gz"))
    for f_path in combined_files:
        try:
            df_name = os.path.basename(f_path).replace("combined_", "").replace(".csv.gz", "")
            print(f"Loading {df_name}...")
            # Ensure safetyreportid is read as string
            df = pd.read_csv(f_path, compression='gzip', low_memory=False, dtype={'safetyreportid': str})
            all_dataframes[df_name] = df
            available_df_names.append(df_name)
            print(f"...loaded {df_name} with shape {df.shape}")
        except Exception as e:
            print(f"Error loading {f_path}: {e}")
else:
    print(f"Error: Combined data directory not found at {COMBINED_DATA_DIR}")

if not available_df_names:
    print("No combined dataframes were loaded. Please run the consolidation script first.")
else:
    print(f"\nSuccessfully loaded: {', '.join(available_df_names)}")

Loading data from: ../../data_SAMPLE/openFDA_drug_event/combined_data/
Loading patient...
...loaded patient with shape (174032, 11)
Loading patient_reaction...
...loaded patient_reaction with shape (497391, 6)
Loading patient_drug_openfda_rxcui...
...loaded patient_drug_openfda_rxcui with shape (3477111, 5)
Loading report...
...loaded report with shape (174032, 34)
Loading meta...
...loaded meta with shape (15, 8)
Loading patient_drug_openfda...
...loaded patient_drug_openfda with shape (61743398, 5)
Loading patient_drug...
...loaded patient_drug with shape (506105, 29)

Successfully loaded: patient, patient_reaction, patient_drug_openfda_rxcui, report, meta, patient_drug_openfda, patient_drug


In [None]:
## DataFrame Explorer

In [4]:
import os
import glob
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, HTML
from types import SimpleNamespace #

# --- DataFrame Styling Function ---
def style_dataframe(df, num_rows=5):
    """Applies basic styling to a Pandas DataFrame for better display in Jupyter."""
    if df is None or df.empty:
        return "DataFrame is empty or not available."
    return df.head(num_rows).style.set_table_styles(
        [{'selector': 'th', 'props': [('background-color', '#f2f2f2'), ('font-weight', 'bold')]}]
    ).set_properties(**{'border': '1px solid black', 'width': 'auto', 'text-align': 'left'}) \
    .format(None, na_rep="NA")

# --- EDA Functions for Display ---
def display_df_info(df):
    if df is None or df.empty:
        return "DataFrame is empty."
    import io
    buffer = io.StringIO()
    df.info(buf=buffer)
    return HTML(f"<pre>{buffer.getvalue()}</pre>")

def display_df_describe(df):
    if df is None or df.empty:
        return "DataFrame is empty."
    return df.describe(include='all').transpose().style.set_table_styles(
        [{'selector': 'th', 'props': [('background-color', '#f2f2f2'), ('font-weight', 'bold')]}]
    ).set_properties(**{'border': '1px solid black', 'width': 'auto', 'text-align': 'left'}) \
    .format(None, na_rep="NA")

def display_missing_values(df):
    if df is None or df.empty:
        return "DataFrame is empty."
    missing_percentage = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)
    missing_df = missing_percentage[missing_percentage > 0].to_frame(name='Missing (%)')
    if missing_df.empty:
        return "No missing values found."
    return missing_df.style.set_table_styles(
        [{'selector': 'th', 'props': [('background-color', '#f2f2f2'), ('font-weight', 'bold')]}]
    ).format("{:.2f}%", na_rep="NA").set_properties(**{'border': '1px solid black', 'width': 'auto'})

def display_value_counts(df, column, top_n=10):
    if df is None or df.empty:
        return "DataFrame is empty."
    if column not in df.columns:
        return f"Column '{column}' not found."
    counts = df[column].value_counts(dropna=False)
    counts_df = counts.head(top_n).to_frame(name='Count')
    styled_counts = counts_df.style.set_table_styles(
        [{'selector': 'th', 'props': [('background-color', '#f2f2f2'), ('font-weight', 'bold')]}]
    ).set_properties(**{'border': '1px solid black', 'width': 'auto'})
    return styled_counts

# --- Widgets ---
if not available_df_names:
    print("Cannot create widgets as no dataframes were loaded.")
else:
    df_selector = widgets.Dropdown(
        options=available_df_names,
        value=available_df_names[0] if available_df_names else None,
        description='Select DataFrame:',
        disabled=False,
    )
    
    column_selector = widgets.Dropdown(
        description='Select Column (for Value Counts):',
        disabled=True 
    )
    
    rows_to_display_slider = widgets.IntSlider(
        value=5, min=1, max=50, step=1, description='Rows in Head:'
    )
    
    output_head = widgets.Output()
    output_info = widgets.Output()
    output_describe = widgets.Output()
    output_missing = widgets.Output()
    output_value_counts = widgets.Output()

    # Helper functions to safely access change event attributes
    def get_change_value(change_event):
        """Helper to get value from actual widget event or mocked SimpleNamespace."""
        return change_event.new

    def get_change_owner(change_event):
        """Helper to get owner from actual widget event or mocked SimpleNamespace."""
        return change_event.owner
    
    def update_column_selector(selected_df_name):
        if selected_df_name in all_dataframes:
            df = all_dataframes[selected_df_name]
            column_selector.options = sorted(list(df.columns))
            column_selector.disabled = False
            if df.columns.any():
                 column_selector.value = df.columns[0] # Default to first column
            else:
                 column_selector.value = None # No columns to select
        else:
            column_selector.options = []
            column_selector.disabled = True
            column_selector.value = None
            
    def on_df_selection_change(change):
        selected_df_name = get_change_value(change)
        df = all_dataframes.get(selected_df_name)
        
        # Update column selector first, as its value might be needed
        update_column_selector(selected_df_name)
        
        # Clear previous outputs
        output_head.clear_output(wait=True)
        output_info.clear_output(wait=True)
        output_describe.clear_output(wait=True)
        output_missing.clear_output(wait=True)
        output_value_counts.clear_output(wait=True) # Clear value counts as well
        
        # Trigger value count update for the new dataframe and its (newly set) default column
        if column_selector.value is not None: # Check if a column is selected
            mock_column_change = SimpleNamespace(new=column_selector.value, owner=column_selector, name='value', type='change')
            on_column_or_slider_change(mock_column_change)
        
        with output_head:
            display(style_dataframe(df, rows_to_display_slider.value))
        with output_info:
            display(display_df_info(df))
        with output_describe:
            display(display_df_describe(df))
        with output_missing:
            display(display_missing_values(df))
            
    def on_column_or_slider_change(change):
        df_name = df_selector.value
        df = all_dataframes.get(df_name)
        owner = get_change_owner(change)
        new_value = get_change_value(change)
        
        if owner == rows_to_display_slider:
            output_head.clear_output(wait=True)
            with output_head:
                display(style_dataframe(df, new_value))
        
        if owner == column_selector:
            selected_column = new_value
            output_value_counts.clear_output(wait=True)
            with output_value_counts:
                if df is not None and selected_column is not None and selected_column in df.columns:
                    styled_vc_table = display_value_counts(df, selected_column)
                    display(styled_vc_table)
                    counts = df[selected_column].value_counts(dropna=False)
                    if len(counts) > 10: # top_n is 10 in display_value_counts
                        display(HTML(f"<p><i>... and {len(counts) - 10} more unique values for '{selected_column}'.</i></p>"))
                elif df is not None and selected_column is not None:
                    display(HTML(f"<p>Column '{selected_column}' not found in DataFrame '{df_name}'.</p>"))
                elif df is None:
                    display(HTML("<p>DataFrame not available.</p>"))
                # If selected_column is None (e.g. df has no columns), do nothing or show a message
                elif selected_column is None:
                    display(HTML(f"<p>No column selected for DataFrame '{df_name}'.</p>"))

    # Attach observers
    df_selector.observe(on_df_selection_change, names='value')
    column_selector.observe(on_column_or_slider_change, names='value')
    rows_to_display_slider.observe(on_column_or_slider_change, names='value')
    
    # Initial population of column selector and trigger display for default selection
    if available_df_names:
        # update_column_selector(df_selector.value) # This is now called at the start of on_df_selection_change
        # Trigger initial display for the default DataFrame
        mock_df_change_event = SimpleNamespace(new=df_selector.value, owner=df_selector, name='value', type='change')
        on_df_selection_change(mock_df_change_event)
        
    # Display widgets
    controls = widgets.VBox([df_selector, column_selector, rows_to_display_slider])
    
    tab_children = [
        output_head, 
        output_info, 
        output_describe, 
        output_missing, 
        output_value_counts
    ]
    tab = widgets.Tab()
    tab.children = tab_children
    tab.set_title(0, 'Head')
    tab.set_title(1, 'Info')
    tab.set_title(2, 'Describe')
    tab.set_title(3, 'Missing Values')
    tab.set_title(4, 'Value Counts')
    
    display(controls, tab)

VBox(children=(Dropdown(description='Select DataFrame:', options=('patient', 'patient_reaction', 'patient_drug…

Tab(children=(Output(), Output(), Output(), Output(), Output()), _titles={'0': 'Head', '1': 'Info', '2': 'Desc…

## Notes and Further EDA Ideas

- **Data Type Specific EDA**: The current EDA is generic. For specific DataFrames (e.g., `patient_drug`, `patient_reaction`), more targeted analyses and visualizations could be added.
- **Visualizations**: 
    - Histograms/Bar charts for distributions of key numerical/categorical columns.
    - Scatter plots for relationships between numerical variables (if any make sense).
    - Time series plots if date information is relevant and parsed (e.g., `receiptdate`).
    - Using libraries like `matplotlib`, `seaborn`, or `plotly` for more advanced plotting.
- **Advanced Value Counts**: For columns with many unique values, consider grouping or showing only the most/least frequent.
- **Memory Management**: For very large combined DataFrames, consider sampling or using Dask for EDA if memory becomes an issue (though `low_memory=False` was used, full data is loaded here).