In [10]:
from palmerpenguins import load_penguins
import altair as alt
import pandas as pd

In [6]:
# Load the penguins dataset using the loader imported from palmerpenguins.
penguins = load_penguins()

In [7]:
# Echo the loaded data to inspect its columns; the recorded output elides the middle rows.
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [27]:
# Datetime columns: this is not the cell's last expression, so its result is not displayed.
penguins.select_dtypes(include=['datetime'])

# Numeric columns only; as the last expression, this is what the cell shows.
penguins.select_dtypes(include=['number'])

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
0,39.1,18.7,181.0,3750.0,2007
1,39.5,17.4,186.0,3800.0,2007
2,40.3,18.0,195.0,3250.0,2007
3,,,,,2007
4,36.7,19.3,193.0,3450.0,2007
...,...,...,...,...,...
339,55.8,19.8,207.0,4000.0,2009
340,43.5,18.1,202.0,3400.0,2009
341,49.6,18.2,193.0,3775.0,2009
342,50.8,19.0,210.0,4100.0,2009


In [28]:
# Object-dtype columns (species, island and sex in the recorded output).
penguins.select_dtypes(include=['object'])

Unnamed: 0,species,island,sex
0,Adelie,Torgersen,male
1,Adelie,Torgersen,female
2,Adelie,Torgersen,female
3,Adelie,Torgersen,
4,Adelie,Torgersen,female
...,...,...,...
339,Chinstrap,Dream,male
340,Chinstrap,Dream,female
341,Chinstrap,Dream,male
342,Chinstrap,Dream,male


In [59]:
# Enable the renderer with embed options that select the 'ggplot2' theme for displayed charts.
alt.renderers.enable(embed_options={'theme': 'ggplot2'})

RendererRegistry.enable('default')

In [64]:
# Names of the numeric columns as a plain Python list.
penguins.select_dtypes(include='number').columns.tolist()

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'year']

In [60]:
# Histogram prototype: bill_length_mm binned on x (maxbins=50) against the record count on y.
alt.Chart(penguins).mark_bar().encode(
    alt.X('bill_length_mm', bin=alt.Bin(maxbins=50)), y='count()')

In [61]:
# Categorical prototype: record count on x, one bar per species on y,
# with the bars ordered by descending x value (sort='-x').
alt.Chart(penguins).mark_bar().encode(
    x=alt.X('count()'),
    y=alt.Y('species', sort='-x')    
)

In [117]:
def plot_basic_distb_numeric(df, col_name):
    """Build a histogram of one numeric column.

    Parameters
    ----------
    df : pd.DataFrame
        Data holding the column to plot.
    col_name : str
        Name of the column to bin along the x axis.

    Returns
    -------
    altair.Chart
        Bar chart with ``col_name`` binned on x (``maxbins=50``) and the
        record count on y.
    """
    # Chart the frame that was passed in; the previous version ignored `df`
    # and always plotted the notebook-level `penguins` variable.
    return alt.Chart(df).mark_bar().encode(
        alt.X(col_name, bin=alt.Bin(maxbins=50)), y='count()')


def plot_basic_distb_string(df, col_name):
    """Build a horizontal bar chart of value counts for one categorical column.

    Parameters
    ----------
    df : pd.DataFrame
        Data holding the column to plot.
    col_name : str
        Name of the column whose distinct values become the bars.

    Returns
    -------
    altair.Chart
        Bar chart with the record count on x and ``col_name`` on y, with the
        bars ordered by descending x value (``sort='-x'``).
    """
    # Chart the frame that was passed in; the previous version ignored `df`
    # and always plotted the notebook-level `penguins` variable.
    return alt.Chart(df).mark_bar().encode(
        x=alt.X('count()'),
        y=alt.Y(col_name, sort='-x'),
    )


def plot_basic_distributions(df, cols=None, include=None, vega_theme="ggplot2"):
    """Takes a dataframe and generates one distribution plot per column, based on dtype.

    Numeric columns get a histogram (via ``plot_basic_distb_numeric``) and
    object-dtype columns get a count bar chart (via ``plot_basic_distb_string``).
    Columns of any other dtype are skipped.

    Note that this also enables the altair renderer with the requested theme,
    which is a global setting and affects charts rendered afterwards.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe from which to generate plots for each column from
    cols : list or str, optional
        List of columns to generate plots for (a single column name is also
        accepted). By default, None (builds charts for all columns).
    include : string, optional
        Select the data types to include. Supported types include "string" and
        "number". By default, it will return both string and number columns.
    vega_theme : string, optional
        Select the vega.themes for the altair plots. The options include: excel,
        ggplot2, quartz, vox, fivethirtyeight, dark, latimes, urbaninstitute, and
        googlecharts. By default, it uses ggplot2.

    Returns
    -------
    dict_plots : dict of altair.Chart objects using the column name as the key
        dictionary of generated altair.Chart objects with the column name as the key

    Raises
    ------
    ValueError
        If ``include`` is not None, "number" or "string".

    Examples
    --------
    >>> example_df = pd.DataFrame({'animal': ['falcon', 'dog', 'spider', 'fish'],
    ...                            'num_legs': [2, 4, 8, 0],
    ...                            'num_wings': [2, 0, 0, 0],
    ...                            'num_specimen_seen': [10, 2, 1, 8]})
    >>> dict_plots = plot_basic_distributions(example_df)
    >>> chart = dict_plots['num_legs']
    """
    # Reject unsupported type filters up front instead of silently returning
    # an empty dict (and before touching the global renderer settings).
    if include not in (None, 'number', 'string'):
        raise ValueError(
            "include must be None, 'number' or 'string', got {!r}".format(include)
        )

    # Set vega theme
    alt.renderers.enable(embed_options={'theme': vega_theme})

    # First filter: select columns
    if cols is None:
        df_data = df
    else:
        # A bare column name would make df[cols] a Series, which has no
        # select_dtypes; always index with a list so the result is a DataFrame.
        if isinstance(cols, str):
            cols = [cols]
        df_data = df[list(cols)]

    dict_plots = {}

    # Second filter: select types to include
    if include in (None, 'number'):
        df_data_number = df_data.select_dtypes(include="number")
        for col in df_data_number.columns.tolist():
            dict_plots[col] = plot_basic_distb_numeric(df_data_number, col)

    if include in (None, 'string'):
        df_data_string = df_data.select_dtypes(include="object")
        for col in df_data_string.columns.tolist():
            dict_plots[col] = plot_basic_distb_string(df_data_string, col)

    return dict_plots

In [137]:
# Build the charts for every column with the defaults, then check the type of the 'sex' entry.
dict_plot = plot_basic_distributions(penguins)
type(dict_plot['sex'])

altair.vegalite.v4.api.Chart

In [138]:
# Confirm the returned entry is an alt.Chart instance.
isinstance(dict_plot['sex'], alt.Chart)

True