In [2]:
from palmerpenguins import load_penguins
import altair as alt
import pandas as pd

In [3]:
penguins = load_penguins()

In [4]:
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [9]:
penguins.select_dtypes(include=['datetime'])

penguins.select_dtypes(include=['number'])

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
0,39.1,18.7,181.0,3750.0,2007
1,39.5,17.4,186.0,3800.0,2007
2,40.3,18.0,195.0,3250.0,2007
3,,,,,2007
4,36.7,19.3,193.0,3450.0,2007
...,...,...,...,...,...
339,55.8,19.8,207.0,4000.0,2009
340,43.5,18.1,202.0,3400.0,2009
341,49.6,18.2,193.0,3775.0,2009
342,50.8,19.0,210.0,4100.0,2009


In [10]:
penguins.select_dtypes(include=['object'])

Unnamed: 0,species,island,sex
0,Adelie,Torgersen,male
1,Adelie,Torgersen,female
2,Adelie,Torgersen,female
3,Adelie,Torgersen,
4,Adelie,Torgersen,female
...,...,...,...
339,Chinstrap,Dream,male
340,Chinstrap,Dream,female
341,Chinstrap,Dream,male
342,Chinstrap,Dream,male


In [11]:
alt.renderers.enable(embed_options={'theme': 'ggplot2'})

RendererRegistry.enable('default')

In [12]:
penguins.select_dtypes(include='number').columns.tolist()

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'year']

In [13]:
alt.Chart(penguins).mark_bar().encode(
    alt.X('bill_length_mm', bin=alt.Bin(maxbins=50)), y='count()')

In [14]:
alt.Chart(penguins).mark_bar().encode(
    x=alt.X('count()'),
    y=alt.Y('species', sort='-x')    
)

In [25]:
def plot_basic_distb_numeric(df, col_name):
    return alt.Chart(penguins).mark_bar().encode(
            alt.X(col_name, bin=alt.Bin(maxbins=50)), y='count()')


def plot_basic_distb_string(df, col_name):
    return alt.Chart(penguins).mark_bar().encode(
            x=alt.X('count()'),
            y=alt.Y(col_name, sort='-x')    
            )


def plot_basic_distributions(df, cols=None, include=None, vega_theme="ggplot2"):
    """Takes a dataframe and generates plots based on types

    Parameters
    -----------
    df: pd.DataFrame
        Dataframe from which to generate plots for each column from
    cols: list, optional
        List of columns to generate plots for. By default, None (builds charts for all columns).
    include: string, optional
        Select the data types to include. Supported values include None, "string" and "number". By default, None - it will return both string and number columns.
    vega_theme : string, optional
        Select the vega.themes for the altair plots. The options include: excel, ggplot2, quartz, vox, fivethirtyeight, dark, latimes, urbaninstitute, and googlecharts. By default, it uses ggplot2.

    Returns
    -------
    dict_plots: dict of altair.Chart objects using the column name as the key
        dictionary of generated altair.Chart objects with the column name as the key

    Examples
    -------
    >>> example_df = pd.DataFrame({'animal': ['falcon', 'dog', 'spider', 'fish'],
                                    'num_legs': [2, 4, 8, 0],
                                    'num_wings': [2, 0, 0, 0],
                                    'num_specimen_seen': [10, 2, 1, 8]})
    >>> instaeda_py.plot_distribution(example_df)
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("The df parameter must be a pandas dataframe")


    if vega_theme not in ('excel','ggplot2','quartz','vox','fivethirtyeight', 'dark', 'latimes', 'urbaninstitute', 'googlecharts'):
        warnings.warn("You have selected a theme that is not one of the default Vega color themes.")

    dict_plots = {}
    df_data = None

    # Set vega theme
    alt.renderers.enable(embed_options={'theme': vega_theme})

    # First filter:  select columns
    if cols is None:    
        df_data = df
    else:
        df_data = df[cols]    
    
    if include not in (None, 'number', 'string'):
        raise KeyError("The include parameter must be None, 'number' or 'string'")

    # Second filter: select types to include
    if include == 'number' or include is None:
        
        df_data_number = df_data.select_dtypes(include="number")        
        for col in df_data_number.columns.tolist():
            dict_plots[col] = alt.Chart(df_data_number).mark_bar().encode(
                                alt.X(col, bin=alt.Bin(maxbins=50)), y='count()')
    
    if include == 'string' or include is None:

        df_data_string = df_data.select_dtypes(include="object")
        for col in df_data_string.columns.tolist():            
            dict_plots[col] = alt.Chart(df_data_string).mark_bar().encode(
                                    x=alt.X('count()'),
                                    y=alt.Y(col, sort='-x')
                                )
            
    return dict_plots

In [33]:
dict_plot = plot_basic_distributions(penguins)
dict_plot['bill_length_mm'].mark, dict_plot['sex'].mark


('bar', 'bar')

In [53]:
len(dict_plot)

8

In [52]:

dict_plot['bill_length_mm'].encoding.y['shorthand']=='count()'
dict_plot['sex'].encoding.x['shorthand']=='count()'

len(dict_plot.keys()) == 8
list(dict_plot.keys()) == ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'year', 'species', 'island', 'sex']

True

In [27]:
dict_plot

{'bill_length_mm': alt.Chart(...),
 'bill_depth_mm': alt.Chart(...),
 'flipper_length_mm': alt.Chart(...),
 'body_mass_g': alt.Chart(...),
 'year': alt.Chart(...),
 'species': alt.Chart(...),
 'island': alt.Chart(...),
 'sex': alt.Chart(...)}

# Divide & Fill prototypes

In [18]:
import numpy as np
from sklearn.impute import SimpleImputer

In [19]:
def divide_and_fill(
    dataframe,
    cols=None,
    missing_values=np.nan,
    strategy="mean",
    fill_value=None,
    random=False,
    parts=1,
    verbose=0,
):
    """Takes a dataframe, subsets selected columns and divides into parts for imputation of missing values and returns a data frame.

    Parameters
    -----------
    dataframe: pd.DataFrame
        Dataframe from which to take columns and check for missing values.
    cols: list, optional
        List of columns to perform imputation on. By default, None (perform on all numeric columns).
    missing_values: int, float, str, np.nan or None
        The placeholder for the missing values. All occurences of missing values will be imputed.
    strategy : string, optional
        imputation strategy, one of: {'mean', 'median', 'constant', 'most_frequent'}. By default, 'mean'.
    fill_value : string or numerical value, optional
        When strategy == 'constant', fill_value is used to replace all occurences of missing_values.
        If left to default, fill_value will be 0 when filling numerical data and 'missing' for strings or object data types.
    random : boolean, optional
        When random == True, shuffles data frame before filling. By default, False.
    parts : integer, optional
        The number of parts to divide rows of data frame into. By default, 1.
    verbose : integer, optional
        Controls the verbosity of the divide and fill. By default, 0.


    Returns
    -------
    dataframe : pandas.DataFrame object
        Data frame obtained after divide and fill on the corresponding columns.

    Examples
    -------
    >>> import numpy as np
    >>> from instaeda import divide_and_fill
    >>> example_df = pd.DataFrame({'animal': ['falcon', 'dog', 'spider', 'fish'],
                                    'num_legs': [2, 4, 8, np.nan],
                                    'num_wings': [2, np.nan, 0, 0],
                                    'num_specimen_seen': [10, 2, np.nan, np.nan]})
    >>> divide_and_fill(example_df)
    """
    filled_df = None
    allowed_strategies = ["mean", "median", "constant", "most_frequent"]

    # Checking inputs
    if verbose:
        print("Checking inputs")

    if not isinstance(dataframe, pd.DataFrame):
        raise Exception("The input data must be of type pandas.DataFrame!")

    if cols == None:
        cols = list(dataframe.select_dtypes(include="number").columns)

    if (
        not isinstance(cols, list)
        or not all(isinstance(x, str) for x in cols)
        or not set(cols).issubset(set(dataframe.columns))
    ):
        raise Exception(
            "The input cols must be a list of strings belong to the column names for input dataframe!"
        )

    if (
        not isinstance(missing_values, int)
        and not isinstance(missing_values, float)
        and not isinstance(missing_values, str)
        and (missing_values is not None)
    ):
        raise Exception(
            "The input missing values must be one of the following: (int, float, str, np.nan, None)"
        )

    if strategy not in allowed_strategies:
        raise ValueError(
            "Can only use these strategies: {0} got strategy = {1}".format(
                allowed_strategies, strategy
            )
        )

    if (
        (fill_value is not None)
        and not isinstance(fill_value, int)
        and not isinstance(fill_value, float)
        and not isinstance(fill_value, str)
    ):
        raise Exception(
            "The input fill values must be one of the following: (int, float, str, None)"
        )

    if not isinstance(random, bool):
        raise Exception("The input random must be True or False")

    if not isinstance(parts, int) or (parts < 1):
        raise ValueError("Can only use positive integer parts.")

    if not isinstance(verbose, int):
        raise ValueError("Can only use integer for verbose.")

    # Constructing filled dataframe skeleton.
    if verbose:
        print("Constructing filled dataframe skeleton.")

    if random:
        filled_df = dataframe.copy().sample(frac=1).reset_index(drop=True)
    else:
        filled_df = dataframe.copy()

    if (set(cols) <= set(dataframe.select_dtypes(include="number").columns)):
        if isinstance(fill_value, str) :
            raise ValueError(
                "For numeric columns, can only use fill values: (int, float, None)"
            )
    elif (set(cols) <= set(dataframe.select_dtypes(exclude="number").columns)):
        if isinstance(fill_value, int) or isinstance(fill_value, float):
            raise ValueError(
                "For non-numeric columns, can only use fill values: (None, str)"
            )
    else:
        raise Exception("All items in list cols must be numeric, or non-numeric.")

    # Filling data frame
    spacing = filled_df.shape[0]/(parts + 1)
    indexing = np.arange(
        0, filled_df.shape[0] + spacing, spacing, dtype=int
    )
    
    for i in range(len(indexing) - 1):
        imputer = SimpleImputer(
            missing_values=missing_values, strategy=strategy, fill_value=fill_value
        )
        filled_df.loc[indexing[i] : indexing[i + 1], cols] = imputer.fit_transform(
            filled_df.loc[indexing[i] : indexing[i + 1], cols]
        )

    if verbose:
        print("Returning data frame.")
    return filled_df

In [20]:
na_numerical_dataframe = pd.DataFrame(
    {"col_1": [1, 2], "col_2": [np.nan, 0.9], "col_3": ["a", "b"]}
)

In [21]:
df_new = pd.DataFrame({'a':[1,2,3],'b':[3,np.nan,4],'c':[2,np.nan,np.nan]})

In [22]:
df_new

Unnamed: 0,a,b,c
0,1,3.0,2.0
1,2,,
2,3,4.0,


In [23]:
divide_and_fill(df_new)

Unnamed: 0,a,b,c
0,1.0,3.0,2.0
1,2.0,3.0,2.0
2,3.0,4.0,2.0


In [24]:
from pandas._testing import assert_frame_equal
assert_frame_equal(divide_and_fill(penguins, cols = ['year']), penguins, check_dtype = False)