In [79]:
from vega_datasets import data
import altair as alt
import pandas as pd

In [2]:
# Load the iris sample data, then list every column except the last one
# (the last column is the `species` label).
df = data.iris()
df.columns[:-1]

Index(['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth'], dtype='object')

In [3]:
# One point plot per measurement column: species on the y-axis and the
# repeated quantitative field on an x-axis that is not forced to include zero.
alt.Chart(df).mark_point().encode(
    x=alt.X(
        alt.repeat('repeat'),
        type='quantitative',
        scale=alt.Scale(zero=False),
    ),
    y='species',
).repeat(repeat=df.columns[:-1].tolist())

In [4]:
# Prototype: kernel-density plots of the iris measurements, split by species.
single = alt.selection_single()  # not referenced by the charts in this cell

# Density of a single feature, one area per species.
chart1 = (
    alt.Chart(df)
    .transform_density(
        'sepalLength',
        as_=['sepalLength', 'density'],
        groupby=['species'],
    )
    .mark_area(interpolate='monotone', opacity=0.7)
    .encode(
        x=alt.X('sepalLength'),
        y='density:Q',
        color='species',
    )
)

features = ['sepalLength', 'sepalWidth', 'petalWidth']
chartlist = []

for feat in features:
    chart = (
        alt.Chart(df)
        .transform_density(
            feat,
            as_=[feat, 'density'],
            # The density transform only keeps the value, the density and the
            # groupby fields, so `species` must be grouped on for the colour
            # encoding below to have anything to show.
            groupby=['species'],
        )
        .mark_area(interpolate='monotone', opacity=0.7)
        .encode(
            x=alt.X(feat),
            y='density:Q',
            color='species',
        )
    )
    chartlist.append(chart)

# Stack the per-feature density plots vertically.
output = alt.vconcat(*chartlist)

output
        

0
1
2


In [185]:
def arid_eda(data_frame, response, response_type, features=None):
    """
    Create basic EDA plots and a correlation summary for selected features.

    For every feature a distribution plot is drawn (a density plot split by
    the response when the response is categorical, a binned histogram when it
    is continuous). A Spearman correlation heatmap of the features is placed
    next to the distribution plots.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        The input dataframe to analyze
    response : str
        A column name of the response variable. It is only used to group and
        colour the density plots when ``response_type`` is 'categorical'.
    response_type : str
        Either 'categorical' or 'continuous'
    features : list
        A list of the feature names to perform EDA on. Must not be empty.

    Returns
    -------
    pandas.DataFrame
        Long-format Spearman correlations between the features with columns
        'level_0', 'level_1', 'corr' and 'abs'. The self-correlation of each
        feature is reported as 0.

    altair chart
        The distribution plots concatenated with the correlation heatmap

    Raises
    ------
    ValueError
        If ``features`` is empty or ``response_type`` is not one of
        'categorical' / 'continuous'.

    Examples
    --------
    >>> from aridanalysis import aridanalysis
    >>> dataframe, plots = arid_eda(house_prices, 'price', 'continuous', ['rooms', 'age', 'garage'])

    """
    features = [] if features is None else list(features)
    if not features:
        raise ValueError("features must contain at least one column name")
    if response_type not in ('categorical', 'continuous'):
        raise ValueError("response_type must be 'categorical' or 'continuous'")

    chartlist = []
    # The heatmap grows with the number of features (70px per feature).
    plot_width = 70 * len(features)
    plot_height = 70 * len(features)

    if response_type == 'categorical':
        # One density plot per feature, one area per response category.
        for feat in features:
            chart = alt.Chart(data_frame).transform_density(
                feat,
                as_=[feat, 'density'],
                groupby=[response]
            ).mark_area(interpolate='monotone', opacity=0.7).encode(
                y='density:Q',
                x=alt.X(feat),
                color=response
            )
            chartlist.append(chart)
    else:
        # Continuous response: one binned histogram per feature.
        for feat in features:
            chart = alt.Chart(data_frame).mark_bar().encode(
                y='count()',
                x=alt.X(feat, bin=alt.Bin())
            ).properties(width=200, height=200)
            chartlist.append(chart)

    # Combine the distribution plots: odd positions are appended horizontally,
    # even positions vertically.
    dist_output = chartlist[0]
    for position, chart in enumerate(chartlist[1:], start=1):
        if position % 2 == 0:
            dist_output = alt.vconcat(dist_output, chart)
        else:
            dist_output = alt.hconcat(dist_output, chart)

    feature_df = data_frame.loc[:, features]
    corr_df = feature_df.corr('spearman').stack().reset_index(name='corr')
    # Zero only the diagonal (feature vs. itself) so it does not dominate the
    # colour scale; genuine perfect correlations between two different
    # features are kept.
    corr_df.loc[corr_df['level_0'] == corr_df['level_1'], 'corr'] = 0
    corr_df['abs'] = corr_df['corr'].abs()

    corr_square = alt.Chart(corr_df).mark_rect().encode(
        x='level_0',
        y='level_1',
        color=alt.Color('corr', scale=alt.Scale(scheme='blueorange'))
    ).properties(width=plot_width, height=plot_height)

    corr_text = alt.Chart(corr_df).mark_text().encode(
        x='level_0',
        y='level_1',
        text='corr',
        # alt.value gives a constant colour; a bare string would be read as a
        # field name.
        color=alt.value('black')
    )
    corr_plot = corr_square + corr_text

    return pd.DataFrame(corr_df), dist_output | corr_plot

In [218]:
def arid_eda(data_frame, response, response_type, features=None):
    """
    Create basic EDA plots and a correlation summary for selected features.

    For every feature a distribution plot is drawn (a density plot split by
    the response when the response is categorical, a binned histogram when it
    is continuous). A labelled Spearman correlation heatmap of the features
    is placed next to the distribution plots.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        The input dataframe to analyze
    response : str
        A column name of the response variable. It is only used to group and
        colour the density plots when ``response_type`` is 'categorical'.
    response_type : str
        Either 'categorical' or 'continuous'
    features : list
        A list of the feature names to perform EDA on. Must not be empty.

    Returns
    -------
    pandas.DataFrame
        Long-format Spearman correlations between the features with columns
        'level_0', 'level_1', 'corr', 'corr_label' and 'abs'. The
        self-correlation of each feature is reported as 0.

    altair chart
        The distribution plots concatenated with the correlation heatmap

    Raises
    ------
    ValueError
        If ``features`` is empty or ``response_type`` is not one of
        'categorical' / 'continuous'.

    Examples
    --------
    >>> from aridanalysis import aridanalysis
    >>> dataframe, plots = arid_eda(house_prices, 'price', 'continuous', ['rooms', 'age', 'garage'])

    """
    features = [] if features is None else list(features)
    if not features:
        raise ValueError("features must contain at least one column name")
    if response_type not in ('categorical', 'continuous'):
        raise ValueError("response_type must be 'categorical' or 'continuous'")

    chartlist = []
    # The heatmap grows with the number of features (70px per feature).
    plot_width = 70 * len(features)
    plot_height = 70 * len(features)

    if response_type == 'categorical':
        # One density plot per feature, one area per response category.
        for feat in features:
            chart = alt.Chart(data_frame).transform_density(
                feat,
                as_=[feat, 'density'],
                groupby=[response]
            ).mark_area(interpolate='monotone', opacity=0.7).encode(
                y='density:Q',
                x=alt.X(feat),
                color=response
            )
            chartlist.append(chart)
    else:
        # Continuous response: one binned histogram per feature.
        for feat in features:
            chart = alt.Chart(data_frame).mark_bar().encode(
                y='count()',
                x=alt.X(feat, bin=alt.Bin())
            ).properties(width=200, height=200)
            chartlist.append(chart)

    # Combine the distribution plots: odd positions are appended horizontally,
    # even positions vertically.
    dist_output = chartlist[0]
    for position, chart in enumerate(chartlist[1:], start=1):
        if position % 2 == 0:
            dist_output = alt.vconcat(dist_output, chart)
        else:
            dist_output = alt.hconcat(dist_output, chart)

    feature_df = data_frame.loc[:, features]
    corr_df = feature_df.corr('spearman').stack().reset_index(name='corr')
    # Zero only the diagonal (feature vs. itself) so it does not dominate the
    # colour scale; genuine perfect correlations between two different
    # features are kept.
    corr_df.loc[corr_df['level_0'] == corr_df['level_1'], 'corr'] = 0
    corr_df['corr_label'] = corr_df['corr'].map('{:.2f}'.format)
    corr_df['abs'] = corr_df['corr'].abs()

    # Shared x/y encoding and size for both heatmap layers.
    base = alt.Chart(corr_df).encode(
        x='level_0',
        y='level_1'
    ).properties(width=plot_width, height=plot_height)

    # Text layer with the two-decimal correlation labels, drawn in white.
    text = base.mark_text().encode(
        text='corr_label',
        color=alt.value('white')
    )

    # The correlation heatmap itself
    cor_sq = base.mark_rect().encode(
        color=alt.Color('corr', scale=alt.Scale(scheme='blueorange'))
    )

    corr_plot = cor_sq + text

    return pd.DataFrame(corr_df), dist_output | corr_plot

In [219]:
# Run the EDA helper with `species` as a categorical response and show the plots.
dataframe, plots = arid_eda(
    df,
    'species',
    'categorical',
    features=['sepalWidth', 'petalWidth', 'petalLength'],
)
plots

In [57]:
# arid_eda returns a (correlation DataFrame, combined chart) pair, so unpack it
# instead of indexing three elements and concatenating the DataFrame.
corr_summary, charts = arid_eda(
    df,
    'sepalLength',
    'continuous',
    features=['sepalWidth', 'petalWidth', 'petalLength'],
)
charts

In [8]:
# Stand-alone Spearman correlation heatmap for the columns named in `features`.
plot_width = plot_height = 70 * len(features)

feature_df = df.loc[:, features]
corr_df = (
    feature_df
    .corr('spearman')
    .stack()
    .reset_index(name='corr')
)
# Entries equal to exactly 1 (e.g. a feature against itself) are set to 0.
corr_df.loc[corr_df['corr'] == 1, 'corr'] = 0
corr_df['abs'] = corr_df['corr'].abs()

corr_plot = (
    alt.Chart(corr_df)
    .mark_rect()
    .encode(
        x='level_0',
        y='level_1',
        size='abs',
        color=alt.Color('corr', scale=alt.Scale(scheme='blueorange')),
    )
    .properties(width=plot_width, height=plot_height)
)

corr_plot

In [18]:
# Display the iris DataFrame assigned to `df` earlier in the notebook.
df

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [40]:
# Build one binned histogram per feature. The x field must be the loop
# variable; a hard-coded column would produce the same chart every iteration.
histograms = []
for feat in features:
    chart = alt.Chart(df).mark_bar().encode(
        x=alt.X(feat, bin=alt.Bin()),
        y='count()',
    )
    histograms.append(chart)


In [None]:
# Single-item selection that drives the fill of the binned sepal-length area chart:
# selected marks are filled by species, everything else is light gray.
single = alt.selection_single()

chart1 = (
    alt.Chart(df)
    .mark_area()
    .encode(
        x=alt.X('sepalLength', bin=alt.Bin()),
        y='count()',
        fill=alt.condition(single, 'species', alt.value('lightgray')),
    )
    .add_selection(single)
)

# Binned sepal-length histograms restricted to one species each.
chart2 = (
    alt.Chart(df.query('species=="setosa"'))
    .mark_bar()
    .encode(
        x=alt.X('sepalLength', bin=alt.Bin()),
        y='count()',
    )
)
chart3 = (
    alt.Chart(df.query('species=="versicolor"'))
    .mark_bar()
    .encode(
        x=alt.X('sepalLength', bin=alt.Bin()),
        y='count()',
    )
)


In [None]:
# Render whichever chart is currently bound to `chart1`.
chart1


In [None]:


# Render whichever chart is currently bound to `chart2`.
chart2