In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import glob
# Requires paramnb package to be installed
import param,paramnb

# Contents
### Data Frame
<div style="margin-left:50px">
<a href="#filesel">File selector</a><br>
<a href="#shape">Shape</a><br>
<a href="#cols">Column names and data types</a><br>
<a href="#preview">Preview</a><br>
</div>

### Column
<div style="margin-left:50px">
<a href="#colsel">Column selector</a><br>
<a href="#coldescription">Description</a><br>
<a href="#colvalues">Values count</a><br>
</div>

### Filter
<div style="margin-left:50px">
<a href="#filtersel">Filter selector</a><br>
<a href="#filteredprev">Filtered preview</a><br>
<a href="#filteredcolsel">Filtered column selector</a><br>
<a href="#filtereddesc">Filtered description</a><br>
<a href="#filteredvalues">Filtered values count</a><br>
</div>


### Plots
<div style="margin-left:50px">
<a href="#intersel">Interaction selector</a><br>
<a href="#scatter">Scatter plot</a><br>
</div>

In [None]:
# Option lists shared by the selector widgets defined further down

# Text encodings offered in the file selector
available_encodings=["utf-8","latin1"]

# Conversions offered in the column selectors
available_dtypes=["Do not convert","Numeric (remove commas)"]

class DataConverters():
    """Namespace for helpers that clean a column before a dtype conversion."""

    @staticmethod
    def remove_commas(series):
        """Return the values of ``series`` with commas stripped from strings.

        Only ``str`` values are touched; numbers, ``None`` and NaN pass through
        unchanged. The input is never modified, so the caller's data frame
        column stays intact.

        Parameters
        ----------
        series : pandas.Series or iterable
            Values to clean, e.g. ``"1,234"``.

        Returns
        -------
        pandas.Series or list
            A new Series with the same index when given a Series, otherwise
            a list of the cleaned values.
        """
        def _strip(val):
            # isinstance is required here: type(val) is a class and never
            # compares equal to the string "str"
            return val.replace(",","") if isinstance(val,str) else val

        if isinstance(series,pd.Series):
            # map works element-wise and keeps the index, so it is safe for
            # filtered frames whose labels are not 0..n-1
            return series.map(_strip)
        return [_strip(val) for val in series]

# <font color="goldenrod">Enter a list of input files here to get started </font>

<a name="filesel"></a>

In [None]:
# A list of input files: fill in path2files using ONE of the two options below.
# The file selector widget offers exactly the paths listed here.

# obtained by wildcard
#path2files=glob.glob("")

# OR from a list
path2files=[
    
]

# Widget model for choosing which file to load and the encoding to read it with
class FileSelector(param.Parameterized):
    # One of the paths in path2files; no default is given here
    selected_file=param.ObjectSelector(objects=path2files)
    # Encoding choice, restricted to available_encodings
    file_encoding=param.ObjectSelector(default="utf-8",objects=available_encodings)

# Render the widget; next_n=4 is passed so that paramnb acts on the 4 cells that follow
# NOTE(review): presumably these cells are re-run when the widget is executed - confirm against paramnb docs
paramnb.Widgets(FileSelector,next_n=4)

<a name="shape"></a>

In [None]:
# Read the selected file into a data frame
selected_path=FileSelector.selected_file
if selected_path is None:
    # Fail with a clear message instead of an AttributeError on None.endswith
    raise ValueError("No input file selected: add paths to path2files and pick one in the file selector")

# Match the extension case-insensitively so e.g. "DATA.CSV" is recognised
if selected_path.lower().endswith("csv"):
    df=pd.read_csv(selected_path,encoding=FileSelector.file_encoding)
elif selected_path.lower().endswith(("xlsx","xls")):
    # Excel workbooks store their own text encoding; read_excel takes no encoding argument
    df=pd.read_excel(selected_path)
else:
    # Without this branch df would be undefined (or stale from a previous run)
    raise ValueError("Unsupported file type (expected csv, xlsx or xls): {}".format(selected_path))

print("Rows: {}\tCols: {}".format(df.shape[0],df.shape[1]))

<a name="cols"></a>

In [None]:
# One row per column of df: its name next to the dtype pandas assigned to it
pd.DataFrame(
    {
        "Column Names":df.columns.values,
        "Data Types":df.dtypes.values,
    }
)

<a name="preview"></a>

In [None]:
# Show up to 10 random rows, transposed so wide frames stay readable.
# Capping at len(df) avoids the ValueError sample() raises when asked for
# more rows than the frame contains.
df.sample(min(10,len(df))).transpose()

# <font color="goldenrod">Select a column to take a closer look</font>

<a name="colsel"></a>

In [None]:
# Widget model for picking the column to inspect and an optional conversion
class ColumnSelector(param.Parameterized):
    # Any column of the loaded data frame
    selected_col=param.ObjectSelector(objects=df.columns.values)
    # Conversion applied before describing the column, restricted to available_dtypes
    convert_dtype=param.ObjectSelector(default="Do not convert",objects=available_dtypes)
    
# Render the widget; next_n=3 is passed so that paramnb acts on the 3 cells that follow
paramnb.Widgets(ColumnSelector,next_n=3)

<a name="coldescription"></a>

In [None]:
# Build the series to analyse according to the conversion picked in the column selector
if ColumnSelector.convert_dtype=="Do not convert":
    column_series=df[ColumnSelector.selected_col]
elif ColumnSelector.convert_dtype=="Numeric (remove commas)":
    # Strip commas first, then coerce: anything still unparseable becomes NaN
    column_series=pd.to_numeric(
        DataConverters.remove_commas(df[ColumnSelector.selected_col]),
        errors="coerce"
    )

print(column_series.describe())

# A box plot is drawn only when the series ended up numeric
if np.issubdtype(column_series.dtype,np.number):
    plt.boxplot(column_series.dropna())

<a name="colvalues"></a>

In [None]:
# The missing-value count is reported for every column type, so print it once
print("NA count: {}".format(np.sum(pd.isnull(column_series))))

if np.issubdtype(column_series.dtype,np.number):
    # Numeric column: histogram of the non-missing values
    plt.hist(column_series.dropna())
    plt.xlabel(column_series.name)
    plt.ylabel("Count")
else:
    # Non-numeric column: frequency table, with missing values as their own row
    print(pd.DataFrame(column_series.value_counts(dropna=False)))

# <font color="goldenrod">Look at data filtered by the selected column</font>

<a name="filtersel"></a>

In [None]:
# Build a filter widget whose value control matches the type of the selected column
if np.issubdtype(column_series.dtype,np.number):
    class FilterSelector(param.Parameterized):
        # Numeric column: range slider over the observed values. The default
        # selects the full range so filter_by_value is usable before the
        # slider is touched.
        filter_by_value=param.Range(
            default=(column_series.min(), column_series.max()),
            bounds=(column_series.min(), column_series.max())
        )
        filter_by_na=param.Boolean()
else:    
    class FilterSelector(param.Parameterized):
        # Non-numeric column: offer each distinct value once instead of one
        # entry per row. Missing values are left out because they are
        # selected through filter_by_na.
        filter_by_value=param.ObjectSelector(objects=list(column_series.dropna().unique()))
        filter_by_na=param.Boolean()

# A dropdown with thousands of entries may take too long to load, so the widget
# is rendered only for numeric columns or columns with fewer than 2000 distinct values
if np.issubdtype(column_series.dtype,np.number) or len(column_series.unique())<2000:
    paramnb.Widgets(FilterSelector,next_n=2)
else:
    # Say why nothing is shown instead of skipping the widget silently
    print("Filter widget not shown: {} distinct values (limit is 2000)".format(len(column_series.unique())))

<a name="filteredprev"></a>

In [None]:
# Build a row mask from column_series, the (possibly converted) selected column,
# so the filter is applied to the same values that were described above.
# Filtering the raw df column would compare strings with numbers whenever
# "Numeric (remove commas)" was chosen.
if FilterSelector.filter_by_na:
    row_mask=column_series.isnull()
elif np.issubdtype(column_series.dtype,np.number):
    # Inclusive range taken from the slider
    range_low,range_high=FilterSelector.filter_by_value
    row_mask=(column_series>=range_low) & (column_series<=range_high)
else:
    row_mask=column_series==FilterSelector.filter_by_value

# Positional boolean mask: column_series has one entry per row of df
subdf=df[np.array(row_mask)]

# Show up to 10 random rows; capping at len(subdf) avoids the ValueError
# sample() raises when the filter keeps fewer than 10 rows
display(subdf.sample(min(10,len(subdf))).transpose())

# <font color="goldenrod">Distribution of a column after filtering</font>

<a name="filteredcolsel"></a>

In [None]:
# Widget model for picking a column of the filtered frame and an optional conversion
class FilteredColumnSelector(param.Parameterized):
    # Any column of the loaded data frame
    selected_col=param.ObjectSelector(objects=df.columns.values)
    # Conversion applied before describing the column, restricted to available_dtypes
    convert_dtype=param.ObjectSelector(default="Do not convert",objects=available_dtypes)
    
# Render the widget; next_n=2 is passed so that paramnb acts on the 2 cells that follow
paramnb.Widgets(FilteredColumnSelector,next_n=2)

<a name="filtereddesc"></a>

In [None]:
# Build the filtered series according to the conversion picked in the filtered column selector
if FilteredColumnSelector.convert_dtype=="Do not convert":
    filtered_column_series=subdf[FilteredColumnSelector.selected_col]
elif FilteredColumnSelector.convert_dtype=="Numeric (remove commas)":
    # Strip commas first, then coerce: anything still unparseable becomes NaN
    filtered_column_series=pd.to_numeric(
        DataConverters.remove_commas(subdf[FilteredColumnSelector.selected_col]),
        errors="coerce"
    )

print(filtered_column_series.describe())

# A box plot is drawn only when the series ended up numeric
if np.issubdtype(filtered_column_series.dtype,np.number):
    plt.boxplot(filtered_column_series.dropna())

<a name="filteredvalues"></a>

In [None]:
# The missing-value count is reported for every column type, so print it once
print("NA count: {}".format(np.sum(pd.isnull(filtered_column_series))))

if np.issubdtype(filtered_column_series.dtype,np.number):
    # Numeric column: histogram of the non-missing values
    plt.hist(filtered_column_series.dropna())
    plt.xlabel(filtered_column_series.name)
    plt.ylabel("Count")
else:
    # Non-numeric column: frequency table, with missing values as their own row
    print(pd.DataFrame(filtered_column_series.value_counts(dropna=False)))

# <font color="goldenrod">Interaction between two variables</font>

<a name="intersel"></a>

In [None]:
# Widget model for choosing the two columns to plot against each other
class InteractionSelector(param.Parameterized):
    # Plotted on the x axis unless invert_axis is ticked
    first_variable=param.ObjectSelector(objects=df.columns.values)
    # Plotted on the y axis unless invert_axis is ticked
    second_variable=param.ObjectSelector(objects=df.columns.values)
    # When ticked, the scatter plot swaps the two axes
    invert_axis=param.Boolean()
    
# Render the widget; next_n=1 is passed so that paramnb acts on the cell that follows
paramnb.Widgets(InteractionSelector,next_n=1)

<a name="scatter"></a>

In [None]:
# Scatter plot of the two selected variables

# invert_axis swaps which variable goes on which axis
if InteractionSelector.invert_axis:
    x_var,y_var=InteractionSelector.second_variable,InteractionSelector.first_variable
else:
    x_var,y_var=InteractionSelector.first_variable,InteractionSelector.second_variable

# Drop only the rows that miss one of the two plotted variables. A plain
# df.dropna() would also discard rows with a NaN in any unrelated column,
# which can leave the plot nearly empty.
# Selecting the same column twice would create duplicate labels, so it is listed once.
plot_cols=[x_var] if x_var==y_var else [x_var,y_var]
plot_df=df[plot_cols].dropna()

plt.figure(figsize=(10,10))
plt.scatter(plot_df[x_var],plot_df[y_var],marker="x",s=2)
plt.xlabel(x_var)
plt.ylabel(y_var)

# Do not change the code above; add your custom code below