In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import glob
# Requires paramnb package to be installed
import param,paramnb

# Contents
### Data Frame
<div style="margin-left:50px">
<a href="#filesel">File selector</a><br>
<a href="#shape">Shape</a><br>
<a href="#cols">Column names and data types</a><br>
<a href="#preview">Preview</a><br>
</div>

### Column
<div style="margin-left:50px">
<a href="#colsel">Column selector</a><br>
<a href="#coldescription">Description</a><br>
<a href="#colvalues">Values count</a><br>
</div>

### Filter
<div style="margin-left:50px">
<a href="#colsel">Column selector</a><br>
</div>

In [None]:
# Global option lists shared by the selector widgets defined further down.

# Text encodings offered when reading the input file.
available_encodings = ["utf-8", "latin1"]

# Conversions that can be applied to the selected column before analysis.
available_dtypes = ["Do not convert", "Numeric (remove commas)"]

class DataConverters():
    """Namespace for helpers that clean raw column values before type conversion."""

    @staticmethod
    def remove_commas(series):
        """Return a new series with commas stripped from every string value.

        Parameters
        ----------
        series : pandas.Series
            Column to clean. It is not modified.

        Returns
        -------
        pandas.Series
            Same index as ``series``; string values have all "," characters
            removed, every other value (numbers, missing values) is passed
            through unchanged.
        """
        # isinstance() is required here: comparing type(val) with the string
        # "str" is always False, which silently skipped every value.
        # map() builds a new series, so the caller's data is left untouched
        # and the result does not depend on the index labels being 0..n-1.
        return series.map(
            lambda val: val.replace(",", "") if isinstance(val, str) else val
        )

# <font color="goldenrod">Enter a list of input files here to get started </font>

<a name="filesel"></a>

In [None]:
# Input files to explore.
#
# Option 1 - collect them with a wildcard pattern:
#path2files=glob.glob("")
#
# Option 2 - list them explicitly:
path2files = [
    # "path/to/file.csv",
]


class FileSelector(param.Parameterized):
    """Choice of which input file to load and which text encoding to read it with."""

    # One of the paths listed in path2files.
    selected_file = param.ObjectSelector(objects=path2files)
    # Encoding passed to the reader; "utf-8" unless changed in the widget.
    file_encoding = param.ObjectSelector(default="utf-8", objects=available_encodings)


# Render the selector widgets; next_n=4 asks paramnb to re-run the 4 cells that follow.
paramnb.Widgets(FileSelector, next_n=4)

<a name="shape"></a>

In [None]:
# Read the selected file into a data frame
selected_path = FileSelector.selected_file
if selected_path is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError(
        "No input file selected: add at least one path to path2files "
        "and pick it in the file selector."
    )

# Compare the extension case-insensitively so e.g. "DATA.CSV" is recognised too.
lowered_path = selected_path.lower()
if lowered_path.endswith("csv"):
    df = pd.read_csv(selected_path, encoding=FileSelector.file_encoding)
elif lowered_path.endswith(("xlsx", "xls")):
    # Excel workbooks are binary files: read_excel has no text-encoding option,
    # so the encoding chosen in the widget only applies to CSV input.
    df = pd.read_excel(selected_path)
else:
    # Without this branch df would stay undefined (or keep a stale value from
    # a previously loaded file) and the cells below would fail or mislead.
    raise ValueError(
        "Unsupported file type (expected csv, xls or xlsx): {}".format(selected_path)
    )

print("Rows: {}\tCols: {}".format(df.shape[0], df.shape[1]))

<a name="cols"></a>

In [None]:
# One row per column of df: its name next to the dtype pandas inferred for it.
pd.DataFrame(
    {
        "Column Names": df.columns.values,
        "Data Types": df.dtypes.values,
    }
)

<a name="preview"></a>

In [None]:
# Preview up to 10 random rows, transposed so each column of df reads as a row.
# The sample size is capped at len(df): sample(10) raises ValueError on a
# frame with fewer than 10 rows.
df.sample(min(10, len(df))).transpose()

# <font color="goldenrod">Select a column to take a closer look</font>

<a name="colsel"></a>

In [None]:
class ColumnSelector(param.Parameterized):
    """Choice of the column to inspect and of the conversion applied to it first."""

    # One of the column labels of the loaded data frame.
    selected_col = param.ObjectSelector(objects=df.columns.values)
    # Conversion applied before analysis; no conversion unless changed in the widget.
    convert_dtype = param.ObjectSelector(
        default="Do not convert",
        objects=available_dtypes,
    )


# Render the selector widgets; next_n=3 asks paramnb to re-run the 3 cells that follow.
paramnb.Widgets(ColumnSelector, next_n=3)

<a name="coldescription"></a>

In [None]:
# Build the series to analyse from the selected column.
# A single if/else guarantees column_series is (re)assigned on every run; the
# previous nested check left it undefined or stale for any unlisted option.
if ColumnSelector.convert_dtype == "Numeric (remove commas)":
    # Hand the converter a copy so the raw column stored in df is never
    # modified, whatever the converter does internally.
    column_series = DataConverters.remove_commas(df[ColumnSelector.selected_col].copy())
    # Values that still cannot be parsed as numbers become NaN.
    column_series = pd.to_numeric(column_series, errors="coerce")
else:
    column_series = df[ColumnSelector.selected_col]

print(column_series.describe())

# A box plot is only drawn for numeric data; missing values are left out.
if np.issubdtype(column_series.dtype, np.number):
    plt.boxplot(column_series.dropna())

<a name="colvalues"></a>

In [None]:
# The missing-value count is reported for every column type; what follows
# depends on whether the column is numeric.
is_numeric_column = np.issubdtype(column_series.dtype, np.number)
na_total = np.sum(pd.isnull(column_series))
print("NA count: {}".format(na_total))

if is_numeric_column:
    # Numeric column: histogram of the non-missing values.
    plt.hist(column_series.dropna())
else:
    # Any other column: frequency of each value, missing values included.
    value_frequencies = column_series.value_counts(dropna=False)
    print(pd.DataFrame(value_frequencies))

# <font color="goldenrod">Look at data filtered by the selected column</font>

In [None]:
# Largest number of distinct values offered in the drop-down; longer lists
# may take too long to load.
MAX_FILTER_OPTIONS = 2000

if np.issubdtype(column_series.dtype, np.number):
    value_low, value_high = column_series.min(), column_series.max()

    class FilterSelector(param.Parameterized):
        """Range filter (lower bound, upper bound) for a numeric column."""

        # Default to the full range so filter_by_value is usable even before
        # the slider has been touched.
        filter_by_value = param.Range(
            default=(value_low, value_high),
            bounds=(value_low, value_high),
        )

    paramnb.Widgets(FilterSelector, next_n=1)
else:
    # Offer each distinct value once instead of one entry per row. Missing
    # values are left out because an equality filter can never match them.
    distinct_values = list(column_series.dropna().unique())

    class FilterSelector(param.Parameterized):
        """Single-value filter for a non-numeric column."""

        filter_by_value = param.ObjectSelector(objects=distinct_values)

    if len(distinct_values) < MAX_FILTER_OPTIONS:
        paramnb.Widgets(FilterSelector, next_n=1)
    else:
        # Say why no widget appears rather than showing nothing at all.
        print(
            "Filter widget not shown: {} distinct values (limit is {}).".format(
                len(distinct_values), MAX_FILTER_OPTIONS
            )
        )

In [None]:
# Show up to 10 random rows of df that match the current filter.
filter_value = FilterSelector.filter_by_value

if np.issubdtype(column_series.dtype, np.number):
    # Fall back to the full range when no range has been chosen yet.
    if filter_value is None:
        filter_value = (column_series.min(), column_series.max())
    lower_bound, upper_bound = filter_value
    # Both bounds are combined element-wise with &. The previous
    # `list(a) and list(b)` evaluated to the second list only, so the lower
    # bound was ignored. column_series (the converted values) is compared
    # rather than the raw column, which may still hold comma-formatted text.
    row_mask = (column_series >= lower_bound) & (column_series <= upper_bound)
else:
    row_mask = df[ColumnSelector.selected_col] == filter_value

filtered_rows = df[row_mask]
# Cap the sample size: sample(10) raises when fewer than 10 rows match.
display(filtered_rows.sample(min(10, len(filtered_rows))).transpose())

# <font color="goldenrod">Distribution of a column after filtering</font>

# Do not change the code above; insert your custom code below