In [None]:
# Native
import os
import re
import shelve
import glob
import libmod

# Data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Miscellaneous
import param
import paramnb
import xlsxwriter

# Contents
### Data Frame
<div style="margin-left:50px">
<a href="#filesel">File selector</a><br>
<a href="#shape">Shape</a><br>
<a href="#cols">Column names and data types</a><br>
<a href="#preview">Preview</a><br>
</div>

### Column
<div style="margin-left:50px">
<a href="#colsel">Column selector</a><br>
<a href="#coldescription">Descriptive statistics</a><br>
<a href="#colvalues">Value counts</a><br>
<a href="#quantile">Quantiles and rare levels</a><br>
<a href="#outlier">Outliers and rare level examples</a><br>
</div>

### Filter
<div style="margin-left:50px">
<a href="#filtersel">Filter selector</a><br>
<a href="#filteredprev">Filtered preview</a><br>
<a href="#filteredcolsel">Filtered column selector</a><br>
<a href="#filtereddesc">Filtered descriptive statistics</a><br>
<a href="#filteredvalues">Filtered value counts</a><br>
</div>


### Plots
<div style="margin-left:50px">
<a href="#intersel">Interaction selector</a><br>
<a href="#scatter">Scatter plot</a><br>
</div>

In [None]:
"""Load code dependencies

If your dataset won't load, try adding its encoding to "available_encodings".

If you need to convert a column and no existing method fits,
add a new converter as a method of the class "DataConverters",
then add its name and function to the dictionary "custom_converters".
"""

# Encodings offered by the file selector; append your own entries here
available_encodings = [
    "utf-8",
    "latin1",
]

# Add new data converter functions here
class DataConverters(libmod.DataConverters):
    """Project converters extended with notebook-specific ones.

    Converters are looked up on the class itself (see "custom_converters"),
    so each one is a static method that receives the selected data frame
    column and returns a new series.
    """

    @staticmethod
    def custom(orig_series):
        """Sample for extending the DataConverters class.

        Returns an independent copy of the input series with its values
        unchanged; replace the body with your own conversion.

        Args:
            orig_series: pandas Series holding the column to convert.

        Returns:
            A new Series with the same index and values as "orig_series".
        """
        # Never modify the original data frame; always work on a copy
        series = orig_series.copy()
        return series
    
# Register new column converters here: display name -> converter function
custom_converters = {
    "Custom": DataConverters.custom,
}

# Start from the converters shipped with libmod, then layer the custom ones
# on top so a custom entry replaces a built-in one with the same name
converters = dict(libmod.Explore.converters)
converters.update(custom_converters)

# <font color="goldenrod">Select a list of input files here to get started </font>

<a name="filesel"></a>

In [None]:
# Run Tk to get the path of the input file(s)
inputlib = libmod.Input()
inputlib.path_gui()

# If Tk won't run, set the paths by hand with one of the fallbacks below.

# Fallback 1: obtain the paths with a wildcard
# inputlib.path2files=glob.glob("???")

# Fallback 2: list the paths explicitly
# inputlib.path2files=[
#
#]


class FileSelector(param.Parameterized):
    """Widget state: the input file to load and the encoding to read it with."""

    selected_file = param.ObjectSelector(objects=inputlib.path2files)
    file_encoding = param.ObjectSelector(
        default="utf-8",
        objects=available_encodings,
    )


paramnb.Widgets(FileSelector, next_n=3)

<a name="shape"></a>

In [None]:
# Load the selected file into a data frame using the selected encoding
df = inputlib.universal_load(
    FileSelector.selected_file,
    enc=FileSelector.file_encoding,
)

# df.shape is (rows, columns), so it unpacks straight into the message
print("Rows: {}\tCols: {}".format(*df.shape))

# One row per column of the dataset: its name and its data type
df_dtype = pd.DataFrame({
    "Column Names": df.columns.values,
    "Data Types": df.dtypes.values,
})
display(df_dtype)

<a name="cols"></a>

<a name="preview"></a>

In [None]:
# Show a 10-row sample, transposed so each dataset column reads as a row
libmod.smart_sample(df, 10).T

# <font color="goldenrod">Select a column to take a closer look</font>

<a name="colsel"></a>

In [None]:
class ColumnSelector(param.Parameterized):
    """Widget state: the column to inspect and the converter to apply to it."""

    selected_col = param.ObjectSelector(objects=df.columns.values)
    convert_method = param.ObjectSelector(
        default="Do not convert",
        objects=converters.keys(),
    )


paramnb.Widgets(ColumnSelector, next_n=4)

<a name="coldescription"></a>

In [None]:
# Take the selected column as is, or run it through the chosen converter and
# coerce the result to numbers (values that cannot be parsed become NaN)
if ColumnSelector.convert_method == "Do not convert":
    column_series = df[ColumnSelector.selected_col]
else:
    column_series = pd.to_numeric(
        converters[ColumnSelector.convert_method](df[ColumnSelector.selected_col]),
        errors="coerce",
    )

# Missing values, as a count and as a share of all rows
col_nacount = column_series.isnull().sum()
print("Null count: {}\tNull percent: {:.2%}".format(col_nacount, col_nacount / df.shape[0]))

# Descriptive statistics laid out as a single row
col_desc_df = column_series.describe().to_frame().T
display(col_desc_df)

if libmod.is_factor(column_series):
    # Factor column: show how often each level occurs (NaN counted as a level)
    valcount_df = column_series.value_counts(dropna=False).to_frame()
    display(valcount_df)
else:
    # Keep standalone copies of each figure for the Excel report; closing
    # them stops the notebook from rendering them a second time
    plt.boxplot(column_series.dropna())
    colboxfig = plt.gcf()
    plt.close()
    plt.hist(column_series.dropna(), bins=30)
    colhistfig = plt.gcf()
    plt.close()

    # Side-by-side box plot and histogram shown in the notebook
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    plt.boxplot(column_series.dropna())
    plt.subplot(1, 2, 2)
    plt.hist(column_series.dropna(), bins=30)

<a name="colvalues"></a>

<a name="quantile"></a>

In [None]:
# Quantiles (non-factor columns) or rare levels (factor columns)
if not libmod.is_factor(column_series):
    # Tail quantiles, paired with the labels used as table headers
    quantile_levels = [
        ("0.5%", .005),
        ("1%", .01),
        ("2.5%", .025),
        ("5%", .05),
        ("95%", .95),
        ("97.5%", .975),
        ("99%", .99),
        ("99.5%", .995),
    ]
    quantile_df = pd.Series(
        [column_series.quantile(level) for _, level in quantile_levels],
        index=[label for label, _ in quantile_levels],
    ).to_frame().transpose()
    print("Quantiles")
    display(quantile_df)
else:
    print("Rare levels")
    # Sort on the count column by position rather than by name: pandas >= 2.0
    # names it "count" instead of the series name, and the series name would
    # then match the index level and sort by level value, not by frequency.
    rare_lvl_df = valcount_df.sort_values(by=valcount_df.columns[0], ascending=True)
    display(rare_lvl_df.head(10))
    # Least frequent level; used to pull example rows in the next cell
    rare_lvl = rare_lvl_df.index[0]

<a name="outlier"></a>

In [None]:
# Outliers (non-factor columns) or examples of the rarest level (factor columns)
if not libmod.is_factor(column_series):
    # Fences placed 1.5 x IQR beyond the first and third quartiles
    q3 = column_series.quantile(.75)
    q1 = column_series.quantile(.25)
    IQR = q3 - q1
    upper_fence = q3 + 1.5 * IQR
    lower_fence = q1 - 1.5 * IQR
    # Full rows of the outliers, most extreme first
    outlier_upper_df = df.loc[column_series[column_series > upper_fence].sort_values(ascending=False).index]
    outlier_lower_df = df.loc[column_series[column_series < lower_fence].sort_values(ascending=True).index]
    print("Outliers above upper fence")
    display(outlier_upper_df.head(10).transpose())
    print("Outliers below lower fence")
    display(outlier_lower_df.head(10).transpose())
else:
    print("Examples of rare levels")
    # Match against column_series (the values the level was counted on), not
    # the raw data frame column, so a converted column still finds its rows.
    # A missing-value level needs isnull(), because NaN never compares equal.
    if pd.isnull(rare_lvl):
        rare_mask = column_series.isnull()
    else:
        rare_mask = column_series == rare_lvl
    rare_exp_df = df[rare_mask]
    display(rare_exp_df.head(10))

# <font color="goldenrod">Look at data filtered by the selected column</font>

<a name="filtersel"></a>

In [None]:
# Rows where the selected column is missing, judged on the same (possibly
# converted) values that the null count is computed from
nadf = df[column_series.isnull()]

column_is_factor = libmod.is_factor(column_series)

if not column_is_factor:
    class FilterSelector(param.Parameterized):
        """Filter a non-factor column by value range, or keep only its missing rows."""
        filter_by_value = param.Range(bounds=(column_series.min(), column_series.max()))
        filter_by_na = param.Boolean()
else:
    class FilterSelector(param.Parameterized):
        """Filter a factor column by a single level, or keep only its missing rows."""
        # Offer each level once. NaN is left out because an equality filter
        # can never match it; use filter_by_na for missing values instead.
        filter_by_value = param.ObjectSelector(objects=list(column_series.dropna().unique()))
        filter_by_na = param.Boolean()

# A selector listing thousands of levels may take too long to load
if column_is_factor and len(column_series.unique()) >= 2000:
    print("Filter selector not shown: the column has 2000 or more distinct levels")
else:
    paramnb.Widgets(FilterSelector, next_n=2)

<a name="filteredprev"></a>

In [None]:
# Build the filtered data frame from the filter selector's current state
if FilterSelector.filter_by_na:
    # Keep only the rows with a missing value in the selected column
    subdf = nadf
elif not libmod.is_factor(column_series):
    value_range = FilterSelector.filter_by_value
    if value_range is None:
        # param.Range holds None until a range is set; treat that as
        # "no restriction" and use the full span of the column
        value_range = (column_series.min(), column_series.max())
    lower_bound, upper_bound = value_range
    subdf = df[(column_series >= lower_bound) & (column_series <= upper_bound)]
else:
    # Factor column: keep the rows equal to the chosen level
    subdf = df[column_series == FilterSelector.filter_by_value]

print("Filtered rows: {}\tFiltered percent: {:.2%}".format(subdf.shape[0], subdf.shape[0] / df.shape[0]))
display(libmod.smart_sample(subdf, 10).transpose())

# <font color="goldenrod">Distribution of a column after filtering</font>

<a name="filteredcolsel"></a>

In [None]:
class FilteredColumnSelector(param.Parameterized):
    """Widget state: the column of the filtered data to inspect, and its converter."""

    selected_col = param.ObjectSelector(objects=df.columns.values)
    convert_method = param.ObjectSelector(
        default="Do not convert",
        objects=converters.keys(),
    )


paramnb.Widgets(FilteredColumnSelector, next_n=2)

<a name="filtereddesc"></a>

In [None]:
# Take the chosen column of the filtered rows as is, or run it through the
# chosen converter and coerce the result to numbers
if FilteredColumnSelector.convert_method == "Do not convert":
    filtered_column_series = subdf[FilteredColumnSelector.selected_col]
else:
    filtered_column_series = pd.to_numeric(
        converters[FilteredColumnSelector.convert_method](subdf[FilteredColumnSelector.selected_col]),
        errors="coerce",
    )

print(filtered_column_series.describe())

# A box plot is drawn only for non-factor columns
if not libmod.is_factor(filtered_column_series):
    plt.boxplot(filtered_column_series.dropna())

<a name="filteredvalues"></a>

In [None]:
filtered_is_factor = libmod.is_factor(filtered_column_series)

# The missing-value count is reported for every kind of column
print("NA count: {}".format(np.sum(pd.isnull(filtered_column_series))))

if filtered_is_factor:
    # Factor column: frequency of each level, NaN included
    print(pd.DataFrame(filtered_column_series.value_counts(dropna=False)))
else:
    # Non-factor column: histogram of the non-missing values
    plt.hist(filtered_column_series.dropna(), bins=30)

# <font color="goldenrod">Interaction between two variables</font>

<a name="intersel"></a>

In [None]:
class InteractionSelector(param.Parameterized):
    """Widget state: the two columns to plot against each other, and axis order."""

    first_variable = param.ObjectSelector(objects=df.columns.values)
    second_variable = param.ObjectSelector(objects=df.columns.values)
    invert_axis = param.Boolean()


paramnb.Widgets(InteractionSelector, next_n=1)

<a name="scatter"></a>

In [None]:
# Scatter plot of the two selected variables
plt.figure(figsize=(10, 10))

# The first variable goes on the x axis unless the axes are inverted
x_var = InteractionSelector.first_variable
y_var = InteractionSelector.second_variable
if InteractionSelector.invert_axis:
    x_var, y_var = y_var, x_var

# Drop only the rows missing one of the two plotted values. Dropping on every
# column would discard valid points whenever an unrelated column has gaps.
# The set removes the duplicate when the same column is selected twice.
plot_df = df.dropna(subset=list({x_var, y_var}))

plt.scatter(plot_df[x_var], plot_df[y_var], marker="x", s=2)
plt.xlabel(x_var)
plt.ylabel(y_var);

# <font color="goldenrod">Generate Excel report</font>

In [None]:
# Column report: one worksheet describing the selected column

rowptr = 1  # 1-based worksheet row where the next item is written

# Excel rejects sheet names containing any of [ ] : * ? / \ and "/" or "\"
# would also change the output path, so one sanitized label is used for both.
# str() keeps non-string column names (e.g. integers) working.
col_label = re.sub(r"[\[\]:*?/\\]", "_", str(ColumnSelector.selected_col))

# Write output to a xlsx file named "report - [column name]" in reports folder
os.makedirs("reports", exist_ok=True)
writer = pd.ExcelWriter('reports/report - {}.xlsx'.format(col_label), engine='xlsxwriter')

# Keep at most 30 characters, within Excel's limit on sheet name length
sheetname = col_label[:30]
workbook = writer.book
worksheet = workbook.add_worksheet(sheetname)
# Register the sheet with the writer so to_excel writes into it
writer.sheets[sheetname] = worksheet

column_is_factor = libmod.is_factor(column_series)

libmod.make_heading("A", rowptr, ColumnSelector.selected_col, workbook, worksheet)
rowptr += 2

# Descriptive data frame (startrow is 0-based, hence rowptr-1)
libmod.make_heading("A", rowptr, "Descriptive statistics", workbook, worksheet)
rowptr += 1
col_desc_df.to_excel(writer, sheet_name=sheetname, startrow=rowptr - 1, index=False)
rowptr += 3

if not column_is_factor:
    # Quantile
    libmod.make_heading("A", rowptr, "Quantiles", workbook, worksheet)
    rowptr += 1
    quantile_df.to_excel(writer, sheet_name=sheetname, startrow=rowptr - 1, index=False)
    rowptr += 3

    # Plots: box plot at column A, histogram at column K
    libmod.make_heading('A', rowptr, "Box plot of {} values".format(ColumnSelector.selected_col), workbook, worksheet)
    libmod.make_heading('K', rowptr, "Histogram of {} values".format(ColumnSelector.selected_col), workbook, worksheet)
    rowptr += 1
    libmod.make_plot("A", rowptr, colboxfig, worksheet)
    libmod.make_plot("K", rowptr, colhistfig, worksheet)
    rowptr += 18
else:
    # Value count
    libmod.make_heading("A", rowptr, "Value counts", workbook, worksheet)
    rowptr += 1
    valcount_df.transpose().to_excel(writer, sheet_name=sheetname, startrow=rowptr - 1)
    rowptr += 3

    # Rare levels
    libmod.make_heading("A", rowptr, "Some rare levels", workbook, worksheet)
    rowptr += 1
    rare_lvl_df.head(10).transpose().to_excel(writer, sheet_name=sheetname, startrow=rowptr - 1, index=False)
    rowptr += 3

# Print NA examples
libmod.make_heading("A", rowptr, "Some examples with NA value", workbook, worksheet)
rowptr += 1
worksheet.write("A{}".format(rowptr), "NA count in this column: {}".format(col_nacount))
rowptr += 1

libmod.smart_sample(nadf, 10).to_excel(writer, sheet_name=sheetname, startrow=rowptr - 1, index=False)
rowptr += libmod.adv_ptr_var(nadf, 10)

if not column_is_factor:
    # Print outlier examples
    libmod.make_heading("A", rowptr, "Some outliers above upper fence", workbook, worksheet)
    rowptr += 1
    outlier_upper_df.head(10).to_excel(writer, sheet_name=sheetname, startrow=rowptr - 1, index=False)
    rowptr += libmod.adv_ptr_var(outlier_upper_df, 10)

    libmod.make_heading("A", rowptr, "Some outliers below lower fence", workbook, worksheet)
    rowptr += 1
    outlier_lower_df.head(10).to_excel(writer, sheet_name=sheetname, startrow=rowptr - 1, index=False)
    rowptr += libmod.adv_ptr_var(outlier_lower_df, 10)
else:
    # Rare level examples
    libmod.make_heading("A", rowptr, "Some examples of rare levels", workbook, worksheet)
    rowptr += 1
    rare_exp_df.head(10).to_excel(writer, sheet_name=sheetname, startrow=rowptr - 1, index=False)
    rowptr += libmod.adv_ptr_var(rare_exp_df, 10)

# Highlight the worksheet column at the selected column's position in df;
# the example tables are written without an index, so positions line up
selected_col_pos = df.columns.get_loc(ColumnSelector.selected_col)
highligh_format = workbook.add_format()
highligh_format.set_bg_color('orange')
worksheet.set_column(selected_col_pos, selected_col_pos, None, highligh_format)

# Close through the writer: it saves the workbook and also releases the file
# handle the writer holds, which workbook.close() alone leaves open
writer.close()

In [None]:
# Dataset report: one worksheet describing the whole dataset
rowptr = 1  # 1-based worksheet row where the next item is written

# Write output to a xlsx file named "report - Overall - Dataset" in reports folder
os.makedirs("reports", exist_ok=True)
writer = pd.ExcelWriter('reports/report - Overall - Dataset.xlsx', engine='xlsxwriter')

workbook = writer.book
worksheet = workbook.add_worksheet("Dataset")
# Register the sheet with the writer so to_excel writes into it
writer.sheets["Dataset"] = worksheet

# Title: name of the loaded file
libmod.make_heading("A", rowptr, os.path.basename(FileSelector.selected_file), workbook, worksheet)
rowptr += 2

libmod.make_heading("A", rowptr, "Shape", workbook, worksheet)
rowptr += 1
worksheet.write("A{}".format(rowptr), "Row: {}".format(df.shape[0]))
worksheet.write("C{}".format(rowptr), "Column: {}".format(df.shape[1]))
rowptr += 2

# Column names and data types (startrow is 0-based, hence rowptr-1)
libmod.make_heading("A", rowptr, "Column descriptions", workbook, worksheet)
rowptr += 1
df_dtype.transpose().to_excel(writer, sheet_name="Dataset", startrow=rowptr - 1)
rowptr += 4

libmod.make_heading("A", rowptr, "Some examples", workbook, worksheet)
rowptr += 1
libmod.smart_sample(df, 50).to_excel(writer, sheet_name="Dataset", startrow=rowptr - 1, index=False)
rowptr += libmod.adv_ptr_var(df, 50)

# Close through the writer: it saves the workbook and also releases the file
# handle the writer holds, which workbook.close() alone leaves open
writer.close()

# <font color="goldenrod">Export variables, plots and data frames</font>

<h2>Global variables</h2>
<table>
    <thead>
        <tr>
            <th>Variable Name</th>
            <th>Description</th>
        </tr>
    </thead>
    <tbody>
        <tr><td>inputlib.path2files</td><td>List of dataset file paths to choose from</td></tr>
        <tr><td>df</td><td>Data frame holding the loaded dataset</td></tr>
        <tr><td>df_dtype</td><td>Data frame listing each dataset column and its data type</td></tr>
        <tr><td>colboxfig</td><td>Box plot of selected column (if numerical)</td></tr>
        <tr><td>colhistfig</td><td>Histogram of selected column (if numerical)</td></tr>
        <tr><td>valcount_df</td><td>Column value counts</td></tr>
        <tr><td>col_desc_df</td><td>Column descriptive statistics</td></tr>
        <tr><td>***</td><td>Placeholder</td></tr>
    </tbody>
</table>

In [None]:
# Replace "???" with the path of the shelf file to write.
# Flag "n" always creates a new, empty shelf, so an existing shelf at the
# same path is replaced.
output_filename="???"
shelf=shelve.open(output_filename,"n")

In [None]:
# Replace "???" with the key to store the object under; swap in any other
# variable to export it (repeat this line for each object)
shelf["???"]=filtered_column_series

In [None]:
# Close the shelf once every object has been stored
shelf.close()

# Do not change the code above; insert your custom code below