In [1]:
import pandas as pd
import dataprofile
from dataprofile import ProfileReport

In [2]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. dataprofile) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# List the names exposed at the top level of the dataprofile package.
dir(dataprofile)

['ProfileReport',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_config',
 '_monitor',
 '_profiling',
 '_var_statistics',
 'get_df_profile',
 'get_var_summary',
 'render_report',
 'reporting']

In [4]:
# Load the Titanic training set (path is relative to this notebook's directory)
# and display its (rows, columns) shape as a quick sanity check.
df = pd.read_csv('../data/titanic/train.csv')
df.shape

(891, 12)

In [5]:
# Create a ProfileReport with no constructor arguments, i.e. with its default settings.
report = ProfileReport()

In [6]:
# Show the parameter values the report is currently configured with.
report.get_params()

{'num_works': -1, 'random_state': 0, 'sample_size': -1, 'var_per_row': 6}

In [7]:
# Profile the DataFrame; the progress log and a ProfileReport repr are shown as output.
report.fit(df)

2020-05-04 at 19:42:44|INFO|Profiling variables: 100%|████████████████████████████████████████12/12


ProfileReport(num_works=-1, random_state=0, sample_size=-1, var_per_row=6)

In [8]:
# Print the profiling report (table statistics and per-variable tables) inline.
report.show_report()


This following report is created by Gordon Chen on Monday, May 04, 2020

+------------------+---------+
|                  |   count |
|------------------+---------|
| n_row            |     891 |
| n_col            |      12 |
| n_missing_cell   |     866 |
| n_empty_row      |       0 |
| n_duplicated_row |       0 |
| n_Useless_var    |       2 |
| n_Binary_var     |       2 |
| n_Nominal_var    |       3 |
| n_Interval_var   |       5 |
+------------------+---------+


+-------------+----------+-------------+---------+-------------+-------------+------------+------------+
|             | type     | data_type   |   count |   n_missing | p_missing   |   n_unique | p_unique   |
|-------------+----------+-------------+---------+-------------+-------------+------------+------------|
| PassengerId | Useless  | Unique      |     891 |           0 | 0.00%       |        891 | 100.00%    |
| Name        | Useless  | Unique      |     891 |           0 | 0.00%       |        891 | 100.00%  

In [9]:
# Save the report to test.html (relative path, so it lands in the working directory).
report.save_report('test.html')

Report saved to test.html


In [10]:
# Grab the raw profiling results from the fitted report and list the sections they contain.
profile_dict = report.df_profile
profile_dict.keys()

dict_keys(['table_stats', 'var_summary', 'var_stats', 'conf_matrix'])

In [11]:
# Dataset-level counts: rows, columns, missing cells and number of variables per type.
profile_dict['table_stats']

Unnamed: 0,count
n_row,891
n_col,12
n_missing_cell,866
n_empty_row,0
n_duplicated_row,0
n_Useless_var,2
n_Binary_var,2
n_Nominal_var,3
n_Interval_var,5


In [12]:
# Per-variable summary: assigned type, data type, counts, missing and unique ratios.
profile_dict['var_summary']

Unnamed: 0,type,data_type,count,n_missing,p_missing,n_unique,p_unique
PassengerId,Useless,Unique,891,0,0.00%,891,100.00%
Name,Useless,Unique,891,0,0.00%,891,100.00%
Survived,Binary,Numerical,891,0,0.00%,2,0.22%
Sex,Binary,Categorical,891,0,0.00%,2,0.22%
Ticket,Nominal,Categorical,891,0,0.00%,681,76.43%
Cabin,Nominal,Categorical,891,687,77.10%,147,72.06%
Embarked,Nominal,Categorical,891,2,0.22%,3,0.34%
Pclass,Interval,Numerical,891,0,0.00%,3,0.34%
Parch,Interval,Numerical,891,0,0.00%,7,0.79%
SibSp,Interval,Numerical,891,0,0.00%,7,0.79%


In [13]:
# For every variable type, print the type name followed by the index of its
# statistics table (the variables that were assigned to that type).
for var_type, type_stats in profile_dict['var_stats'].items():
    print(var_type, type_stats.index, sep='\n')

Useless
Index(['PassengerId', 'Name'], dtype='object')
Binary
Index(['Survived', 'Sex'], dtype='object')
Nominal
Index(['Ticket', 'Cabin', 'Embarked'], dtype='object')
Interval
Index(['Pclass', 'Parch', 'SibSp', 'Fare', 'Age'], dtype='object')


# Convert the report to PDF

In [15]:
# Re-check which sections the profile contains before converting them.
report.df_profile.keys()

dict_keys(['table_stats', 'var_summary', 'var_stats', 'conf_matrix'])

In [16]:
from dataprofile.reporting import profile_to_str

In [36]:
# Convert the profile into its string form with tables formatted as HTML.
# The result is deliberately left un-joined: later cells index into it
# (str_[0], str_[3]) and iterate over it item by item.
str_ = profile_to_str(
    report.df_profile,
    table_fmt='html',
    line_breaker='<br>',
)

In [42]:
# Inspect the fourth item: the HTML table holding the table-level statistics.
str_[3]

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>count</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>n_row</th>\n      <td>891</td>\n    </tr>\n    <tr>\n      <th>n_col</th>\n      <td>12</td>\n    </tr>\n    <tr>\n      <th>n_missing_cell</th>\n      <td>866</td>\n    </tr>\n    <tr>\n      <th>n_empty_row</th>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>n_duplicated_row</th>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>n_Useless_var</th>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>n_Binary_var</th>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>n_Nominal_var</th>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>n_Interval_var</th>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>'

In [45]:
# Literal copy of the tail of the HTML shown for str_[3]. It starts at the first
# <th>, so the opening <table>, <thead> and <tr> tags are missing and the fragment
# is not well-formed on its own. No other cell shown here reads `tmp`.
tmp = '<th></th>\n      <th>count</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>n_row</th>\n      <td>891</td>\n    </tr>\n    <tr>\n      <th>n_col</th>\n      <td>12</td>\n    </tr>\n    <tr>\n      <th>n_missing_cell</th>\n      <td>866</td>\n    </tr>\n    <tr>\n      <th>n_empty_row</th>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>n_duplicated_row</th>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>n_Useless_var</th>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>n_Binary_var</th>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>n_Nominal_var</th>\n      <td>3</td>\n    </tr>\n    <tr>\n      <th>n_Interval_var</th>\n      <td>5</td>\n    </tr>\n  </tbody>\n</table>'

In [23]:
from fpdf import FPDF, HTMLMixin

In [43]:
class HTML2PDF(FPDF, HTMLMixin):
    """PDF document type that combines FPDF with HTMLMixin.

    Adds no behaviour of its own; it exists so that an instance offers both
    the FPDF page/font API and the write_html method used in the cells below.
    """


# Create the PDF document and select 12 pt Arial as the current font.
pdf = HTML2PDF()
pdf.set_font("Arial", size=12)


In [44]:
# Start a new page and try to render the table-statistics HTML onto it.
# NOTE(review): this cell raises the RuntimeError shown in its output. The HTML in
# str_[3] has no width attribute on the table or on any <th>/<td>; presumably
# write_html needs explicit column/cell widths (e.g. <th width="50%">) — TODO confirm
# against the fpdf documentation before relying on this cell.
pdf.add_page()
pdf.write_html(str_[3])

RuntimeError: Table column/cell width not specified, unable to continue

In [40]:
# Try to render every item of str_ on its own page.
# NOTE(review): this loop stops with the RuntimeError shown in its output, the same
# message as for str_[3] alone; presumably the HTML tables lack the width attributes
# write_html needs — TODO confirm. Pages added before the failure stay in `pdf`.
for item in str_:
    pdf.add_page()
    pdf.write_html(item)

RuntimeError: Table column/cell width not specified, unable to continue

In [27]:
# Write the PDF to simple_table_html.pdf (relative path; the call displays '' as its result).
# NOTE(review): the write_html cells above failed, so the file presumably lacks the tables — verify.
pdf.output('simple_table_html.pdf')

''

In [28]:
# Inspect the first item returned by profile_to_str.
str_[0]

