## How to create a html report automatically by using ydata_profiling library

In [39]:
import pandas as pd

from ydata_profiling import ProfileReport


In [20]:
# Load the data
df = pd.read_csv("data/adult.csv",na_values='?')

df.columns = ["age", "workclass", "fnlwgt", "education", "educational-num", "marital-status", "occupation", "relationship", "race", "gender", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

# Dataset Overview
df.head() # preview a sample

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [21]:
# number of observations and features
df.shape  

(48841, 15)

In [22]:
# data types
df.dtypes

age                 int64
workclass          object
fnlwgt              int64
education          object
educational-num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income             object
dtype: object

In [23]:
# check duplicated rows
df[df.duplicated()]

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
4880,25,Private,308144,Bachelors,13,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,Mexico,<=50K
5103,90,Private,52386,Some-college,10,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0,0,35,United-States,<=50K
9170,21,Private,250051,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,10,United-States,<=50K
11630,20,Private,107658,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,10,United-States,<=50K
13083,25,Private,195994,1st-4th,2,Never-married,Priv-house-serv,Not-in-family,White,Female,0,0,40,Guatemala,<=50K
15058,21,Private,243368,Preschool,1,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,50,Mexico,<=50K
17039,46,Private,173243,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
18554,30,Private,144593,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,,<=50K
18697,19,Private,97261,HS-grad,9,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,40,United-States,<=50K
21317,19,Private,138153,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,10,United-States,<=50K


In [24]:
# missing values per feature
df.isna().sum()

age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [25]:
# number of missing cells
df.isna().sum().sum()

6465

In [27]:
# percentage of missing cells
round(df.isna().sum().sum() / df.size * 100, 1)

0.9

In [29]:
df.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48841.0,48841.0,48841.0,48841.0,48841.0,48841.0
mean,38.643578,189666.4,10.078029,1079.045208,87.504105,40.422391
std,13.71065,105603.9,2.570965,7452.0937,403.008483,12.391571
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117555.0,9.0,0.0,0.0,40.0
50%,37.0,178147.0,10.0,0.0,0.0,40.0
75%,48.0,237646.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [38]:
# Printing the existing categories and respective frequencies for each categorical feature in data
cat_cols = ['workclass', 'education', 'educational-num',
            'marital-status', 'occupation', 'relationship', 'race',
            'gender', 'native-country', 'income']

for col in cat_cols:
    categories = df.groupby(col).size()
    print(categories, "\n")

workclass
Federal-gov          1432
Local-gov            3136
Never-worked           10
Private             33906
Self-emp-inc         1695
Self-emp-not-inc     3862
State-gov            1980
Without-pay            21
dtype: int64 

education
10th             1389
11th             1812
12th              657
1st-4th           247
5th-6th           509
7th-8th           955
9th               756
Assoc-acdm       1601
Assoc-voc        2061
Bachelors        8024
Doctorate         594
HS-grad         15784
Masters          2657
Preschool          83
Prof-school       834
Some-college    10878
dtype: int64 

educational-num
1        83
2       247
3       509
4       955
5       756
6      1389
7      1812
8       657
9     15784
10    10878
11     2061
12     1601
13     8024
14     2657
15      834
16      594
dtype: int64 

marital-status
Divorced                  6633
Married-AF-spouse           37
Married-civ-spouse       22379
Married-spouse-absent      628
Never-married            161

In [40]:

# Generate the report
profile = ProfileReport(df,title="Adult Census Profile")

# Save the report to .html
profile.to_file("adult_report.html")



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(
  is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(
  not pdt.is_categorical_dtype(series)
  not pdt.is_categorical_dtype(series)
  if pdt.is_categorical_dtype(series):
  if pdt.is_categorical_dtype(series):
  if pdt.is_categorical_dtype(series):
  is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(
  not pdt.is_categorical_dtype(series)
  not pdt.is_categorical_dtype(series)
  if pdt.is_categorical_dtype(series):
  not pdt.is_categorical_dtype(series)
  if pdt.is_categorical_dtype(series):
  not pdt.is_categorical_dtype(series)
  if pdt.is_categorical_dtype(series):
  is_valid_dtype = pdt.is_categorical_dtype(series) and not pdt.is_bool_dtype(
  not pdt.is_categorical_dtype(series)
  if pdt.is_categorical_dtype(series):
  if pdt.is_categorical_dtype(series):
  if pdt.is_categorical_dtype(series):
  is_valid_dtype = pdt.is_categorical_dtype(series) and not 

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]