In [None]:
# mount drive
# Mounts Google Drive at /content/drive through the google.colab helper,
# so this cell can only run where the google.colab package is available.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# move directory
import os

# Absolute path below the '/content/drive' mount point used by drive.mount().
# The previous relative "./drive/MyDrive/" only resolved on the first run:
# after the working directory has moved, re-running this cell would look for
# a nested ./drive/MyDrive/ and raise FileNotFoundError.
colab_dir = "/content/drive/MyDrive/"
os.chdir(colab_dir)

In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib_inline
%matplotlib inline

In [None]:
# pip install
##  pandas profiling
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip    
from pandas_profiling import ProfileReport

In [None]:
# set random seed
import random

# Seed every global RNG the notebook relies on. Seeding only the stdlib
# `random` module leaves NumPy's global generator unseeded, so any
# NumPy-backed sampling would still change from run to run.
SEED = 335
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# display limits: show up to 500 rows and 500 columns before pandas truncates
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [None]:
# quieter output: pretty-printer available, every warning category silenced
import pprint
import warnings

# Installs a blanket "ignore" filter for all Warning subclasses.
warnings.simplefilter('ignore')

### reference
-------------------

- [pandas cheat sheet](https://github.com/pandas-dev/pandas/tree/master/doc/cheatsheet)
- [numpy cheat sheet(data camp)](https://www.datacamp.com/community/blog/python-numpy-cheat-sheet)
- [scikit-learn cheat sheet(data camp)](https://www.datacamp.com/community/blog/scikit-learn-cheat-sheet)

# data understanding
------------
The data understanding phase starts with an initial data collection and proceeds with activities in order to get familiar with the data, to identify data quality problems, to discover first insights into the data or to detect interesting subsets to form hypotheses for hidden information.

## collect initial data
----------

### task

Acquire within the project the data (or access to the data) listed in the
project resources. This initial collection includes data loading if necessary
for data understanding. For example, if you apply a specific tool for data
understanding, it makes perfect sense to load your data into this tool.
This effort possibly leads to initial data preparation steps.

Note: if you acquire multiple data sources, integration is an additional
issue, either here or in the later data preparation phase.

### output

List the dataset (or datasets) acquired, together with their locations
within the project, the methods used to acquire them and any problems
encountered. Record problems encountered and any solutions achieved
to aid with future replication of this project or with the execution of
similar future projects.

In [None]:
# load the CSV source into `df`
# NOTE(review): `path` and `file_name` are not assigned in the visible cells;
# they have to be defined before this cell runs.
df = pd.read_csv(f"{path}{file_name}")

In [None]:
# load one sheet of an Excel workbook into `df` (rebinds `df` if the CSV cell ran first)
# NOTE(review): `path`, `file_name` and `sheet_name` are not assigned in the
# visible cells; they have to be defined before this cell runs.
df = pd.read_excel(f"{path}{file_name}", sheet_name=sheet_name)

## describe data
----------

### task

Examine the “gross” or “surface” properties of the acquired data and
report on the results.

### output

Describe the data which has been acquired, including: the format of
the data, the quantity of data, for example number of records and fields
in each table, the identities of the fields and any other surface features
of the data which have been discovered. Does the data acquired satisfy
the relevant requirements?

In [None]:
# number of records and fields
# Displays the frame's dimensions as a (rows, columns) tuple.
df.shape # (records, fields)

In [None]:
# first five rows
df.head(5)

In [None]:
# last five rows
df.tail(5)

In [None]:
# data types
# Displays the dtype pandas holds for each column of df.
df.dtypes

## explore data
----------

### task

This task tackles the data mining questions, which can be addressed
using querying, visualization and reporting. These include: distribution
of key attributes, for example the target attribute of a prediction task;
relations between pairs or small numbers of attributes; results of
simple aggregations; properties of significant sub-populations; simple
statistical analyses. These analyses may address directly the data mining goals; they may also contribute to or refine the data description
and quality reports and feed into the transformation and other data
preparation needed for further analysis.

### output

Describe results of this task including first findings or initial hypothesis and their impact on the remainder of the project. If appropriate,
include graphs and plots, which indicate data characteristics or lead
to interesting data subsets for further examination.

In [None]:
# cast columns to their analysis dtypes (modifies `df` in place)
# shipping_date is parsed from 'YYYY/MM/DD' strings; price and quantity become int64.
df['shipping_date'] = pd.to_datetime(df['shipping_date'], format='%Y/%m/%d')
df['price'] = df['price'].astype('int64')
df['quantity'] = df['quantity'].astype('int64')

In [None]:
# describe
# Displays the summary table produced by DataFrame.describe() for df.
df.describe()

In [None]:
# histogram of price (density-scaled bars with a KDE overlay)
# sns.distplot is deprecated and scheduled for removal; this is the histplot
# equivalent given in seaborn's migration guide.
sns.histplot(df.price, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# histogram of quantity (density-scaled bars with a KDE overlay)
# sns.distplot is deprecated and scheduled for removal; this is the histplot
# equivalent given in seaborn's migration guide.
sns.histplot(df.quantity, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# total price and quantity per shipping date, plotted as a price time series
# Aggregations are named by string: passing np.sum to .agg is deprecated in
# pandas 2.1+ (a future version will call the NumPy function directly instead
# of the optimized groupby sum).
data = df.groupby('shipping_date', as_index=False).agg({'price': 'sum', 'quantity': 'sum'})
sns.lineplot(data=data, x="shipping_date", y="price");

In [None]:
# pandas profiling
# Builds a ProfileReport for df with minimal=True, writes it to
# "data_understanding_report.html" under `path`, and ends on `profile`
# so the notebook displays the report.
# NOTE(review): `path` is not assigned in the visible cells - confirm it is
# defined before this cell runs.
profile = ProfileReport(df, minimal=True)
profile.to_file(path + "data_understanding_report.html")
profile

## verify data quality
----------

### task

Examine the quality of the data, addressing questions such as: is the
data complete (does it cover all the cases required)? Is it correct or
does it contain errors and if there are errors how common are they?
Are there missing values in the data? If so how are they represented,
where do they occur and how common are they?

### output

List the results of the data quality verification; if quality problems
exist, list possible solutions. Solutions to data quality problems
generally depend heavily on both data and business knowledge.

In [None]:
# rows that occur more than once (keep=False marks every copy, not just the repeats)
df.loc[df.duplicated(keep=False)]

In [None]:
# check outlier
# Rows where any numeric column lies more than `z_thr` standard deviations
# from that column's mean.
from scipy import stats
z_thr = 3.0
# 'number' covers float as well as integer columns; include=int silently
# skipped every float column.
numeric_cols = df.select_dtypes(include='number')
# nan_policy='omit' keeps one missing value from turning a whole column's
# z-scores into NaN (which would hide every outlier in that column).
z_scores = np.abs(stats.zscore(numeric_cols.astype(float).to_numpy(), nan_policy='omit'))
df[(z_scores > z_thr).any(axis=1)]

In [None]:
# count of missing values per column
df.isna().sum()

In [None]:
# check number of unique value
# Displays the result of DataFrame.nunique() for df (distinct-value count per column).
df.nunique()

In [None]:
# check unique value
# Print the sorted distinct values of every column, one block per column.
for feature in df.columns:
  # f-string so that non-string column labels (e.g. integers) do not break the header
  print(f'------{feature}------')
  # sort_values places missing values last; np.sort raised TypeError on
  # object columns that mix strings with NaN
  print(df[feature].drop_duplicates().sort_values().to_numpy())
  print()

## note/questions
-------------

#### collect initial data


#### describe data



#### explore data



#### verify data quality


