In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as st

from IPython.display import display
%matplotlib inline

In [None]:
# Load the wholesale-customers data and keep only the spending columns;
# 'Channel' and 'Region' are dropped before any statistics are computed.
customer_df = (
    pd.read_csv('Wholesale_customers_data.csv')
    .drop(columns=['Channel', 'Region'])
)

In [None]:
# Draw a small sample of 5 customers for a quick look at the data.
# A fixed random_state keeps the sampled rows (and the sample statistics
# computed from them below) identical across Restart & Run All.
customer_samp_df = customer_df.sample(5, random_state=42)

In [None]:
# Summary statistics for the 5-row sample, one row per column of the sample,
# extended with scipy's skewness and kurtosis for each column.
samp_stats = customer_samp_df.describe().T.assign(
    skew=st.skew(customer_samp_df),
    kurt=st.kurtosis(customer_samp_df),
)
samp_stats

In [None]:
# Same summary as for the sample, but over the full data set:
# describe() transposed (one row per column) plus skewness and kurtosis.
stats = customer_df.describe().T.assign(
    skew=st.skew(customer_df),
    kurt=st.kurtosis(customer_df),
)
stats

---

# MANY OF THE TOOLS WE WILL USE WILL ASSUME NORMAL DATA

---

You are already familiar with standardization.

## $$Z = \frac{X-\mu}{\sigma}$$

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every column and wrap the resulting array back into a
# DataFrame carrying the original column names.
scaler = StandardScaler()
customer_sc = scaler.fit_transform(customer_df)
customer_sc_df = pd.DataFrame(data=customer_sc, columns=customer_df.columns)

# Summary statistics of the standardized data, with skewness and kurtosis.
sc_stats = customer_sc_df.describe().T.assign(
    skew=st.skew(customer_sc_df),
    kurt=st.kurtosis(customer_sc_df),
)

# Show the raw and standardized summaries one after the other for comparison.
display(stats, sc_stats)

# Visualizing the Data

In [None]:
# Distribution of each raw column on a 2x3 grid of panels.
# Red vertical line = mean, black vertical line = median.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases --
# confirm the installed version before upgrading.
fig = plt.figure(figsize=(20, 6))
for panel_number, column in enumerate(customer_df.columns, start=1):
    ax = fig.add_subplot(2, 3, panel_number)
    values = customer_df[column]
    sns.distplot(values, label=column, ax=ax)
    ax.axvline(values.mean(), c='red')
    ax.axvline(values.median(), c='black')
    ax.legend()

In [None]:
# Distribution of each standardized column on a 2x3 grid of panels.
# Red vertical line = mean, black vertical line = median.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases --
# confirm the installed version before upgrading.
fig = plt.figure(figsize=(20, 6))
for panel_number, column in enumerate(customer_sc_df.columns, start=1):
    ax = fig.add_subplot(2, 3, panel_number)
    values = customer_sc_df[column]
    sns.distplot(values, label=column, ax=ax)
    ax.axvline(values.mean(), c='red')
    ax.axvline(values.median(), c='black')
    ax.legend()

## MANY OF THE TOOLS WE WILL USE WILL ASSUME NORMAL DATA

---

# Deskew the Data

We will look at two common approaches to deskewing data:

- the log transform
- the Box-Cox power transform

For purposes of comparison, we will keep both transforms.

We have previously looked at scaling data as a preprocessing step. Note that scaling of data will have no effect on its skewness. 

Another way we can verify this is via a test of skewness.

To perform this test we can use `scipy.stats.skewtest`.

This function tests the null hypothesis that the skewness of the population that the sample was drawn from is the same as that of a corresponding normal distribution. Remember, a low p-value means we reject the null hypothesis, i.e., the data are skewed!


In [None]:
import scipy.stats as st

In [None]:
# Run the skewness test on each column before and after standardization
# and print both results side by side.
skew_report = "{}\norig skew test: {} \nscaled skew test: {}\n\n"
for column in customer_sc_df.columns:
    raw_result = st.skewtest(customer_df[column])
    scaled_result = st.skewtest(customer_sc_df[column])
    print(skew_report.format(column, raw_result, scaled_result))

## Deskew by taking the log of the data

Many times the skew of data can be easily removed by taking the log of the data. Let's do so here.

We will then scale the data after deskewing.

In [None]:
# Natural log of every value to reduce skew.
customer_log_df = np.log(customer_df)

# Re-fit the shared scaler on the log data (this replaces its earlier fit),
# then standardize the log data and restore the column names.
customer_log_sc = scaler.fit(customer_log_df).transform(customer_log_df)
customer_log_sc_df = pd.DataFrame(data=customer_log_sc, columns=customer_df.columns)

In [None]:
# Skewness test per column for the raw, scaled, log, and scaled-log data.
# The frames are listed in the order their results appear in the report.
for column in customer_log_df.columns:
    skew_results = [
        st.skewtest(frame[column])
        for frame in (customer_df, customer_sc_df, customer_log_df, customer_log_sc_df)
    ]
    print("""{}
    orig:       {} 
    scaled:     {}
    orig log:   {}
    scaled log: {}
    
    """.format(column, *skew_results))

In [None]:
# Standardized (not log-transformed) distributions again, for visual
# comparison with the log-transformed plots. Red = mean, black = median.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases --
# confirm the installed version before upgrading.
fig = plt.figure(figsize=(20, 6))
for panel_number, column in enumerate(customer_sc_df.columns, start=1):
    ax = fig.add_subplot(2, 3, panel_number)
    values = customer_sc_df[column]
    sns.distplot(values, label=column, ax=ax)
    ax.axvline(values.mean(), c='red')
    ax.axvline(values.median(), c='black')
    ax.legend()

In [None]:
# Distribution of each log-transformed, standardized column on a 2x3 grid.
# Red vertical line = mean, black vertical line = median.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases --
# confirm the installed version before upgrading.
fig = plt.figure(figsize=(20, 6))
for panel_number, column in enumerate(customer_log_sc_df.columns, start=1):
    ax = fig.add_subplot(2, 3, panel_number)
    values = customer_log_sc_df[column]
    sns.distplot(values, label=column, ax=ax)
    ax.axvline(values.mean(), c='red')
    ax.axvline(values.median(), c='black')
    ax.legend()

In [None]:
# Persist the log-transformed, standardized frame to disk.
customer_log_sc_df.to_pickle(path='final_log_sc.p')

---

## Deskew by Box-Cox Transform

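The Box-Cox transform works by identifying the optimal power, $\lambda$, to which the (strictly positive) data are raised, where

$$\mathbf{x_i}' = \begin{cases} \dfrac{\mathbf{x_i}^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0 \\ \ln \mathbf{x_i} & \text{if } \lambda = 0 \end{cases}$$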

![](https://www.evernote.com/l/AAFyfeEXifNCb6PDsM7FKA6-EZIKf9MQBsgB/image.png)

In [None]:
# Box-Cox transform each column independently. scipy.stats.boxcox returns
# the transformed values together with the lambda it fitted for that column.
# NOTE: per the scipy documentation the input must be strictly positive.
box_cox_columns = {}
box_cox_lambdas = {}  # fitted lambda per column, kept so the transform can be inspected or reused
for col in customer_df.columns:
    transformed, fitted_lambda = st.boxcox(customer_df[col])
    box_cox_columns[col] = transformed
    box_cox_lambdas[col] = fitted_lambda

# Build the frame in one step and reuse customer_df's index so rows stay
# aligned with the source data even if its index is not the default 0..n-1.
customer_box_cox_df = pd.DataFrame(box_cox_columns, index=customer_df.index)

In [None]:
# Re-fit the shared scaler on the Box-Cox data (this replaces its earlier
# fit), standardize, and restore the original column names.
customer_box_cox_sc = scaler.fit(customer_box_cox_df).transform(customer_box_cox_df)
customer_box_cox_sc_df = pd.DataFrame(data=customer_box_cox_sc, columns=customer_df.columns)

In [None]:
# Skewness test per column across all six variants of the data.
# The frames are listed in the order their results appear in the report.
for column in customer_log_df.columns:
    skew_results = [
        st.skewtest(frame[column])
        for frame in (
            customer_df,
            customer_sc_df,
            customer_log_df,
            customer_log_sc_df,
            customer_box_cox_df,
            customer_box_cox_sc_df,
        )
    ]
    print("""{}
    orig:           {} 
    scaled:         {}
    orig log:       {}
    scaled log:     {}
    orig box-cox:   {}
    scaled box-cox: {}
    
    """.format(column, *skew_results))

In [None]:
# Distribution of each Box-Cox-transformed, standardized column on a 2x3 grid.
# Red vertical line = mean, black vertical line = median.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases --
# confirm the installed version before upgrading.
fig = plt.figure(figsize=(20, 6))
for panel_number, column in enumerate(customer_box_cox_sc_df.columns, start=1):
    ax = fig.add_subplot(2, 3, panel_number)
    values = customer_box_cox_sc_df[column]
    sns.distplot(values, label=column, ax=ax)
    ax.axvline(values.mean(), c='red')
    ax.axvline(values.median(), c='black')
    ax.legend()

In [None]:
# Persist the Box-Cox-transformed, standardized frame to disk.
customer_box_cox_sc_df.to_pickle(path='final_box_cox_sc.p')