In [1]:
import pandas as pd
import numpy as np

import plotly.express as px

import cufflinks as cf
# Write the cufflinks configuration once, right after the imports: private
# sharing, the 'brbg' colour scale, offline mode with `offline_show_link`
# switched off, and the 'ggplot' theme.
cf.set_config_file(sharing='private',
                   colorscale='brbg',
                   offline=True,
                   offline_show_link=False,
                   theme='ggplot')

In [2]:
# raw_train.info()

# raw_train.describe(include='object').T

# raw_train.describe(exclude='object').T

In [3]:
def show_missings(df):
    """Report the number of missing values per column of ``df``.

    When no column contains a NaN a short message is printed. Otherwise a
    plotly bar chart is shown with one bar per affected column, sorted by
    NaN count in descending order; the y axis spans ``0 .. len(df)`` so the
    bar height can be read against the total number of rows.

    Returns ``None`` in both cases.
    """
    nan_flags = df.isna()
    affected_columns = df.columns[nan_flags.any(axis=0)]
    nan_counts = nan_flags.sum().loc[affected_columns].rename('total')
    missings = nan_counts.sort_values(ascending=False)

    # Guard clause: nothing to plot when every column is complete.
    if missings.empty:
        print('There is no one missing value')
        return

    axis_labels = {'value': "nan's count", 'index': 'Feature'}
    chart = px.bar(
        missings,
        labels=axis_labels,
        color_discrete_sequence=px.colors.qualitative.Prism,
        title='Missing values ratio',
        range_y=(0, len(df)),
    )
    chart.show()

In [4]:
def heatmap(df, target, bound=None, method='pearson'):
    """Show a cufflinks heatmap of the correlations most related to ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to correlate. The correlation matrix is obtained with
        ``df.corr(method=method)``.
    target : label
        Column whose absolute correlation ranks / filters the other columns.
    bound : int or float, optional
        * ``bound >= 1`` - keep the ``bound`` columns with the largest
          absolute correlation to ``target`` (``target`` itself included).
        * ``0 < bound < 1`` - keep every column whose absolute correlation
          to ``target`` is at least ``bound``.
        * ``None`` (default) - the number of columns of ``df`` whose dtype is
          not matched by the listed non-numeric dtype names; ``nlargest``
          caps this at the number of correlated columns.
    method : str or callable
        Correlation method forwarded to ``DataFrame.corr``.

    Raises
    ------
    ValueError
        If ``bound`` is not a positive number.
    """
    if bound is None:
        bound = sum(
            ~df.dtypes.isin(['object', 'category', 'string', 'boolean']))

    # `not bound > 0` (rather than `bound <= 0`) also rejects NaN.
    if not bound > 0:
        raise ValueError(
            f'bound must be a positive number, got {bound!r}')

    corr = df.corr(method=method)
    corr_abs = corr.abs()

    if bound >= 1:
        # Top-N mode; bound == 1 keeps only the strongest row (the target).
        cols = corr_abs.nlargest(bound, target)[target].index
    else:
        # Threshold mode: 0 < bound < 1.
        cols = corr_abs.columns[corr_abs[target] >= bound]

    # Slice the matrix computed above instead of calling `df[cols].corr()`,
    # so the displayed values use the requested `method`, not always Pearson.
    selected = corr.loc[cols, cols]

    # Order columns, then rows, by absolute correlation with the target.
    ordered = selected.sort_values(
        target, axis=1, ascending=False, key=abs).sort_values(
            target, axis=0, ascending=False, key=abs)

    ordered.iplot(kind='heatmap',
                  center_scale=0,
                  title=f'TOP {len(cols)} correlations')

In [5]:
def next_scatter(df, col_pairs:list, third_col=None):
    """Generator that shows one OLS-trendline scatter plot per column pair.

    Each element of ``col_pairs`` is read as ``pair[0]`` (x axis) and
    ``pair[1]`` (y axis); ``third_col`` is passed to plotly as the colour
    dimension. The figure title reports the correlation between the two
    columns as computed by ``Series.corr``.

    The function is always a generator, so nothing is drawn until it is
    advanced. When ``col_pairs`` holds more than one pair it yields ``None``
    after every figure, letting the caller step through the plots with
    ``next()``. When it holds exactly one pair no value is yielded, so the
    first ``next()`` shows the figure and then raises ``StopIteration``.
    """
    for pair in col_pairs:
        x_col, y_col = pair[0], pair[1]

        point_labels = ["Index " + label for label in map(str, df.index)]
        coefficient = df[x_col].corr(df[y_col])
        heading = (f'Correlation between {x_col} and {y_col}\n'
                   f'    with coefficient {coefficient:.2f}')

        figure = px.scatter(
            df,
            x=x_col,
            y=y_col,
            color=third_col,
            trendline="ols",
            color_discrete_sequence=px.colors.qualitative.Prism,
            color_continuous_scale=px.colors.sequential.Aggrnyl,
            hover_name=point_labels,
            title=heading,
        )
        figure.show()

        # Pause between figures only when there is more than one to show.
        if len(col_pairs) != 1:
            yield

In [2]:
def next_hist(df, cols:list):
    """Generator that shows one histogram per column in ``cols``.

    After each figure, the skewness and kurtosis of the column are printed
    when the column has a numeric, non-boolean dtype. Columns of any other
    dtype (object, categorical, datetime, ...) are plotted without the
    statistics instead of failing inside ``Series.skew``.

    The function is always a generator, so nothing is drawn until it is
    advanced. When ``cols`` holds more than one column it yields ``None``
    after every figure, letting the caller step through the plots with
    ``next()``. When it holds exactly one column no value is yielded, so the
    first ``next()`` shows the figure and then raises ``StopIteration``.
    """
    for next_col in cols:
        fig = px.histogram(
            df,
            x=next_col,
            hover_name=["Index " + str(i) for i in df.index],
            color_discrete_sequence=px.colors.qualitative.Prism,
            title=f'Histogram of {next_col}',
            opacity=0.8)
        fig.show()

        col_dtype = df[next_col].dtype
        # Comparing the dtype against names only catches object/str/bool;
        # extension dtypes such as `category` would slip through and make
        # `.skew()` raise. Test for "numeric but not boolean" explicitly.
        has_moments = (pd.api.types.is_numeric_dtype(col_dtype)
                       and not pd.api.types.is_bool_dtype(col_dtype))
        if has_moments:
            print(
                f'Skewness {df[next_col].skew():.2f}\nKurtosis {df[next_col].kurt():.2f}')

        # Pause between figures only when there is more than one to show.
        if len(cols) != 1:
            yield

In [13]:
def next_violin(df, cols: list, target):
    """Generator that shows one violin plot of ``target`` per column in ``cols``.

    Every figure puts the current column on the x axis and ``target`` on the
    y axis, with an inner box plot and suspected outliers drawn as points.

    The function is always a generator, so nothing is drawn until it is
    advanced. When ``cols`` holds more than one column it yields ``None``
    after every figure, letting the caller step through the plots with
    ``next()``. When it holds exactly one column no value is yielded, so the
    first ``next()`` shows the figure and then raises ``StopIteration``.
    """
    for next_col in cols:
        point_labels = ["Index " + label for label in map(str, df.index)]

        violin_options = dict(
            y=target,
            x=next_col,
            box=True,
            hover_name=point_labels,
            points='suspectedoutliers',
            color_discrete_sequence=px.colors.qualitative.Prism,
            title=f'{target} distribution for {next_col} feature',
        )
        figure = px.violin(df, **violin_options)
        figure.show()

        # Pause between figures only when there is more than one to show.
        if len(cols) != 1:
            yield