**Catch categorical, numerical and cardinal variables**

In [5]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numerical and cardinal.

    A column is "label-like" when its dtype is object, pandas categorical or a
    pandas string dtype. Every other dtype is treated as numeric.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-label column with fewer than ``cat_th`` unique values is
        reported as categorical ("numerical but categorical").
    car_th : int, default 20
        A label-like column with more than ``car_th`` unique values is
        reported as cardinal ("categorical but cardinal").

    Returns
    -------
    cat_cols : list
        Label-like columns that are not cardinal, followed by the
        numerical-but-categorical columns.
    num_cols : list
        Non-label columns with at least ``cat_th`` unique values.
    cat_but_car : list
        Label-like columns with more than ``car_th`` unique values.

    A short summary of the counts is printed as a side effect.
    """
    # Local imports keep the block self-contained.
    from pandas import CategoricalDtype
    from pandas.api.types import is_object_dtype, is_string_dtype

    def _is_label_like(dtype):
        # The object check comes first so plain object columns are classified
        # exactly as the former ``dtype == "O"`` test did.
        return (
            is_object_dtype(dtype)
            or isinstance(dtype, CategoricalDtype)
            or is_string_dtype(dtype)
        )

    columns = list(dataframe.columns)

    # Compute the per-column facts once instead of once per list.
    label_like = {}
    n_unique = {}
    for col in columns:
        series = dataframe[col]
        label_like[col] = _is_label_like(series.dtype)
        n_unique[col] = series.nunique()

    # cat_cols, cat_but_car
    num_but_cat = [col for col in columns
                   if not label_like[col] and n_unique[col] < cat_th]
    cat_but_car = [col for col in columns
                   if label_like[col] and n_unique[col] > car_th]
    cardinal = set(cat_but_car)
    cat_cols = [col for col in columns
                if label_like[col] and col not in cardinal] + num_but_cat

    # num_cols
    low_cardinality = set(num_but_cat)
    num_cols = [col for col in columns
                if not label_like[col] and col not in low_cardinality]

    print(f'Observations: {dataframe.shape[0]}')
    print(f'Variables: {dataframe.shape[1]}')
    print(f'Categorical Variables: {len(cat_cols)}')
    print(f'Numerical Variables: {len(num_cols)}')
    print(f'Numerical But Categorical Variables: {len(num_but_cat)}')
    print(f'Categorical But Cardinal: {len(cat_but_car)}')

    return cat_cols, num_cols, cat_but_car

**Summarize the categorical variables**

In [11]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the value counts of ``col_name`` and each value's share in percent.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column.
    col_name : str
        Column to summarise.
    plot : bool, default False
        When truthy, also draw a seaborn count plot of the column.

    Returns
    -------
    None
        The table and a 100-character ``#`` separator are printed.
    """
    counts = dataframe[col_name].value_counts()
    # Percentage of all rows in the frame, rounded to two decimals.
    share = round(100 * counts / len(dataframe), 2)
    summary = pd.DataFrame({col_name: counts, 'Ratio': share})
    print(summary)

    if plot:
        plt.figure(figsize=(5, 3))
        sns.countplot(x=col_name, data=dataframe)
        plt.xticks(rotation=90)
        plt.show()

    print('#' * 100)

**Summarize the numerical variables**

In [3]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of ``numerical_col`` with decile percentiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column.
    numerical_col : str
        Column to describe; its upper-cased name is printed as a header.
    plot : bool, default False
        When truthy, also draw a seaborn histogram of the column.

    Returns
    -------
    None
        The statistics and a 100-character ``#`` separator are printed.
    """
    header = numerical_col.upper()
    print(header)

    deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    stats = dataframe[numerical_col].describe(percentiles=deciles)
    print(stats.T)

    if plot:
        sns.histplot(data=dataframe, x=numerical_col)
        plt.title(numerical_col)
        plt.xlabel(numerical_col)
        plt.show()

    print('#' * 100)

**Target Variable Analysis**

In [6]:
def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean of ``target`` for each group of ``categorical_col``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing both columns.
    target : str
        Column whose mean is computed per group.
    categorical_col : str
        Column to group by.

    Returns
    -------
    None
        The table and a 100-character ``#`` separator are printed.
    """
    group_means = dataframe.groupby(categorical_col)[target].mean()
    table = pd.DataFrame({'Target Mean': group_means})
    print(table)
    print('#' * 100)

**Outlier Analysis**

In [10]:
def outlier_threshold(dataframe, col_name, q1=0.05, q3=0.95):
    """Compute outlier limits for ``col_name`` from two quantiles.

    The limits lie 1.5 times the inter-quantile range beyond the ``q1`` and
    ``q3`` quantiles of the column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column.
    col_name : str
        Column to analyse.
    q1 : float, default 0.05
        Lower quantile.
    q3 : float, default 0.95
        Upper quantile.

    Returns
    -------
    tuple
        ``(up_limit, low_limit)`` -- note that the upper limit comes first.
    """
    column = dataframe[col_name]
    lower_q = column.quantile(q1)
    upper_q = column.quantile(q3)

    margin = 1.5 * (upper_q - lower_q)

    return upper_q + margin, lower_q - margin

In [11]:
def check_outlier(dataframe, col_name):
    """Report whether ``col_name`` holds any value outside its outlier limits.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column.
    col_name : str
        Column to check.

    Returns
    -------
    bool
        ``True`` when at least one value is below the lower limit or above
        the upper limit returned by ``outlier_threshold``; ``False`` otherwise.
    """
    up_limit, low_limit = outlier_threshold(dataframe, col_name)

    column = dataframe[col_name]
    is_outlier = (column < low_limit) | (column > up_limit)

    # Test the mask itself. Calling ``.any(axis=None)`` on the selected rows
    # asks whether those rows contain a truthy cell, so an outlier row made
    # only of zeros/False would be reported as "no outliers".
    return bool(is_outlier.any())

In [6]:
def grap_outliers(dataframe, col_name, index=False):
    """Return the rows of ``dataframe`` whose ``col_name`` value is an outlier.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column.
    col_name : str
        Column to check against the limits from ``outlier_threshold``.
    index : bool, default False
        When truthy, return the index of all outlier rows instead of the rows.

    Returns
    -------
    pandas.DataFrame or pandas.Index
        With ``index`` falsy: the outlier rows, cut to the first five
        (``head()``) when there are more than ten. With ``index`` truthy:
        the index labels of every outlier row.
    """
    # outlier_threshold returns (up_limit, low_limit), in that order.
    up_limit, low_limit = outlier_threshold(dataframe, col_name)

    column = dataframe[col_name]
    outliers = dataframe[(column < low_limit) | (column > up_limit)]

    # Handle the index request first; placed after the row returns it could
    # never be reached.
    if index:
        return outliers.index

    if outliers.shape[0] > 10:
        return outliers.head()
    return outliers

In [7]:
def catch_outliers(dataframe, col, return_df=False):
    """Print how many rows of ``dataframe`` are outliers in ``col``.

    Limits come from ``outlier_threshold`` with the 0.05 and 0.95 quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column.
    col : str
        Column to check.
    return_df : bool, default False
        When equal to ``True``, the outlier rows are returned.

    Returns
    -------
    pandas.DataFrame or None
        The outlier rows when ``return_df == True``; otherwise ``None``.
    """
    # outlier_threshold returns (up_limit, low_limit), in that order.
    up_limit, low_limit = outlier_threshold(dataframe, col, q1=0.05, q3=0.95)

    values = dataframe[col]
    beyond_limits = (values > up_limit) | (values < low_limit)
    outliers_df = dataframe[beyond_limits]

    print(f'{col} of total outliers: {len(outliers_df)} rows')

    # Equality (not truthiness) is kept so the accepted flag values stay the same.
    return outliers_df if return_df == True else None

In [13]:
def replace_thresholds(dataframe, col_name):
    """Cap the values of ``col_name`` at its outlier limits, in place.

    Values below the lower limit from ``outlier_threshold`` are set to that
    limit, and values above the upper limit are set to the upper limit.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; it is changed through ``.loc`` assignment.
    col_name : str
        Column to cap.

    Returns
    -------
    None
    """
    # outlier_threshold returns (up_limit, low_limit), in that order.
    upper, lower = outlier_threshold(dataframe, col_name)

    too_low = dataframe[col_name] < lower
    dataframe.loc[too_low, col_name] = lower

    too_high = dataframe[col_name] > upper
    dataframe.loc[too_high, col_name] = upper

**Missing Value**

In [14]:
def missing_value_table(dataframe, na_name=False):
    """Print a table of missing-value counts and percentages per column.

    Only columns with at least one null value are listed. Counts and ratios
    are each sorted in descending order before being joined side by side.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    na_name : bool, default False
        When equal to ``True``, the names of the columns with nulls are returned.

    Returns
    -------
    list or None
        Column names with missing values (in frame order) when
        ``na_name == True``; otherwise ``None``.
    """
    na_columns = []
    for col in dataframe.columns:
        if dataframe[col].isnull().sum() > 0:
            na_columns.append(col)

    null_counts = dataframe[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=False)
    # Share of rows that are null, in percent.
    ratio = (null_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, ratio.round(2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df)

    # Equality (not truthiness) is kept so the accepted flag values stay the same.
    return na_columns if na_name == True else None