# Target summary with cat

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [15]:
# Load seaborn's 'titanic' example dataset; every later cell reads this frame as `df`.
df = sns.load_dataset('titanic')
# Show all columns and widen the text display so printed frames are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)

# Cat Summary Function

In [16]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the frequency table of one column, optionally with a count plot.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    col_name : str
        Column whose distinct values are counted.
    plot : bool, default False
        When true, also draw a seaborn count plot of the column.

    Returns
    -------
    None
        The table (value counts under ``col_name`` and their percentage of
        all rows under ``Ratio``, rounded to two decimals) and a separator
        line are printed.
    """
    counts = dataframe[col_name].value_counts()
    # The denominator is the full row count of the frame, not the number of
    # counted values.
    ratios = (100 * counts / len(dataframe)).round(2)
    summary = pd.DataFrame({col_name: counts, 'Ratio': ratios})
    print(summary)

    if plot:
        sns.countplot(x=col_name, data=dataframe)
        plt.show()

    print('######################################################################\n')

# Num Summary Function

In [17]:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of one column, optionally with a histogram.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    numerical_col : str
        Column to describe; its upper-cased name is printed as a header.
    plot : bool, default False
        When true, also draw a histogram titled and x-labelled with the
        column name.

    Returns
    -------
    None
        The ``describe`` output (with the 10%..90% percentiles) and a
        separator line are printed.
    """
    print(numerical_col.upper())

    column = dataframe[numerical_col]
    deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    stats = column.describe(deciles).T
    print(stats)

    if plot:
        column.hist()
        plt.title(numerical_col)
        plt.xlabel(numerical_col)
        plt.show()

    print('################################################################\n')

# Grab All Columns

In [18]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numerical and cardinal groups.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        An ``int64``/``float64`` column with fewer than ``cat_th`` distinct
        values is treated as categorical.
    car_th : int, default 20
        A ``category``/``object`` column with more than ``car_th`` distinct
        values is treated as cardinal and removed from the categorical list.

    Returns
    -------
    cat_cols : list
        Categorical columns, including the numeric-but-categorical ones and
        excluding the cardinal ones.
    num_cols : list
        ``int64``/``float64`` columns that are not in ``cat_cols``.
    cat_but_car : list
        Categorical-looking columns with high cardinality.

    Notes
    -----
    Dtypes are matched by their exact string name, so only ``category``,
    ``object``, ``bool``, ``int64`` and ``float64`` columns are recognised;
    any other dtype ends up in none of the returned lists.
    A short report of the group sizes is printed as a side effect.
    """
    categorical_dtypes = ['category', 'object', 'bool']
    cardinal_dtypes = ['category', 'object']
    numeric_dtypes = ['int64', 'float64']

    # Every lookup goes through the `dataframe` argument so that the function
    # classifies the frame it was given rather than a notebook-level global.
    cat_cols = [col for col in dataframe.columns
                if str(dataframe[col].dtype) in categorical_dtypes]
    num_but_cat = [col for col in dataframe.columns
                   if str(dataframe[col].dtype) in numeric_dtypes
                   and dataframe[col].nunique() < cat_th]
    cat_but_car = [col for col in dataframe.columns
                   if str(dataframe[col].dtype) in cardinal_dtypes
                   and dataframe[col].nunique() > car_th]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    num_cols = [col for col in dataframe.columns
                if str(dataframe[col].dtype) in numeric_dtypes]
    num_cols = [col for col in num_cols if col not in cat_cols]

    print(f'Observations: {dataframe.shape[0]}')
    print(f'Variables: {dataframe.shape[1]}')
    print(f'Categorical Variables: {len(cat_cols)}')
    print(f'Numerical Variables: {len(num_cols)}')
    print(f'Numerical But Categorical Variables: {len(num_but_cat)}')
    print(f'Categorical But Cardinal: {len(cat_but_car)}')

    return cat_cols, num_cols, cat_but_car

In [19]:
# Classify the titanic columns with the default thresholds; grab_col_names prints the group sizes itself.
cat_cols_df, num_cols_df, cat_but_car_df = grab_col_names(df)

Observations: 891
Variables: 15
Categorical Variables: 13
Numerical Variables: 2
Numerical But Categorical Variables: 4
Categorical But Cardinal: 0


# Target Summary With Cat

In [24]:
def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean of ``target`` for each level of ``categorical_col``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds both columns.
    target : str
        Column whose mean is computed within each group.
    categorical_col : str
        Column used as the grouping key.

    Returns
    -------
    None
        A one-column table labelled ``Target Mean`` and a separator line
        are printed.
    """
    group_means = dataframe.groupby(categorical_col)[target].mean()
    summary = pd.DataFrame({'Target Mean': group_means})
    print(summary)
    print('################################################################\n')

In [25]:
# Print the mean of 'survived' within each level of every column classified as categorical
# (cat_cols_df also contains the low-cardinality numeric columns).
for col in cat_cols_df:
    target_summary_with_cat(df, 'survived', col)

        Target Mean
sex                
female     0.742038
male       0.188908
################################################################

          Target Mean
embarked             
C            0.553571
Q            0.389610
S            0.336957
################################################################

        Target Mean
class              
First      0.629630
Second     0.472826
Third      0.242363
################################################################

       Target Mean
who               
child     0.590361
man       0.163873
woman     0.756458
################################################################

            Target Mean
adult_male             
False          0.717514
True           0.163873
################################################################

      Target Mean
deck             
A        0.466667
B        0.744681
C        0.593220
D        0.757576
E        0.750000
F        0.615385
G        0.500000
###########################

# Target Summary With Num

In [39]:
def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of ``numerical_col`` within each group of ``target``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds both columns.
    target : str
        Column used as the grouping key.
    numerical_col : str
        Column whose per-group mean is reported.

    Returns
    -------
    None
        The aggregated table is printed.
    """
    aggregation = {numerical_col: 'mean'}
    group_means = dataframe.groupby(target).agg(aggregation)
    print(group_means)

In [40]:
# Mean age within each 'survived' group.
target_summary_with_num(df, 'survived', 'age')

                age
survived           
0         30.626179
1         28.343690


In [41]:
# Repeat the per-'survived' mean for every column classified as numerical.
for col in num_cols_df:
    target_summary_with_num(df, 'survived', col)

                age
survived           
0         30.626179
1         28.343690
               fare
survived           
0         22.117887
1         48.395408
