In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, r2_score
from catboost import Pool, CatBoostClassifier, CatBoostRegressor

pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [None]:
!jupyter nbextension enable --py widgetsnbextension

In [None]:
input_path = '../../data/input/'
output_path = '../../data/output/'
input_filename = 'marketing_campaign.csv'

report_dttm = None

### Getting the data

The data comes from the Kaggle "Customer Personality Analysis" dataset: https://www.kaggle.com/datasets/imakash3011/customer-personality-analysis

In [None]:
df = pd.read_csv(input_path+input_filename, sep='\t')
print(df.shape)
print(np.round(df.memory_usage(deep=True).sum() / 1024**2, 2))
df.head()

# df.to_parquet(f'{output_path}customer_segmentation.parquet.gzip', compression='gzip')
# df = pd.read_parquet(f'{output_path}customer_segmentation.parquet.gzip', engine='fastparquet')

In [None]:
pd.concat([df.isnull().sum(),df.dtypes], axis=1)

### Preprocessing (analysis)

In [None]:
# ARRAY-type columns come from BQ as an 'object', so it's necessary to specify them manually
# if there are some

# names of the array-like columns; empty means "no array columns in this dataset"
array_cols_list = [] # ban_type
# prints True only when the list is non-empty AND every listed column exists in df
# (an empty list therefore prints False)
print('All columns are in the DataFrame: ', len(array_cols_list) > 0 and len(set(array_cols_list).intersection(set(df.columns))) == len(array_cols_list))

In [None]:
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format="%d-%m-%Y")

#### Replace empty strings with NaN

In [None]:
%%time 

# replaces empty string "" with NULL
for col in df.select_dtypes(include='O').columns:
    df[col] = df[col].replace(r'^\s*$', np.nan, regex=True)
    
print(np.round(df.memory_usage(deep=True).sum() / 1024**2, 2))

#### Drop useless columns

In [None]:
%%time 

# deletes entirely blank columns
empty_cols = list(set(df.columns) - set(df.dropna(axis=1, how='all').columns))
one_val_cols = [col for col in df.columns if df[col].nunique() == 1]
if empty_cols:
    print('These columns only contain NULLs:\n', ', '.join(empty_cols))
    df = df.drop(empty_cols, axis=1)

# deletes the columns consisting of only one value
if one_val_cols:
    print('These columns contain a single value:\n', ', '.join(one_val_cols))
    df = df.drop(one_val_cols, axis=1)

print(np.round(df.memory_usage(deep=True).sum() / 1024**2, 2))

#### Datatypes optimization, reducing memory usage *(optional)*

In [None]:
def get_subtype_limits(type_name, subtype_name):
    """
    returns min and max values according to the data type

    type_name    -- dtype family of the source column; only whether it contains
                    'int' or 'float' (case-insensitive) is checked
    subtype_name -- candidate dtype name (e.g. 'UInt8', 'float32'); lower-cased
                    before being passed to np.iinfo / np.finfo

    Returns a (min, max) tuple, or None when type_name is neither family.
    """
    subtype_name = subtype_name.lower()
    family = type_name.lower()
    if 'int' in family:
        info = np.iinfo(subtype_name)
    elif 'float' in family:
        info = np.finfo(subtype_name)
    else:
        return None
    return (info.min, info.max)


def reduce_memory(data):
    """
    returns a copy of the DataFrame with columns converted to smaller dtypes

    * int64 / float64 columns are cast to the first candidate dtype whose range
      contains the column's min and max (bounds are inclusive);
    * numeric columns of any other dtype are left untouched;
    * object columns are cast to 'category' unless their first non-null value
      is a list / ndarray.

    Memory usage (MB) is printed before and after. `data` itself is not modified.
    """
    df = data.copy()
    print(np.round(df.memory_usage(deep=True).sum() / 1024**2, 2))
    subtypes_dict = {
        'int64': ['UInt8','UInt16','UInt32','UInt64','Int8','Int16','Int32'],
        'float64': ['float16','float32'],
        'object': ['category']
    }

    for col in df.select_dtypes(include='number').columns:
        col_type = str(df[col].dtypes).lower()
        # only int64/float64 have candidates; any other numeric dtype
        # (int32, uint8, float32, ...) is skipped instead of raising KeyError
        candidates = subtypes_dict.get(col_type)
        if not candidates:
            continue

        min_column_value = df[col].min()
        max_column_value = df[col].max()
        # an all-null column has no min/max to compare against
        if pd.isna(min_column_value) or pd.isna(max_column_value):
            continue

        # NOTE(review): float candidates are chosen by range only, so a cast to
        # float16/float32 can lose precision - confirm this is acceptable
        for subtype in candidates:
            min_subtype_value, max_subtype_value = get_subtype_limits(col_type, subtype)
            # inclusive bounds: a column whose minimum is exactly 0 fits an unsigned type
            if min_subtype_value <= min_column_value and max_column_value <= max_subtype_value:
                df[col] = df[col].astype(subtype)
                break

    for col in df.select_dtypes(include=['object']).columns:
        # inspect the first NON-NULL value; a leading NULL must not hide array cells
        non_null = data[col].dropna()
        first_value = non_null.iloc[0] if len(non_null) else None
        if not isinstance(first_value, (np.ndarray, list)):
            df[col] = df[col].astype('category')
    print(np.round(df.memory_usage(deep=True).sum() / 1024**2, 2))
    return df

In [None]:
%%time 
df_reduced = reduce_memory(df)

In [None]:
df_reduced.to_pickle(f'{output_path}df_reduced.pkl')
df_reduced = pd.read_pickle(f'{output_path}df_reduced.pkl')

#### Changing datatypes to numeric (except "object"/"category")

In [None]:
report_dttm = df['Dt_Customer'].max().to_pydatetime()

In [None]:
def df_date_to_int(data, report_dttm=report_dttm, unit_muliplier=31):
    """
    translating a date into the number of months (days / unit_muliplier) from that date

    Every datetime column of `data` is replaced, in place, by the whole number of
    `unit_muliplier`-day periods between its value and the date of `report_dttm`.

    data           -- DataFrame, modified in place
    report_dttm    -- reference datetime, only its date part is used
                      (the default is captured when this cell is executed)
    unit_muliplier -- period length in days; the day difference is divided by it
    """
    dttm_cols = data.select_dtypes(include=['datetime64[ns, UTC]','datetime64[ns]']).columns
    if dttm_cols.tolist():
        report_date = pd.to_datetime(report_dttm.date())
        for col in dttm_cols:
            # use the column being converted, not a hard-coded column of the global frame
            values = data[col]
            if values.dt.tz is not None:
                # report_date is tz-naive, so drop the timezone before subtracting
                values = values.dt.tz_convert(None)
            data[col] = ((report_date - values).dt.days / unit_muliplier).astype('int64')

    # dt_cols = data.drop(dttm_cols, axis=1).select_dtypes(include='dbdate').columns
    # for col in dt_cols:
    #     data[col] = (report_dttm.date() - pd.to_datetime(data[col]).dt.date).dt.days
    print('df_date_to_int: Done')

def df_bool_to_int(data):
    """
    casts every bool column of the DataFrame to UInt8, in place
    """
    for bool_col in data.select_dtypes(include='bool').columns.tolist():
        data[bool_col] = data[bool_col].astype('UInt8')
    print('df_bool_to_int: Done')

def df_array_cols_parse(data, array_columns_list=None):
    """
    expands array columns into 0/1 indicator columns, in place

    For every distinct non-blank element found in an array column a column named
    'is_<element>_ban_type' is added: 1 where the row's array contains the element,
    0 otherwise. The array columns themselves are dropped afterwards.
    Nothing is changed when array_columns_list is empty or None.
    """
    if array_columns_list:
        for col in array_columns_list:
            # distinct elements across all rows, blank strings ignored
            distinct_elements = set()
            for cell in data[col].tolist():
                for item in cell:
                    if item.strip() != '':
                        distinct_elements.add(item)

            for elem in list(distinct_elements):
                flag_col = f'is_{elem}_ban_type'
                data[flag_col] = 0
                contains_elem = data[col].apply(lambda cell: elem in cell)
                rows_with_elem = data[contains_elem].index
                data.loc[rows_with_elem, flag_col] = 1
        data.drop(array_columns_list, axis=1, inplace=True)
    print('df_array_cols_parse: Done')


In [None]:
%%time 

# all three helpers modify df in place
df_date_to_int(df)
df_bool_to_int(df)
# the array columns have to be passed explicitly, otherwise the parser does nothing
df_array_cols_parse(df, array_cols_list)
df.head()

### EDA

In [None]:
df.describe()

#### Outliers

In [None]:
# 2 options
# column whose extreme values are treated as outliers
feature='Year_Birth'

# based on IQR
# Q1 = df[feature].quantile(.25)
# Q3 = df[feature].quantile(.75)
# IQR = Q3 - Q1
# bounds = (Q1-1.5*IQR, Q3+1.5*IQR)

# based on cutting 2% from each side
bounds = df[feature].quantile([.02,.98]).values

# replace with None, fill it later
# (comparisons are inclusive: values equal to a bound are blanked as well;
#  these NULLs have to be filled before the scaling / clustering steps)
df.loc[df[(df[feature]<=min(bounds)) | (df[feature]>=max(bounds))].index, feature] = None

#### Correlation check

In [None]:
corr=df.drop(columns='ID').select_dtypes(include='number').corr()
corr.style.background_gradient()

In [None]:
# collects pairs of highly correlated columns from the `corr` matrix
corr_dict = {'column1':[], 'column2':[], 'corr_coef':[]}
thrshld = .85 # threshold after which we consider columns as highly correlated

for col in corr.columns:
    # rows whose absolute correlation with `col` reaches the threshold (includes `col` itself)
    high_corr_ftrs = corr[abs(corr[col]) >= thrshld].index
    if list(high_corr_ftrs):
        for col2 in high_corr_ftrs:
            # skip the diagonal and any partner already recorded as a 'column1',
            # so a pair is not listed a second time in reverse order
            if col2 != col and col2 not in corr_dict['column1']:
                # coefficient expressed in percent
                corr_coef = np.round(corr.loc[col2, col] * 100,2)
                corr_dict['column1'].append(col)
                corr_dict['column2'].append(col2)
                corr_dict['corr_coef'].append(corr_coef)

pd.DataFrame(corr_dict).sort_values('corr_coef', ascending=False)

In [None]:
cols_to_hide = set(corr_dict['column2'])
df.drop(columns=cols_to_hide, inplace=True)

#### Dealing with NULLs

##### Replacing with zero / mean / median / mode

In [None]:
fig, ax = plt.subplots(1,5,figsize=(15, 4), sharey=True)
feature = 'Income'

for _ in range(0,5):
    ax[_].grid(axis='y')

sns.kdeplot(df[feature], ax=ax[0])
sns.kdeplot(df[feature].fillna(0), ax=ax[1])
sns.kdeplot(df[feature].fillna(df[feature].mean()), ax=ax[2])
sns.kdeplot(df[feature].fillna(df[feature].median()), ax=ax[3])
sns.kdeplot(df[feature].fillna(df[feature].mode()[0]), ax=ax[4])

ax[0].title.set_text('Original')
ax[1].title.set_text('Zero')
ax[2].title.set_text('Average')
ax[3].title.set_text('Median')
ax[4].title.set_text('Mode');

In [None]:
# strategy -> columns whose NULLs are filled with that strategy (consumed by df_fill_null)
# Year_Birth is listed because the outlier step blanks its extreme values
# and nothing else fills them before scaling / clustering
filling_dict = {
    'mean':['Income', 'Year_Birth'],
    'mode':[],
    'zero':[],
    'max':[]
}

In [None]:
def df_fill_null(df, filling_dict, max_value=9999):
    """
    fills NULLs with zeroes/ average/ mode/ etc.

    df           -- source DataFrame; it is not modified, a filled copy is returned
    filling_dict -- {'mean' | 'mode' | 'zero' | 'max': [column names]};
                    missing keys, empty lists and columns absent from df are ignored
    max_value    -- constant used for the columns listed under 'max'

    'mean' fills with the column mean rounded to the nearest integer.
    A column with no mean / mode (all NULLs) is reported and left as is.
    """
    data = df.copy()

    def _requested(strategy):
        # columns listed for `strategy` that actually exist in the frame
        return [col for col in (filling_dict.get(strategy) or []) if col in data.columns]

    for col in _requested('mean'):
        mean = data[col].astype('float64').mean()
        if np.isnan(mean):
            # all-NULL column: nothing to average, and round(nan) would raise
            print(f'Column {col} has no mean!')
            continue
        data[col] = data[col].fillna(round(mean))

    for col in _requested('mode'):
        mode = data[col].mode()
        if mode.size > 0:
            data[col] = data[col].fillna(mode.iloc[0])
        else:
            print(f'Column {col} has no mode!')

    cols_to_zero = _requested('zero')
    if cols_to_zero:
        data = data.fillna(dict.fromkeys(cols_to_zero, 0))

    cols_to_max = _requested('max')
    if cols_to_max:
        data = data.fillna(dict.fromkeys(cols_to_max, max_value))

    return data

In [None]:
df = df_fill_null(df, filling_dict)

##### Filling with ML

In [None]:
col_to_fill = 'Income' # one of the columns containing NULLs to be filled

# features: columns with no NULLs, without the ID and without the target itself;
# leaving the target out here keeps it from appearing twice when it has no NULLs left
feature_cols = [
    col for col in df.columns[~df.isnull().any()]
    if col not in ('ID', col_to_fill)
]
temp_df = df[feature_cols + [col_to_fill]].copy()

# 'category' columns are converted to plain strings before modelling
category_cols = temp_df.select_dtypes(include='category').columns.tolist()
if category_cols:
    for col in tqdm(category_cols):
        temp_df[col] = temp_df[col].astype(str)

# rows with a known target are used for training, the others are the ones to fill
df_notnull = temp_df[temp_df[col_to_fill].notnull()]
df_null = temp_df[temp_df[col_to_fill].isnull()].drop(columns=col_to_fill)

print(df_notnull.shape)
print(df_null.shape)

In [None]:
X,y = df_notnull.drop(columns=col_to_fill), df_notnull[col_to_fill]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)

# object columns are passed to CatBoost as categorical features
cat_features = X.select_dtypes(include='O').columns.tolist()

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
# build the prediction pool only when there are rows to fill
# NOTE(review): Pool presumably rejects an empty DataFrame - confirm
fill_pool = Pool(data=df_null, cat_features=cat_features) if len(df_null) > 0 else None

In [None]:
model = CatBoostRegressor(
    iterations=10,
    depth=2,
    learning_rate=.5,
    loss_function='RMSE',
    verbose=False
)

model.fit(train_pool)
y_pred = model.predict(test_pool)
print(r2_score(y_test, y_pred))

In [None]:
# nothing to predict when the target has no NULL rows (e.g. it was already filled above)
if len(df_null) > 0:
    df.loc[df_null.index, col_to_fill] = model.predict(fill_pool)

#### Plots

In [None]:
sns.pairplot(df.drop('ID', axis=1), corner=True, plot_kws = {'s': 1, 'alpha': 0.3});

In [None]:
df.columns

In [None]:
sns.catplot(x=pd.qcut(df['Income'], 6, duplicates='drop').astype(str), y=df['Teenhome'], s=5, alpha=.2);

In [None]:
sns.boxplot(x=df['Teenhome'], hue=pd.qcut(df['Income'], 6, duplicates='drop'), showfliers=False);

In [None]:
sns.boxplot(y=df['Income'], x=df['Dt_Customer'], showfliers=False);

### Preprocessing (clustering)

#### Splitting data to categories (segments) *(optional)*

In [None]:
%%time

elements_thrshld = 8 # columns with <elements_thrshld> or fewer unique values are not split into segments
n_segments = 5 # how many segments we want the data to be split to

# numeric columns (ID excluded) with more than <elements_thrshld> distinct values
cols_to_segment = [col for col in df.drop(columns=['ID'])._get_numeric_data().columns if df[col].nunique() > elements_thrshld]
for col in cols_to_segment:
    # equal-width bins, stored next to the source column as '<col>_segmented'
    new_colname = f'{col}_segmented'
    df[new_colname] = pd.cut(df[col], n_segments).astype('category')

print(df.shape)

In [None]:
df = df.drop(columns=cols_to_segment)

#### Encoding categorical variables

In [None]:
def df_ord_encoding(data, encoder):
    """
    Ordinal-encodes every category/object column of `data`, in place.

    For each such column a '<name>_encoded' column is added (a '_segmented'
    suffix is removed from <name> first); the source columns are kept.

    data    -- DataFrame, modified in place
    encoder -- object exposing fit_transform() and categories_
               (the notebook passes an OrdinalEncoder)

    Returns (cols_to_encode, new_colnames, coding_dict); coding_dict maps each
    new column to {code: original value}. When there is nothing to encode the
    encoder is not called and empty results are returned.

    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
    """
    cols_to_encode = data.select_dtypes(include=['category','object']).columns
    if len(cols_to_encode) == 0:
        # fitting an encoder on zero columns would raise
        return cols_to_encode, [], {}

    new_colnames = [col.replace('_segmented','') + '_encoded' for col in cols_to_encode]
    encoded = np.asarray(encoder.fit_transform(data.loc[:,cols_to_encode]))
    # assign column by column so that the new columns are simply created
    for position, new_col in enumerate(new_colnames):
        data[new_col] = encoded[:, position]

    coding_dict = {
        new_col: dict(enumerate(categories))
        for new_col, categories in zip(new_colnames, encoder.categories_)
    }
    return cols_to_encode, new_colnames, coding_dict

In [None]:
def df_cats_encoding(data):
    """
    One-hot encodes every category/object column and returns a new DataFrame.

    Each encoded column is replaced by one int32 indicator column per category,
    named '<column>_<category>' (a '_segmented' suffix is removed from <column>).
    The result keeps the index of `data`. A frame without category/object
    columns is returned as an unchanged copy.

    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
    """
    cols_to_encode = data.select_dtypes(include=['category','object']).columns
    if len(cols_to_encode) == 0:
        # nothing to encode; fitting an encoder on zero columns would raise
        return data.copy()

    encoder = OneHotEncoder(dtype='int32')
    res = encoder.fit_transform(data[cols_to_encode])
    # formatting (rather than str + category) also works for non-string
    # categories such as the intervals produced by pd.cut
    new_cols = [
        f"{colname.replace('_segmented','')}_{category}"
        for colname, categories in zip(cols_to_encode, encoder.categories_)
        for category in categories
    ]
    # reuse the caller's index so that concat aligns row by row
    encoded = pd.DataFrame(res.toarray(), columns=new_cols, index=data.index)
    return pd.concat([data, encoded], axis=1).drop(columns=cols_to_encode)

##### One-hot encoding

In [None]:
%%time

df = df_cats_encoding(df.reset_index(drop=True))
print(df.shape)

##### Ordinal encoding

In [None]:
ord_encoder = OrdinalEncoder(dtype='int32')

In [None]:
%%time

original_columns, encoded_columns, coding_dict = df_ord_encoding(df, ord_encoder)
df = df.drop(columns=original_columns)
print(df.shape)

#### Feature engineering

In [None]:
# create new features

print(df.shape)
print(np.round(df.memory_usage(deep=True).sum() / 1024**2, 2))
df.head(3)

#### Stats, checks after encoding

In [None]:
print('No NULLs:\t\t\t', df.isnull().sum().sum() == 0)
print('All columns are numeric:\t', set(df.columns) == set(df._get_numeric_data().columns))
print('DataFrame shape:\t\t', df.shape)
print('Memory usage:\t\t\t', df.memory_usage(deep=True).sum()/1024**2)

#### Scaling

In [None]:
%%time

scaler = StandardScaler()

# sampling (optional)
# df = df.sample(frac=1).reset_index(drop=True)

X = scaler.fit_transform(df.drop(columns=['ID']))
print(X.shape)

In [None]:
# saving X matrix
np.save(f'{output_path}clustering_x.npy', X)

# saving preprocessed dataframe
df.to_pickle(f'{output_path}df_preproc.pkl')

#### PCA *(optional)*

In [None]:
# full PCA, used only to inspect how much variance the leading components explain
pca = PCA().fit(X)
cum_variance = np.cumsum(pca.explained_variance_ratio_)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.set_xticks(np.arange(0, 100.5, 10))
ax.set_xticks(np.arange(0, 100.5, 5), minor=True)
ax.set_yticks(np.arange(.5, 1.05, .1))
ax.set_yticks(np.arange(.5, 1.05, .05), minor=True)
ax.grid(which='both')
ax.grid(which='minor', alpha=0.2)
ax.grid(which='major', alpha=0.6)

# x starts at 1: cum_variance[k-1] is the share explained by the first k components
plt.plot(np.arange(1, len(cum_variance) + 1), cum_variance)
plt.title('PCA')
plt.xlabel('N components')
plt.ylabel('Cumulative explained variance');

In [None]:
n_components = 10

# keep the fitted reducer so the same projection can be applied to other data
pca_reducer = PCA(n_components)
X = pca_reducer.fit_transform(X)

# the clustering model is trained on this reduced matrix, so it replaces the
# previously saved one; a reloaded model must be fed the same features
np.save(f'{output_path}clustering_x.npy', X)
X.shape

### Clustering

#### K-means training

In [None]:
from sklearn.cluster import KMeans
model_nm = 'kmeans'

In [None]:
%%time
max_clusters = 11

# Within-Cluster Sum of Squares
wcss = []
for i in tqdm(range(1, max_clusters)):
    km = KMeans(
        n_clusters = i,
        init = 'k-means++',
        random_state=123
    )
    km.fit(X)
    wcss.append(km.inertia_)
    
plt.rcParams['figure.figsize'] = (10, 7)
plt.plot(range(1, max_clusters), wcss)
plt.plot(range(1, max_clusters), wcss, 'o', linewidth=.2, color='black')
plt.grid()
plt.title('Elbow method')
plt.xlabel('N clusters')
plt.ylabel('WCSS');

In [None]:
differences = np.diff(wcss) / wcss[:-1] * 100
for nm, diff in enumerate(differences):
    print(nm+2, '\t', np.round(diff,2))

In [None]:
n_clusters = 8

In [None]:
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=123)
kmeans.fit(X)

# save/load centroids (optional)
# np.save(output_path+'centroids.npy', kmeans.cluster_centers_)
# centroids = np.load(output_path+'centroids.npy', allow_pickle=True)
# kmeans = KMeans(n_clusters=n_clusters, random_state=123, init=centroids, max_iter=1)

In [None]:
column_name = f'{model_nm}_{n_clusters}'
df[column_name] = kmeans.labels_
df[column_name] = df[column_name].astype(str)
# df.drop(columns=column_name, inplace=True)

df.to_pickle(f'{output_path}df_labeled.pkl')

In [None]:
pca = PCA(n_components=2)
pca_res = pca.fit_transform(X)
plt.figure(figsize=(12,8))
plt.scatter(pca_res[:,0], pca_res[:,1], c=kmeans.labels_, s=5, cmap='viridis')
plt.title('PCA');

In [None]:
pd.Series(kmeans.labels_).value_counts().sort_index() # .to_clipboard()

In [None]:
pd.Series(kmeans.labels_).value_counts(normalize=True).sort_index()*100

In [None]:
# saving model
with open(f'{output_path}Model_segmentation_{model_nm}_{n_clusters}.pkl','wb') as f:
    pickle.dump(kmeans,f)

#### Using pre-trained model

In [None]:
# dataframe (preprocessed filtered data)
df = pd.read_pickle(f'{output_path}df_preproc.pkl')
print('DataFrame:', df.shape)

# X (scaled matrix)
X = np.load(f'{output_path}clustering_x.npy', allow_pickle=True)
print('X:', X.shape)

# load model
with open(f'{output_path}Model_segmentation_{model_nm}_{n_clusters}.pkl', 'rb') as f:
    model = pickle.load(f)
    print('Loading the model: Done')

In [None]:
model_labels = model.predict(X)

column_name = f'{model_nm}_{n_clusters}'
df[column_name] = model_labels
df[column_name] = df[column_name].astype(str)

### Feature importances

In [None]:
input_filename = f'{output_path}df_labeled.pkl'
output_filename = f'{output_path}feature_importances_{n_clusters}_labeled_{model_nm}.csv'
print('input_filename:\t\t',input_filename)
print('output_filename:\t',output_filename)

In [None]:
df = pd.read_pickle(input_filename).reset_index(drop=True)
print(df.shape)
df.head(3)

In [None]:
category_cols = df.select_dtypes(include=['category','object']).columns.tolist()
if category_cols:
    for col in tqdm(category_cols):
        df[col] = df[col].astype(str)

##### MultiClass (not recommended)

In [None]:
%%time

column_name = f'{model_nm}_{n_clusters}'
# ID is a row identifier, not a feature (it is excluded from the clustering input as well)
X, y = df.drop(columns=['ID', column_name]), df[column_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)

# object columns are passed to CatBoost as categorical features
cat_features = X.select_dtypes(include='O').columns.tolist()

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

In [None]:
model = CatBoostClassifier(
    iterations=10,
    learning_rate=1,
    depth=2,
    loss_function='MultiClass'
)

model.fit(train_pool)
y_pred = model.predict(test_pool)

In [None]:
cf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(cf_matrix/ np.sum(cf_matrix), annot=True, fmt='.2%', cmap='Blues')

In [None]:
feature_imptce_df = pd.DataFrame({
    'feature_importance': model.get_feature_importance(train_pool), 
    'feature_names': X_train.columns
}).sort_values('feature_importance', ascending=False).reset_index(drop=True)

feature_imptce_df.to_csv(output_filename, index=False)
feature_imptce_df.head()

##### SingleClass (N-clusters times)

In [None]:
n_clusters

In [None]:
column_name = f'{model_nm}_{n_clusters}'
# ID is a row identifier, not a feature (it is excluded from the clustering input as well)
X, y = df.drop(columns=['ID', column_name]), df[column_name]
cat_features = X.select_dtypes(include=['category','object']).columns.tolist()

# one binary "this cluster vs. the rest" model per cluster; its feature
# importances show what separates that cluster from all the others
importance_frames = []
for label in tqdm(range(n_clusters)):
    # cluster labels are stored as strings: 1 for the current cluster, 0 otherwise
    y_single = (y == str(label)).astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y_single, test_size=0.33, random_state=123, stratify=y_single)

    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)

    model = CatBoostClassifier(iterations=10,
                           learning_rate=1,
                           depth=2,
                           verbose=False)
    model.fit(train_pool)

    importance_frames.append(
        pd.DataFrame({
            'label':label,
            'feature_importance': model.get_feature_importance(train_pool),
            'feature_names': X_train.columns
        }).sort_values('feature_importance', ascending=False).reset_index(drop=True)
    )

# a single concat instead of growing the frame inside the loop
feature_imptce_df = pd.concat(importance_frames) if importance_frames else pd.DataFrame()

print(feature_imptce_df['label'].nunique())
feature_imptce_df.head()

In [None]:
feature_imptce_df.to_csv(output_filename, index=False)
feature_imptce_df.to_clipboard(index=False)

In [None]:
print(feature_imptce_df[feature_imptce_df['feature_importance'] > 0]['feature_names'].size)
feature_imptce_df[feature_imptce_df['feature_importance'] > 0]['feature_names']

### Cluster analysis

#### Main features

In [None]:
stat_df = pd.DataFrame()

for model_ in [_ for _ in df.columns if model_nm in _]:
    tmp = df.groupby(model_, as_index=False).agg({'ID':'nunique','Income':'sum'})
    tmp['user_pct'] = tmp['ID'] / tmp['ID'].sum() * 100
    tmp['income_pct'] = tmp['Income'] / tmp['Income'].sum() * 100
    tmp['income_per_user'] = tmp['Income'] / tmp['ID']
    tmp['model'] = model_
    tmp.columns = ['cluster','user_cnt','Income','user_pct','income_pct','income_per_user','model']

    stat_df = pd.concat([stat_df, tmp[['model','cluster','user_cnt','user_pct','Income','income_pct','income_per_user']]])

stat_df.to_clipboard(index=False)
stat_df.head()

In [None]:
metric_cols = ['Income', 'Kidhome','Teenhome', 'Recency', 'Year_Birth']

In [None]:
df.columns

In [None]:
n = len(metric_cols)
y = column_name
order = df[y].sort_values().unique().tolist()
fig, ax = plt.subplots(n,1,figsize=(8, 2*n))
c=0

for f in metric_cols:
    sns.boxplot(data=df, x=f, y=y, order = order, palette = 'viridis', showfliers=False, ax=ax[c])
    ax[c].set(xlabel='',ylabel='')
    ax[c].set_title(f, loc='left')
    c+=1

fig.tight_layout()
# plt.savefig(f'{output_path}{y}.png')

In [None]:
feature = 'Income'

In [None]:
# number of rows per (condition on the feature, cluster) combination
t = df.groupby([df[feature]==1, y], as_index=False).size()
# t = df.groupby([feature, y], as_index=False).size()
# t = df.groupby([df[feature].isin(range(90)), y], as_index=False).size()

# share of each combination inside its cluster, in percent
# ('sum' as a string: passing the builtin to transform is deprecated)
t['part'] = t['size'] / t.groupby(y)['size'].transform('sum') * 100
t.drop(columns='size').T

In [None]:
percs = [.1,.2,.25,.3,.4,.5,.6,.7,.75,.8,.85,.9,.95,.99]
print(feature)
df.groupby(y)[feature].describe(percentiles=percs).T

#### Additional features

In [None]:
feature = 'Marital_Status_Divorced'

In [None]:
fig, ax = plt.subplots(1,1,figsize=(8, 2))

ax = sns.boxplot(data=df, x=feature, y=column_name, order = order, palette = 'viridis', showfliers=False)
ax.set(xlabel='',ylabel='')
ax.set_title(feature, loc='left')

fig.tight_layout()

In [None]:
percs = [.1,.2,.25,.3,.4,.5,.6,.7,.75,.8,.85,.9,.95,.99]
print(feature)
df.groupby(column_name)[feature].describe(percentiles=percs).T

In [None]:
# t = df.groupby([feature, 'cluster'], as_index=False).size()
# number of rows per (feature > 0, cluster) combination
t = df.groupby([df[feature]>0, column_name], as_index=False).size()

# share of each combination inside its cluster, in percent
# ('sum' as a string: passing the builtin to transform is deprecated)
t['part'] = t['size'] / t.groupby(column_name)['size'].transform('sum') * 100
t.drop(columns='size').T #.to_clipboard()