# Customized functions
* stats_description - Extract descriptive statistics between categorical and numeric features.
* cat_counts - Perform counts on categorical features in the data.

In [1]:
import pandas as pd
import numpy as np

In [2]:
def stats_description(df, col1, col2):
    '''
    Summarise a numeric column for each level of a categorical column.

    Input:
    df - dataframe
    col1 - categorical column in df, used as the grouping key
    col2 - numeric column in df that is summarised; its name must be a
           string because it is appended to the output column labels

    Output:
    sats_summary - dataframe with one row per distinct value of col1 and
                   the columns mean_<col2>, median_<col2>, std_<col2>,
                   max_<col2>, min_<col2>, 5th perc_<col2> and
                   95th perc_<col2> (all float)

    Steps:
    1. Group col2 by col1 once and reuse the grouped object.
    2. Compute mean, median, std, max, min and the 5th / 95th percentiles.
    3. Assemble the statistics side by side into a single dataframe.

    Counts and percentage shares of col1 are not part of this table.
    '''
    grouped = df.groupby(col1)[col2]

    # Insertion order of this dict fixes the column order of the result.
    stats = {
        'mean_' + col2: grouped.mean(),
        'median_' + col2: grouped.median(),
        'std_' + col2: grouped.std(),
        'max_' + col2: grouped.max(),
        'min_' + col2: grouped.min(),
        # 0.05 is the 5th percentile; 0.5 would merely repeat the median.
        '5th perc_' + col2: grouped.quantile(.05),
        '95th perc_' + col2: grouped.quantile(.95),
    }

    # Cast to float so max/min share a dtype with the other statistics even
    # when col2 holds integers.
    sats_summary = pd.DataFrame(stats).astype(float)
    # Leave the row index unnamed; rows are labelled by the values of col1.
    sats_summary.index.name = None

    return sats_summary

In [3]:
def cat_counts(df, col):
    '''
    Count the occurrences of each value in a categorical column and compute
    each value's share of the total.

    INPUT
    df - Dataframe [obj]
    col - name of the categorical column to count

    OUTPUT
    dff - Dataframe with 3 columns:
          <col>  - the distinct values, in the order value_counts() returns them
          Count  - number of occurrences of each value
          Perc   - Count divided by the sum of all counts (a fraction, not
                   multiplied by 100)

    '''
    # Compute the counts once and reuse them for both output columns.
    counts = df[col].value_counts()

    # Each (value, count) pair becomes one row of the result.
    dff = pd.DataFrame(list(counts.items()), columns=[col, 'Count'])
    dff['Perc'] = dff['Count'] / np.sum(dff['Count'])

    return dff