In [7]:
import pandas as pd
import numpy as np

### Take as input a dataframe and plot important correlations on a heatmap

In [1]:
# Calculate the pairwise Pearson p-values and return them as a dataframe
def calculate_pvalues(df):
    """Return a square dataframe of Pearson correlation p-values.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to correlate pairwise. Rows where either column of a pair is
        null are dropped for that pair only.

    Returns
    -------
    pandas.DataFrame
        Indexed and labelled by ``df.columns``; each cell holds the p-value
        reported by ``scipy.stats.pearsonr`` rounded to 4 decimals. Cells for
        pairs with fewer than two shared observations are left as NaN.
    """
    from scipy.stats import pearsonr

    # Object dtype so callers can later overwrite individual cells with
    # non-numeric markers without a dtype conflict.
    pvalues = pd.DataFrame(index=df.columns, columns=df.columns, dtype=object)
    for r in df.columns:
        for c in df.columns:
            tmp = df[df[r].notnull() & df[c].notnull()]
            if len(tmp) < 2:
                # pearsonr needs at least two observations; keep the cell NaN.
                continue
            # Single-step .at write: chained `pvalues[r][c] = ...` does not
            # update the frame when pandas Copy-on-Write is active.
            pvalues.at[c, r] = round(pearsonr(tmp[r], tmp[c])[1], 4)
    return pvalues

In [2]:
# Replace values that satisfy a comparison ("<", "=", ">") with the "NaN" marker
def value_to_NaN(df, value, op):
    """Return a copy of ``df`` with matching cells replaced by the string "NaN".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are all convertible to float.
    value : float
        Threshold every cell is compared against.
    op : str
        One of ">", "=" or "<"; a cell is replaced when
        ``float(cell) <op> value`` holds.

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` itself is not modified.

    Raises
    ------
    ValueError
        If ``op`` is not one of the supported operators.
    """
    comparisons = {">": "gt", "=": "eq", "<": "lt"}
    if op not in comparisons:
        raise ValueError(
            "operation {!r} is not supported; use '>', '=' or '<'".format(op)
        )

    # Compare on a float view of the data, but replace in the original values
    # so untouched cells keep whatever representation they had.
    numeric = df.astype(float)
    hits = getattr(numeric, comparisons[op])(value)
    # mask() returns a new frame and upcasts as needed, so the string marker
    # can be placed into numeric columns without mutating the caller's frame.
    return df.mask(hits, "NaN")

In [3]:
# Replace values in a dataframe with "NaN" wherever the p-value frame is marked "NaN"
def filter_with_p (df, p):
    """Return a copy of ``df`` with cells blanked where ``p`` holds "NaN".

    Parameters
    ----------
    df : pandas.DataFrame
        Values to filter (e.g. a correlation matrix).
    p : pandas.DataFrame
        Frame with the same row/column labels as ``df`` in which filtered-out
        cells contain the string "NaN".

    Returns
    -------
    pandas.DataFrame
        New frame where every cell whose counterpart in ``p`` equals the
        string "NaN" is replaced by the string "NaN". ``df`` is not modified.
        Labels of ``df`` that are absent from ``p`` are left untouched.
    """
    # Align p to df by label so each cell is matched on (row, column) rather
    # than relying on both frames being square with identical ordering.
    flagged = p.reindex(index=df.index, columns=df.columns).eq("NaN")
    # mask() builds a new frame and upcasts, so the string marker can be
    # placed into float data without mutating the caller's frame.
    return df.mask(flagged, "NaN")

In [4]:
# Draw the lower triangle of a correlation dataframe as an annotated heatmap
def ut_heatmap_plot(corr_df, y=15, x=10):
    """Show a seaborn heatmap of ``corr_df`` with the upper triangle hidden.

    Parameters
    ----------
    corr_df : pandas.DataFrame
        Square frame whose cells are all convertible to float.
    y, x : number
        Passed to matplotlib as ``figsize=(y, x)``, i.e. ``y`` is the figure
        width and ``x`` the figure height.

    Returns
    -------
    The result of ``plt.show()``.
    """
    plt.figure(figsize=(y, x))
    values = corr_df.astype(float)
    # Non-zero entries of the mask are hidden: upper triangle plus diagonal.
    upper_triangle = np.triu(np.ones_like(values))
    sns.heatmap(values, mask=upper_triangle, annot=True, cmap="YlGnBu")
    return plt.show()

In [None]:
# All-in-one example: correlations filtered by p-value, then plotted
def final_heatmap(df):
    """Plot the Pearson correlations of ``df`` whose p-value is at most 0.05.

    Steps: compute the rounded correlation matrix, mark p-values greater than
    0.05 with "NaN", blank the corresponding correlations, and draw the
    heatmap. Returns whatever ``ut_heatmap_plot`` returns.
    """
    correlations = df.corr(method='pearson', min_periods=1).round(2)
    masked_pvalues = value_to_NaN(calculate_pvalues(df), 0.05, ">")
    significant = filter_with_p(correlations, masked_pvalues)
    return ut_heatmap_plot(significant)

### For a Google Ads dataframe, group by ad group and sum, recomputing the ratio metrics so they are not lost

In [6]:
def add_group_ads(df):
    """Sum ``df`` per 'AdGroup' and recompute the ratio metrics from the totals.

    The summed frame gains (or has overwritten) the columns CTR,
    Cost_per_Conversion, Conversion_rates, Avg_CPC, 'Avg. position'
    (summed position divided by summed 'Counter') and
    cost_per_conversion_value. Infinite results of a division by zero are
    replaced by 0.
    """
    totals = df.groupby(['AdGroup']).sum()
    ratios = {
        "CTR": totals['Clicks'] / totals['Impressions'],
        "Cost_per_Conversion": totals['Cost'] / totals['Conversions'],
        "Conversion_rates": totals['Conversions'] / totals['Clicks'],
        "Avg_CPC": totals['Cost'] / totals['Clicks'],
        # Existing column: overwritten in place, so it keeps its position.
        'Avg. position': totals['Avg. position'] / totals['Counter'],
        "cost_per_conversion_value": totals['Cost'] / totals['Total conv. value'],
    }
    return totals.assign(**ratios).replace([np.inf, -np.inf], 0)

### Given a dataframe, row(s), column(s), title, x-label and figure size, draw a bar plot

In [9]:
def bar_plot(df, row, col, title, xlabel, y=12, x=8):
    """Bar-plot a selection of the per-ad-group summary of ``df``.

    ``df`` is first aggregated with ``add_group_ads``; ``row`` and ``col`` are
    then used as ``.loc[row, col]`` selectors on the result. ``y`` and ``x``
    are passed as ``figsize=(y, x)``. The y-axis label is fixed to "Values".
    Returns None.
    """
    selection = add_group_ads(df).loc[row, col]
    selection.plot(kind="bar", figsize=(y, x))
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Values")
    plt.xticks(rotation=30, horizontalalignment="center")
    return None

# row and col may each be a single label or a list of string labels, because the selection uses .loc

### Confidence interval for correlation

In [None]:
# Find the standard error and the critical z value
def se_z(df, alpha=0.05):
    """Return ``(se, z)`` for a correlation confidence interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Only its number of rows ``n = len(df.index)`` is used.
    alpha : float
        Significance level; the interval covers ``1 - alpha``.

    Returns
    -------
    tuple
        ``se = 1 / sqrt(n - 3)`` and ``z``, the ``1 - alpha/2`` quantile of
        the standard normal distribution. ``se`` is only finite and real for
        ``n > 3``.
    """
    # Local import: the visible file-level imports do not bring `stats` into
    # scope, so the original body raised NameError.
    from scipy import stats

    se = 1/np.sqrt(len(df.index)-3)
    z = stats.norm.ppf(1-alpha/2)
    return se, z
# Lower and upper confidence bounds for one correlation coefficient
def conf_inter(r, se, z):
    """Return the confidence interval of ``r`` formatted as "(lo, hi)".

    ``r`` is mapped with arctanh, widened by ``z * se`` on both sides, mapped
    back with tanh, and both bounds are rounded to 4 decimals.
    """
    centre = np.arctanh(r)
    bounds_z = (centre-z*se, centre+z*se)
    lower, upper = np.tanh(bounds_z)
    lower, upper = round(lower, 4), round(upper, 4)
    return f"({lower!s}, {upper!s})"
# Return a dataframe of confidence intervals for a correlation dataframe
def ci_df(df, se, z):
    """Return a frame shaped like ``df`` holding "(lo, hi)" interval strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Correlation coefficients.
    se, z : float
        Standard error and critical value forwarded to ``conf_inter``.

    Returns
    -------
    pandas.DataFrame
        Object-dtype copy of ``df`` in which every non-null cell is replaced
        by the string produced by ``conf_inter``; null cells stay null.
        ``df`` is not modified.
    """
    # Object dtype so interval strings can be stored; astype also gives us a
    # separate frame to write into.
    ci = df.astype(object)
    for col in df.columns:
        for row in df.index:
            r_value = df.at[row, col]
            if pd.notnull(r_value):
                # Single-step .at write: chained `ci[col][row] = ...` does not
                # update the frame when pandas Copy-on-Write is active.
                ci.at[row, col] = conf_inter(r_value, se, z)
    return ci

#-----------final example all in one---------------
def corr_ci_df(df, alpha=0.05, iloc_col_range=slice(None)):
    """Return confidence intervals for the correlations of ``df`` as a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; ``df.corr()`` is computed internally.
    alpha : float
        Significance level forwarded to ``se_z``.
    iloc_col_range : int, slice or list of int
        Positional column selector applied with ``.iloc[:, iloc_col_range]``.
        Defaults to all columns. (A default is required here: a parameter
        without one may not follow ``alpha=0.05``.)

    Returns
    -------
    pandas.DataFrame
        The selected columns of the interval table; a single-column selection
        is converted from a Series to a one-column frame.
    """
    se, z = se_z(df, alpha)

    ci_table = ci_df(df.corr(), se, z)

    selected = ci_table.iloc[:, iloc_col_range]
    # An integer selector yields a Series; slices and lists already yield a
    # DataFrame, which has no to_frame().
    if isinstance(selected, pd.Series):
        selected = selected.to_frame()
    return selected

### Group columns in new dataframe based on str value and return the new dataframe plus the number of columns added

In [10]:
def survey_grouping(df,col_arg):
    """Collect the columns of ``df`` whose name contains ``col_arg``.

    Returns a tuple ``(new_df, k)``: a new dataframe built from the matching
    columns in their original order, and the number of columns copied.
    """
    matching = [name for name in df.columns if col_arg in name]
    new_df = pd.DataFrame()
    for name in matching:
        new_df[name] = df[name]
    return new_df, len(matching)

### Add columns in dataframe (dataframe, column_values, desired_column_name) and returns dataframe 

In [11]:
def add_col(df, columns, col_names):
    """Assign each item of ``columns`` to ``df`` under the matching name.

    The i-th element of ``columns`` is stored as ``df[col_names[i]]``. The
    frame is modified in place and the same object is returned.
    """
    for position, values in enumerate(columns):
        target = col_names[position]
        df[target] = values
    return df

### Find IQR in dataframe

In [12]:
def find_iqr(df, x=None):
    """Return the interquartile range (75th minus 25th percentile) of each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to summarise.
    x : optional
        Not used by the computation; kept so existing two-argument calls
        still work.
        NOTE(review): the original body used ``x`` where a per-column lambda
        argument would be expected - confirm no caller relies on it.

    Returns
    -------
    pandas.Series
        One IQR value per column of ``df``.
    """
    # apply() needs a callable evaluated per column; the original passed an
    # already-computed scalar to a misspelled `aply`.
    return df.apply(lambda column: np.subtract(*np.percentile(column, [75, 25])))