# Grown Strong - March 2021 Survey Analysis
2021-03-30

## Introduction
 - Talk about the project/task
 - Discuss TwoKai's collaboration
 - Infrastructure and tools used

In [321]:
# Load packages

## General packages
import pandas as pd
import numpy as np
from scipy import stats
import os
import random
import sys

## Data vis packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

## Source functions
sys.path.append("..") # Adds higher directory to python modules path.
from src.helper_functions import extract_email_domain, clean_free_text, clean_fitness_cols

## Params
pd.set_option('display.max_columns', 500)

In [322]:
# Read in data

## Build the path to the survey export
input_dir = '../input/'
df_path   = os.path.join(input_dir, '2021_03_22_grown_strong_survey_results.csv')

## Analysis-friendly column names, applied in file order in place of the CSV's own header row
survey_colnames = [
    'response_id', 'name', 'email_address', 'age', 'gender', 'ethnicity', 'home_location',
    'household_income', 'num_work_week_hrs', 'has_children', 'fitness_level', 'nutrition_level',
    'olympic_lifting_experience', 'fitness_goals0', 'fitness_goals1', 'fitness_goals2', 'fitness_goals3',
    'fitness_goals4', 'fitness_goals5', 'fitness_goals6', 'fitness_goals7', 'fitness_goals8',
    'most_used_gs_program', 'num_gs_sessions_per_week', 'workout_location', 'uses_other_workouts',
    'uses_other_workouts_further_info', 'gs_provision_suggestion', 'has_joined_facebook_group',
    'gs_improvement_suggestion0', 'why_not_joined_facebook_group', 'gs_likely_recommendation',
    'gs_improvement_suggestion1', 'unnamed'
]

## Read in survey data (header = 0 so the supplied names replace the existing header row),
## then keep every column except the last one, which is unneeded
df = pd.read_csv(df_path, header = 0, names = survey_colnames).iloc[:, :-1]

In [323]:
# Clean name

## Lowercase, then strip every run of characters that is neither a word character nor whitespace
df['name'] = (
    df['name']
    .str.lower()
    .str.replace(r'[^\w\s]+', '', regex = True)
)

In [324]:
# Clean email_address

## Normalise case and trim surrounding whitespace
df['email_address'] = (
    df['email_address']
    .str.lower()
    .str.strip()
)

## Derive the email domain as a separate feature using the project helper, trimming the result
df['email_domain'] = (
    df['email_address']
    .apply(extract_email_domain)
    .str.strip()
)

In [325]:
# Clean age

## Remove the literal 'years old' suffix and any surrounding whitespace.
## regex = False is stated explicitly (as in the other cleaning cells) because the
## default of str.replace differs between pandas versions.
df['age'] = (
    df['age']
    .str.replace('years old', '', regex = False)
    .str.strip()
)

In [326]:
# Clean gender

## Lowercase, then abbreviate: 'female' -> 'f' must run before 'male' -> 'm',
## because 'female' contains the substring 'male'
df['gender'] = (
    df['gender']
    .str.lower()
    .str.replace('female', 'f', regex = False)
    .str.replace('male', 'm', regex = False)
)
## Treat 'prefer not to answer' as a missing value
df['gender'] = df['gender'].mask(df['gender'] == 'prefer not to answer', np.nan)

In [327]:
# Clean ethnicity

## Lowercase and spell out '/' as ' or '
df['ethnicity'] = (
    df['ethnicity']
    .str.lower()
    .str.replace('/', ' or ', regex = False)
)
## Treat 'prefer not to answer' as a missing value
df['ethnicity'] = df['ethnicity'].mask(df['ethnicity'] == 'prefer not to answer', np.nan)

In [328]:
# Clean home_location

## Convert to lowercase
df['home_location'] = df['home_location'].str.lower()
## Collapse every answer mentioning 'other' into the single value 'other'.
## na = False makes missing answers count as "no match"; without it the mask contains NaN
## and boolean indexing with .loc raises as soon as one respondent skipped the question.
is_other_location = df['home_location'].str.contains('other', regex = False, na = False)
df.loc[is_other_location, 'home_location'] = 'other'

In [329]:
# Clean household_income

## Lowercase and drop the '$' symbols
df['household_income'] = (
    df['household_income']
    .str.lower()
    .str.replace('$', '', regex = False)
)
## Treat 'prefer not to answer' as a missing value
df['household_income'] = df['household_income'].mask(
    df['household_income'] == 'prefer not to answer', np.nan
)

In [330]:
# Clean num_work_week_hrs

## Lowercase, make ranges consistent ('a to b' -> 'a-b') and drop the 'hours a week' suffix
work_week_hrs = (
    df['num_work_week_hrs']
    .str.lower()
    .str.replace(' to ', '-', regex = False)
    .str.replace('hours a week', '', regex = False)
    .str.strip()
)

## Recode whole answers once the text is normalised:
##  - 'currently not working' becomes '0'; kept as a string, because a non-string value in
##    this column is turned into NaN by any later .str operation
##  - the '0-25' band becomes '1-25' so it does not overlap with the '0' group; this has to be
##    matched in its hyphenated form, as ' to ' has already been replaced above
df['num_work_week_hrs'] = work_week_hrs.replace({
    'currently not working': '0',
    '0-25': '1-25',
})

In [331]:
# Clean has_children

## Recode the 'Yes'/'No' answers as 1/0, then store the column as floats
for answer, flag in (('Yes', 1), ('No', 0)):
    df.loc[df['has_children'] == answer, 'has_children'] = flag
df['has_children'] = df['has_children'].astype('float')

In [332]:
# Clean fitness goals cols

## Get fitness_goals colnames; the last of them is treated as the free text ("other") field
cols_fitness_goals = df.columns[df.columns.str.startswith('fitness_goals')]
col_free_text = cols_fitness_goals[-1]

## Lowercase every goal column and remove the 'other (please specify):' string
for col_current in cols_fitness_goals:
    df[col_current] = (
        df[col_current]
        .str.lower()
        .str.replace('other (please specify):', '', regex = False)
    )

## In the free text field, replace commas with periods so they cannot be confused with the
## ', ' separator used in the concatenated column below.
## (Previously guarded by `idx == len(cols_fitness_goals)`, which is never true inside
## enumerate, so the replacement never ran.)
df[col_free_text] = df[col_free_text].str.replace(',', '.', regex = False)

## Concatenate each respondent's answered goals into one ', '-separated string.
## Missing and empty answers are skipped up front instead of deleting the text 'nan' from the
## result afterwards, which also corrupted free text containing those letters (e.g. 'maintenance'),
## and each column is used exactly once (the first column used to be joined with itself).
goal_values = df[cols_fitness_goals].fillna('').to_numpy()
df['fitness_goals'] = [
    ', '.join(goal for goal in row_goals if goal) for row_goals in goal_values
]

## Clean col names and convert to booleans following concatenation
## NOTE(review): clean_fitness_cols comes from src.helper_functions and is not visible here;
## presumably it derives each goal_* flag from the matching phrase in 'fitness_goals' - confirm
goal_phrase_to_col = [
    ('lose fat', 'goal_lose_fat'),
    ('gain muscle', 'goal_gain_muscle'),
    ('maintain fitness levels', 'goal_maintain_fitness'),
    ('gain weight', 'goal_gain_weight'),
    ('get ready for a competition', 'goal_competition_ready'),
    ('improve crossfit skills', 'goal_improve_crossfit'),
    ('get stronger', 'goal_get_stronger'),
    ('gain more confidence', 'goal_gain_confidence'),
]
for goal_phrase, goal_col in goal_phrase_to_col:
    df = clean_fitness_cols(df, 'fitness_goals', goal_phrase, goal_col)

## The free text column keeps its text and becomes 'goal_other'
df = df.rename(columns={col_free_text: 'goal_other'})

In [333]:
# Clean num_gs_sessions_per_week

## Drop the 'time(s) a week' suffix, write ranges with a hyphen for consistency, and trim whitespace
df['num_gs_sessions_per_week'] = (
    df['num_gs_sessions_per_week']
    .str.replace(r'time.? a week', '', regex = True)
    .str.replace(' to ', '-', regex = False)
    .str.strip()
)

In [334]:
# Clean workout_location

## Steps, in order: spell out '/' as ' or ', lowercase, remove the 'other (please specify):'
## string, then manually fold two free-text answers into the 'inside house' category
df['workout_location'] = (
    df['workout_location']
    .str.replace('/', ' or ', regex = False)
    .str.lower()
    .str.replace('other (please specify):', '', regex = False)
    .str.replace('home gym- inside', 'inside house', regex = False)
    .str.replace('boat', 'inside house', regex = False)
)

In [335]:
# Clean uses_other_workouts

## Recode the 'Yes'/'No' answers as 1/0, then store the column as floats
for answer, flag in (('Yes', 1), ('No', 0)):
    df.loc[df['uses_other_workouts'] == answer, 'uses_other_workouts'] = flag
df['uses_other_workouts'] = df['uses_other_workouts'].astype('float')

In [336]:
# Clean uses_other_workouts_further_info

## Pass the column through the project helper clean_free_text (imported from
## src.helper_functions; its cleaning rules are not shown in this notebook) and
## store the returned values back in the same column
df['uses_other_workouts_further_info'] = clean_free_text(df, 'uses_other_workouts_further_info')

In [337]:
# Clean gs_provision_suggestion

## Pass the column through the project helper clean_free_text (imported from
## src.helper_functions; its cleaning rules are not shown in this notebook) and
## store the returned values back in the same column
df['gs_provision_suggestion'] = clean_free_text(df, 'gs_provision_suggestion')

In [338]:
# Clean has_joined_facebook_group

## Recode 'Yes'/'No' as 1/0 and 'I do not have Facebook' as missing, then store as floats
facebook_answer_codes = (
    ('Yes', 1),
    ('No', 0),
    ('I do not have Facebook', np.nan),
)
for answer, flag in facebook_answer_codes:
    df.loc[df['has_joined_facebook_group'] == answer, 'has_joined_facebook_group'] = flag
df['has_joined_facebook_group'] = df['has_joined_facebook_group'].astype('float')

In [339]:
# Clean gs_improvement_suggestion0

## Pass the column through the project helper clean_free_text (imported from
## src.helper_functions; its cleaning rules are not shown in this notebook) and
## store the returned values back in the same column
df['gs_improvement_suggestion0'] = clean_free_text(df, 'gs_improvement_suggestion0')

In [340]:
# Clean why_not_joined_facebook_group

## Pass the column through the project helper clean_free_text (imported from
## src.helper_functions; its cleaning rules are not shown in this notebook) and
## store the returned values back in the same column
df['why_not_joined_facebook_group'] = clean_free_text(df, 'why_not_joined_facebook_group')

In [341]:
# Clean gs_improvement_suggestion1

## Pass the column through the project helper clean_free_text (imported from
## src.helper_functions; its cleaning rules are not shown in this notebook) and
## store the returned values back in the same column
df['gs_improvement_suggestion1'] = clean_free_text(df, 'gs_improvement_suggestion1')

In [342]:
# Get high-level survey summary stats

## Survey dimensions: a respondent is identified by the (name, email_address) pair
num_respondents_unique = df[['name', 'email_address']].drop_duplicates().shape[0]

## Survey columns only: skip the first column and the last two columns of df
original_cols = df.columns[1:-2]
## NOTE(review): the 9 presumably discounts the extra columns produced by the multi-select
## fitness goals question - confirm against the questionnaire
num_questions = len(original_cols) - 9

## Dataset dimensions
df_dims = df[original_cols].shape
df_num_rows, df_num_cols = df_dims

## Number of duplicated records
num_duplicated_records = df_num_rows - num_respondents_unique

## Survey summary stats table (one unnamed value column, one labelled row per statistic)
survey_dims_summary_stats = pd.DataFrame(
    data = [num_respondents_unique, df_num_rows, num_questions, df_num_cols],
    index = [
        'Number of unique respondents', 'Number of rows in dataset',
        'Number of questions asked in survey', 'Number of columns in dataset'
    ],
    columns = ['']
)

In [343]:
# Get variable summary stats

## Label each survey column as 'Numerical' (any numeric or boolean dtype) or 'Categorical'
## (everything else)
variable_dtypes = df[original_cols].dtypes.map(
    lambda col_dtype: 'Numerical' if pd.api.types.is_numeric_dtype(col_dtype) else 'Categorical'
)
## Get frequencies for each dtype.
## rename_axis/reset_index(name=...) fixes the column names to 'dtype' and 'freq' regardless of
## how the installed pandas version names the output of value_counts
variable_dtypes = variable_dtypes.value_counts().rename_axis('dtype').reset_index(name='freq')

## Look the counts up by label, falling back to 0 when a class does not occur at all
dtype_freqs = variable_dtypes.set_index('dtype')['freq']
variable_dtypes_num_categorical = int(dtype_freqs.get('Categorical', 0))
variable_dtypes_num_numerical   = int(dtype_freqs.get('Numerical', 0))
## Hard-coded: five columns are cleaned as free text with clean_free_text earlier in the notebook
variable_dtypes_num_free_text   = 5

variable_dtypes_chart = px.bar(
    variable_dtypes,
    y='freq', x='dtype', title='Survey Dataset Variable Data Type Frequency',
    labels=dict(freq='Frequency', dtype='Data Type'),
    width=900, height=400
)

In [344]:
# Get redacted head of data to display

## Drop the identifying columns by name. original_cols starts at 'name', so the previous
## positional slice `.iloc[:, 3:]` removed 'age' as well as 'name' and 'email_address'.
df_head = df[original_cols].drop(columns=['name', 'email_address']).head()

In [345]:
# Get variable missingness stats

## Percentage of missing values per survey column (1 decimal place), most-missing first
missingness_df = (
    (df[original_cols].isnull().sum() / len(df) * 100)
    .round(1)
    .rename_axis('variable')
    .reset_index(name='missing_percentage')
    .sort_values('missing_percentage', ascending=False)
)

## Plot missingness
missingness_chart = px.bar(
    missingness_df,
    x='missing_percentage', y='variable', title='Percentage of Missing Values by Variable',
    labels=dict(missing_percentage='Percentage of Missing Values (%)', variable='Variable'),
    width=1000, height=850
)

## Comma-separated names of the variables without any missing values
is_complete = missingness_df['missing_percentage'] == 0
variables_no_missing_vals = ", ".join(missingness_df.loc[is_complete, 'variable'])

## The frame is sorted by descending missingness, so its first row is the worst variable
variable_most_missing_vals = missingness_df['variable'].iloc[0]
variables_most_missing_vals_pct = missingness_df['missing_percentage'].iloc[0]

In [None]:
# age analysis

## Count respondents per age-group. value_counts returns groups in descending order of
## frequency, so after reset_index row label 0 is the most common group and label
## len - 1 the least common. rename_axis/reset_index(name=...) fixes the column names to
## 'age' and 'freq' regardless of how the installed pandas version names the counts.
age_freqs = df['age'].value_counts().rename_axis('age').reset_index(name='freq')
## Order by age-group for plotting; row labels are kept, so the lookups below still work
age_freqs = age_freqs.sort_values('age')

## Get mode age-group (label-based lookup, see above)
age_freqs_mode_age = age_freqs.loc[0, 'age']
age_freqs_mode_age_freq = age_freqs.loc[0, 'freq']

## Get anti-mode age-group
age_freqs_antimode_age = age_freqs.loc[len(age_freqs)-1, 'age']
age_freqs_antimode_age_freq = age_freqs.loc[len(age_freqs)-1, 'freq']

## Plot age histogram
age_freqs_chart = px.bar(
    age_freqs,
    x='age', y='freq', title='Number of Respondents by Age-Group',
    labels=dict(age='Age', freq='Frequency'),
    width=1000, height=500
)

In [None]:
# gender analysis

## Count respondents per gender. value_counts returns groups in descending order of
## frequency, so after reset_index row label 0 is the most common group and label
## len - 1 the least common. rename_axis/reset_index(name=...) fixes the column names to
## 'gender' and 'freq' regardless of how the installed pandas version names the counts.
gender_freqs = df['gender'].value_counts().rename_axis('gender').reset_index(name='freq')
## Order alphabetically for plotting; row labels are kept, so the lookups below still work
gender_freqs = gender_freqs.sort_values('gender')
## Expand the abbreviations used during cleaning into display labels
gender_freqs['gender'] = gender_freqs['gender'].replace({'f': 'Female', 'm': 'Male'})

## Get mode gender-group (label-based lookup, see above)
gender_freqs_mode_gender = gender_freqs.loc[0, 'gender']
gender_freqs_mode_gender_freq = gender_freqs.loc[0, 'freq']

## Get anti-mode gender-group
gender_freqs_antimode_gender = gender_freqs.loc[len(gender_freqs)-1, 'gender']
gender_freqs_antimode_gender_freq = gender_freqs.loc[len(gender_freqs)-1, 'freq']

## Plot gender histogram
gender_freqs_chart = px.bar(
    gender_freqs,
    x='gender', y='freq', title='Number of Respondents by Gender',
    labels=dict(gender='Gender', freq='Frequency'),
    width=1000, height=500
)

In [None]:
# ethnicity analysis

## Count respondents per ethnicity. value_counts returns groups in descending order of
## frequency, so after reset_index row label 0 is the most common group and label
## len - 1 the least common. rename_axis/reset_index(name=...) fixes the column names to
## 'ethnicity' and 'freq' regardless of how the installed pandas version names the counts.
ethnicity_freqs = df['ethnicity'].value_counts().rename_axis('ethnicity').reset_index(name='freq')
## Order by frequency for plotting; row labels are kept, so the lookups below still work
ethnicity_freqs = ethnicity_freqs.sort_values('freq')
## Restore display capitalisation of the lowercased answers
ethnicity_freqs['ethnicity'] = ethnicity_freqs['ethnicity'].replace({
    'african-american': 'African-American',
    'other or unknown': 'Other or Unknown',
    'asian': 'Asian',
    'two or more': 'Two or more',
    'latino or hispanic': 'Latino or Hispanic',
    'caucasian': 'Caucasian',
})

## Get mode ethnicity-group (label-based lookup, see above)
ethnicity_freqs_mode_ethnicity = ethnicity_freqs.loc[0, 'ethnicity']
ethnicity_freqs_mode_ethnicity_freq = ethnicity_freqs.loc[0, 'freq']

## Get anti-mode ethnicity-group
ethnicity_freqs_antimode_ethnicity = ethnicity_freqs.loc[len(ethnicity_freqs)-1, 'ethnicity']
ethnicity_freqs_antimode_ethnicity_freq = ethnicity_freqs.loc[len(ethnicity_freqs)-1, 'freq']

## Plot ethnicity histogram
ethnicity_freqs_chart = px.bar(
    ethnicity_freqs,
    y='ethnicity', x='freq', title='Number of Respondents by Ethnicity',
    labels=dict(ethnicity='Ethnicity', freq='Frequency'),
    width=1000, height=500
)

In [419]:
# home_location analysis

## Count respondents per home location. value_counts returns groups in descending order of
## frequency, so after reset_index row label 0 is the most common group and label
## len - 1 the least common. rename_axis/reset_index(name=...) fixes the column names to
## 'home_location' and 'freq' regardless of how the installed pandas version names the counts.
home_location_freqs = (
    df['home_location']
    .value_counts()
    .rename_axis('home_location')
    .reset_index(name='freq')
)

## Order by frequency for plotting; row labels are kept, so the lookups below still work
home_location_freqs = home_location_freqs.sort_values('freq')
## Restore display capitalisation of the lowercased answers
home_location_freqs['home_location'] = home_location_freqs['home_location'].replace({
    'asia': 'Asia',
    'australia': 'Australia',
    'europe': 'Europe',
    'north america/central america': 'North America/Central America',
    'other': 'Other',
})

## Get mode home_location-group (label-based lookup, see above)
home_location_freqs_mode_home_location = home_location_freqs.loc[0, 'home_location']
home_location_freqs_mode_home_location_freq = home_location_freqs.loc[0, 'freq']

## Get anti-mode home_location-group
home_location_freqs_antimode_home_location = home_location_freqs.loc[len(home_location_freqs)-1, 'home_location']
home_location_freqs_antimode_home_location_freq = home_location_freqs.loc[len(home_location_freqs)-1, 'freq']

## Plot home_location histogram
home_location_freqs_chart = px.bar(
    home_location_freqs,
    y='home_location', x='freq', title='Number of Respondents by Home Location',
    labels=dict(home_location='Home Location', freq='Frequency'),
    width=1000, height=500
)

In [428]:
# household_income analysis

## Count respondents per income group. value_counts returns groups in descending order of
## frequency, so after reset_index row label 0 is the most common group and label
## len - 1 the least common. rename_axis/reset_index(name=...) fixes the column names to
## 'household_income' and 'freq' regardless of how the installed pandas version names the counts.
household_income_freqs = (
    df['household_income']
    .value_counts()
    .rename_axis('household_income')
    .reset_index(name='freq')
)

## Reorder the rows for plotting using hard-coded frequency-rank positions.
## NOTE(review): presumably chosen so the income groups plot in ascending order for this
## dataset, and it assumes exactly five groups - confirm whenever the data changes.
## .copy() makes the reordered frame independent, so the relabelling below is not an
## assignment on a slice of another frame.
household_income_freqs_order = [3, 2, 1, 0, 4]
household_income_freqs = household_income_freqs.iloc[household_income_freqs_order, :].copy()

## Restore display capitalisation of the two open-ended groups
household_income_freqs['household_income'] = household_income_freqs['household_income'].replace({
    'less than 25,000': 'Less than 25,000',
    'more than 200,000': 'More than 200,000',
})

## Get mode household_income-group (label-based lookup, see above)
household_income_freqs_mode_household_income = household_income_freqs.loc[0, 'household_income']
household_income_freqs_mode_household_income_freq = household_income_freqs.loc[0, 'freq']

## Get anti-mode household_income-group
household_income_freqs_antimode_household_income = household_income_freqs.loc[
    len(household_income_freqs)-1, 'household_income'
]
household_income_freqs_antimode_household_income_freq = household_income_freqs.loc[
    len(household_income_freqs)-1, 'freq'
]

## Plot household_income histogram
household_income_freqs_chart = px.bar(
    household_income_freqs,
    x='household_income', y='freq', title='Number of Respondents by Household Income Group',
    labels=dict(household_income='Household Income Group', freq='Frequency'),
    width=1000, height=500
)

In [None]:
# num_work_week_hrs analysis

In [None]:
# has_children analysis

# correlate with num_work_week_hrs to see if working parents

In [None]:
# fitness_level analysis

In [None]:
# nutrition_level analysis

# correlate with fitness_level to see emphasis on nutrition vs fitness

In [None]:
# email_address analysis

In [378]:
# Display the column names of df as they stand at this point in the notebook
df.columns

Index(['response_id', 'name', 'email_address', 'age', 'gender', 'ethnicity',
       'home_location', 'household_income', 'num_work_week_hrs',
       'has_children', 'fitness_level', 'nutrition_level',
       'olympic_lifting_experience', 'goal_lose_fat', 'goal_gain_muscle',
       'goal_maintain_fitness', 'goal_gain_weight', 'goal_competition_ready',
       'goal_improve_crossfit', 'goal_get_stronger', 'goal_gain_confidence',
       'goal_other', 'most_used_gs_program', 'num_gs_sessions_per_week',
       'workout_location', 'uses_other_workouts',
       'uses_other_workouts_further_info', 'gs_provision_suggestion',
       'has_joined_facebook_group', 'gs_improvement_suggestion0',
       'why_not_joined_facebook_group', 'gs_likely_recommendation',
       'gs_improvement_suggestion1', 'email_domain', 'fitness_goals'],
      dtype='object')

## Survey Summary Statistics
This section provides a brief overview of the March 2021 survey, and provides an analysis of the high-level results of the survey. It looks particularly at the performance of the survey in terms of unique respondents, missingness and data quality, as well as summarising the dimensions and variables of the dataset returned from the survey.

The main purpose of the March 2021 survey was to retrospectively collect data on Grown Strong's customers. Particular emphasis was placed on *understanding the customers*, including customer demographics and the customers' own thoughts and opinions on Grown Strong's products.

It is important to note that, while surveys can of course return a rich sample of data, they do not necessarily reflect the true population of customers. If a survey is completely optional and has no significant incentive (e.g. monetary prizes), respondents will often be those that feel strongly in one way or another. This can make the data inherently polarised, with highly satisfied and very dissatisfied customers responding most.

For this reason, TwoKai's recommendation is to use the following analysis of the March 2021 survey as a springboard for further questioning and ideation, and consequently further and more precise data collection (with the intention of getting a larger sample size that will reflect the true population better, with desired variables).

### High-Level Statistics
**Figure 1** shows that there were {{num_respondents_unique}} unique respondents, as well as {{df_num_rows}} records in the data. There were therefore {{num_duplicated_records}} duplicated records (uniqueness is based on the name and email address variables).

In the survey, {{num_questions}} different questions were asked in total. As shown in **Figure 1**, this resulted in {{df_num_cols}} columns in the dataset - due to questions that allowed an "all that apply" selection answer. It is worth noting that these types of questions in future data collection can result in highly dimensional data with numerous columns (which make data cleaning and modelling more complex).

<br>{{survey_dims_summary_stats}}
<center><b><br>Figure 1</b>: Table of summary statistics for the March 2021 <br>Grown Strong customer survey.</center><br>

### Summary of Variables

**Figure 2** shows that, after cleaning the data, there were {{variable_dtypes_num_categorical}} categorical variables returned and {{variable_dtypes_num_numerical}} numerical/boolean variables. Of the {{variable_dtypes_num_categorical}} categorical variables, there were {{variable_dtypes_num_free_text}} optional free text variables that allowed customers to write freely and offer thoughts, opinions and suggestions (as opposed to mandatory free text fields). These variables in particular are analysed in later sections, as they offer some of the most critical insights into Grown Strong's customers.

In [346]:
# Render the variable data type frequency bar chart (Figure 2)
variable_dtypes_chart.show()

<center><b>Figure 2</b>: Frequency of categorical and numerical variables returned from the survey.</center><br>

The first 5 rows of the cleaned survey results dataset, with name and email address variables removed for anonymisation, are displayed in **Figure 3** for manual examination.

<br>{{df_head}}
<center><b><br>Figure 3</b>: First 5 rows of the cleaned survey results dataset.</center><br>

### Variable Missingness

Cleaned variables with no missing values were {{variables_no_missing_vals}}. 

The cleaned variable with the most missing values was {{variable_most_missing_vals}} with {{variables_most_missing_vals_pct}}% of values missing. 

**Figure 4** shows the percentage of missing values for all cleaned variables. The optional free text field variables had the fewest responses.

In [347]:
# Render the missing-values-by-variable bar chart (Figure 4)
missingness_chart.show()

<center><b>Figure 4</b>: Percentage of missing values for each variable arranged in order of ascending missingness percentage.</center><br>

## Cohort Definitions

This section analyses the data and its cleaned variables in greater detail in order to provide a better description of the customers that responded to the survey. Variables are examined and presented in no particular order, however variables with fewer missing values are presented first.

### Age 
Despite reasonably low numbers for the survey, the distribution of ages is apparently normally distributed as would be expected with products that Grown Strong provide. There is a right skew to the distribution, indicating that respondents are more likely to be older than younger - there is quite a large disparity between age-groups 18-24 and 25-34. 

**Figure 5** shows a histogram of age-group. The age-group with the most respondents was {{age_freqs_mode_age}} with {{age_freqs_mode_age_freq}} respondents in total. Conversely, the age-group with the fewest respondents was {{age_freqs_antimode_age}} with {{age_freqs_antimode_age_freq}} respondents in total. 

The clear disparity between age-groups 18-24 and 25-34 is worth noting. It could indicate that most of the respondents in the 25-34 age-group are at the higher end of this bin, as reflected by the right skew of the distribution. Further probing here could be valuable, and if collection of customer age data is pursued in future, it would be useful to collect dates of birth to get a fully granular picture. If most customers in the 25-34 age-group are in fact at the higher end of this bin, it could be due to lower ages being priced out among other reasons. More granular data would be required to explore this hypothesis further.

In [361]:
# Render the respondents-by-age-group bar chart (Figure 5)
age_freqs_chart.show()

<center><b>Figure 5</b>: Number of respondents by age-group, in order of ascending age-group.</center><br>

### Gender
**Figure 6** displays the number of respondents split by gender. {{gender_freqs_mode_gender}} respondents were highest in number by a large margin, with {{gender_freqs_mode_gender_freq}} respondents, while there were only {{gender_freqs_antimode_gender_freq}} {{gender_freqs_antimode_gender}} respondents.

In [396]:
# Render the respondents-by-gender bar chart (Figure 6)
gender_freqs_chart.show()

<center><b>Figure 6</b>: Number of respondents by gender.</center><br>

### Ethnicity
{{ethnicity_freqs_mode_ethnicity}} respondents were most numerous in the survey, with {{ethnicity_freqs_mode_ethnicity_freq}} responses; this was a clear majority when compared with other ethnicity options. For example, there was only {{ethnicity_freqs_antimode_ethnicity_freq}} response from {{ethnicity_freqs_antimode_ethnicity}} respondents. **Figure 7** shows the number of respondents for all ethnicity options used in the survey.

Data on ethnicity is notoriously difficult to collect and analyse due to the sensitivity of the data. Often, customers feel uncomfortable having these data recorded, and navigating the multitude of different options required to include for effective ethnicity data collection can be difficult. Furthermore, using ethnicity data in statistical modelling methods like machine learning is controversial at best, as it opens possibilities for racial biases to be unintentionally captured in algorithms. Customer segmentation based on ethnicity, even if used alongside other variables, can also pose ethical issues in terms of generalisation. Unless Grown Strong have ethical reasons to attempt to target specific ethnicities (for example, using ethnicity to *remove* racial biases from algorithms), TwoKai recommends using these data with extreme caution.

In [410]:
# Render the respondents-by-ethnicity bar chart (Figure 7)
ethnicity_freqs_chart.show()

<center><b>Figure 7</b>: Number of respondents by Ethnicity.</center><br>

### Home Location
The Home Location that respondents selected most was {{home_location_freqs_mode_home_location}} with {{home_location_freqs_mode_home_location_freq}} respondents. This was a clear majority (the Home Location with fewest respondents was {{home_location_freqs_antimode_home_location}} with only {{home_location_freqs_antimode_home_location_freq}} respondent).

Given the number of respondents selecting {{home_location_freqs_mode_home_location}} as their Home Location, it may be prudent to increase granularity for this area in future data collection, by splitting the option into more specific locations. **Figure 8** below shows the number of respondents by all Home Locations.

In [422]:
# Render the respondents-by-home-location bar chart (Figure 8)
home_location_freqs_chart.show()

<center><b>Figure 8</b>: Number of respondents by Home Location.</center><br>

### Household Income

In [None]:
# household_income analysis

## Count respondents per income group. value_counts returns groups in descending order of
## frequency, so after reset_index row label 0 is the most common group and label
## len - 1 the least common. rename_axis/reset_index(name=...) fixes the column names to
## 'household_income' and 'freq' regardless of how the installed pandas version names the counts.
household_income_freqs = (
    df['household_income']
    .value_counts()
    .rename_axis('household_income')
    .reset_index(name='freq')
)

## Reorder the rows for plotting using hard-coded frequency-rank positions.
## NOTE(review): presumably chosen so the income groups plot in ascending order for this
## dataset, and it assumes exactly five groups - confirm whenever the data changes.
## .copy() makes the reordered frame independent, so the relabelling below is not an
## assignment on a slice of another frame.
household_income_freqs_order = [3, 2, 1, 0, 4]
household_income_freqs = household_income_freqs.iloc[household_income_freqs_order, :].copy()

## Restore display capitalisation of the two open-ended groups
household_income_freqs['household_income'] = household_income_freqs['household_income'].replace({
    'less than 25,000': 'Less than 25,000',
    'more than 200,000': 'More than 200,000',
})

## Get mode household_income-group (label-based lookup, see above)
household_income_freqs_mode_household_income = household_income_freqs.loc[0, 'household_income']
household_income_freqs_mode_household_income_freq = household_income_freqs.loc[0, 'freq']

## Get anti-mode household_income-group
household_income_freqs_antimode_household_income = household_income_freqs.loc[len(household_income_freqs)-1, 'household_income']
household_income_freqs_antimode_household_income_freq = household_income_freqs.loc[len(household_income_freqs)-1, 'freq']

## Plot household_income histogram
household_income_freqs_chart = px.bar(
    household_income_freqs,
    x='household_income', y='freq', title='Number of Respondents by Household Income Group',
    labels=dict(household_income='Household Income Group', freq='Frequency'),
    width=1000, height=500
)