# Grown Strong - March 2021 Survey Analysis
2021-03-30

## Introduction
 - Talk about the project/task
 - Discuss TwoKai's collaboration
 - Infrastructure and tools used

In [572]:
# Load packages

## General packages
import pandas as pd
import numpy as np
from scipy import stats
import os
import random

## Data vis packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [573]:
# Read in data
## Build the path to the survey export from the input directory
input_dir = '../input/'
df_path   = os.path.join(input_dir, '2021_03_22_grown_strong_survey_results.csv')

## Read in survey data, replacing the header row (header = 0) with short
## column names, then drop the trailing 'unnamed' column as it is unneeded
df = pd.read_csv(
    df_path,
    header = 0,
    names = [
        'response_id', 'name', 'email_address', 'age', 'gender', 'ethnicity', 'home_location',
        'household_income', 'num_work_week_hrs', 'has_children', 'fitness_level', 'nutrition_level',
        'olympic_lifting_experience', 'fitness_goals0', 'fitness_goals1', 'fitness_goals2', 'fitness_goals3',
        'fitness_goals4', 'fitness_goals5', 'fitness_goals6', 'fitness_goals7', 'fitness_goals8',
        'most_used_gs_program', 'num_gs_sessions_per_week', 'workout_location', 'uses_other_workouts',
        'uses_other_workouts_further_info', 'gs_provision_suggestion', 'has_joined_facebook_group',
        'gs_improvement_suggestion1', 'why_not_joined_facebook_group', 'gs_likely_recommendation',
        'gs_improvement_suggestion2', 'unnamed'
    ]
).iloc[:, :-1]

In [574]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 33 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   response_id                       147 non-null    int64  
 1   name                              147 non-null    object 
 2   email_address                     147 non-null    object 
 3   age                               147 non-null    object 
 4   gender                            146 non-null    object 
 5   ethnicity                         147 non-null    object 
 6   home_location                     147 non-null    object 
 7   household_income                  147 non-null    object 
 8   num_work_week_hrs                 147 non-null    object 
 9   has_children                      147 non-null    object 
 10  fitness_level                     147 non-null    int64  
 11  nutrition_level                   147 non-null    int64  
 12  olympic_

In [575]:
# Clean name

## Lowercase, then strip every run of characters that is neither a word
## character nor whitespace (i.e. punctuation)
df['name'] = (
    df['name']
    .str.lower()
    .str.replace(r'[^\w\s]+', '', regex = True)
)

In [576]:
# Clean email_address

## Convert to lowercase
df['email_address'] = df['email_address'].str.lower()
## Trim leading/trailing whitespace
df['email_address'] = df['email_address'].str.strip()

def extract_email_domain(string):
    """Return the first label of an email address's domain.

    For 'jane@gmail.com' this is 'gmail': the text after the first '@',
    up to (not including) the next '.'.

    Parameters
    ----------
    string : str
        Email address to parse.

    Returns
    -------
    str or float
        The domain label, or NaN when `string` is not a string (e.g. a
        missing value) or contains no '@'. Previously these inputs raised
        AttributeError / IndexError and aborted the whole `.apply`.
    """
    ## Guard against missing or malformed addresses instead of raising
    if not isinstance(string, str) or '@' not in string:
        return np.nan
    domain = string.split('@')[1].split('.')[0]
    return domain

# Derive the email_domain feature by mapping each cleaned address through
# extract_email_domain, then trimming any stray whitespace
df['email_domain'] = df['email_address'].map(extract_email_domain).str.strip()

In [577]:
# Clean age

## Drop the 'years old' suffix
df['age'] = df['age'].str.replace('years old', '')
## Trim the whitespace left behind
df['age'] = df['age'].str.strip()

In [578]:
# Clean gender

## Lowercase, then abbreviate female -> f and male -> m.
## 'female' must be replaced first because 'male' is a substring of it.
df['gender'] = (
    df['gender']
    .str.lower()
    .str.replace('female', 'f', regex = False)
    .str.replace('male', 'm', regex = False)
)
## Treat 'prefer not to answer' as missing
df.loc[df['gender'] == 'prefer not to answer', 'gender'] = np.nan

In [579]:
# Clean ethnicity

## Lowercase and spell out '/' as ' or '
df['ethnicity'] = (
    df['ethnicity']
    .str.lower()
    .str.replace('/', ' or ', regex = False)
)
## Treat 'prefer not to answer' as missing
df.loc[df['ethnicity'] == 'prefer not to answer', 'ethnicity'] = np.nan

In [580]:
# Clean home_location

## Convert to lowercase
df['home_location'] = df['home_location'].str.lower()
## Map all values mentioning 'other' to only 'other'.
## na = False keeps missing values out of the mask (a NaN in a boolean mask
## makes .loc raise); regex = False matches 'other' as a literal substring.
df.loc[
    df['home_location'].str.contains('other', regex = False, na = False),
    'home_location'
] = 'other'

In [581]:
# Clean household_income

## Lowercase and strip the literal '$' symbols
df['household_income'] = (
    df['household_income']
    .str.lower()
    .str.replace('$', '', regex = False)
)
## Treat 'prefer not to answer' as missing
df.loc[df['household_income'] == 'prefer not to answer', 'household_income'] = np.nan

In [582]:
# Clean num_work_week_hrs

## All string normalisation runs first and the exact-value remapping last.
## Previously the column was given an integer 0 before a later .str.replace,
## which turns non-string cells into NaN, and the '0 to 25 hours a week'
## comparison ran after ' to ' had already been rewritten to '-', so it could
## never match.
df['num_work_week_hrs'] = (
    df['num_work_week_hrs']
    ## Convert to lowercase
    .str.lower()
    ## Clean 'to' and '-' symbols so consistent
    .str.replace(' to ', '-', regex = False)
    ## Remove 'hours a week'
    .str.replace('hours a week', '', regex = False)
    .str.strip()
    ## Map 'currently not working' to '0' (kept as a string so the column stays
    ## homogeneous) and relabel the '0-25' band as '1-25'
    .replace({'currently not working': '0', '0-25': '1-25'})
)

In [583]:
# Clean has_children

## Swap the 'Yes' / 'No' answers for booleans, one answer at a time
for raw_answer, bool_value in (('Yes', True), ('No', False)):
    df.loc[df['has_children'] == raw_answer, 'has_children'] = bool_value

In [584]:
# Clean fitness goals cols

## Get the numbered fitness_goals colnames only (fitness_goals0, fitness_goals1, ...).
## Requiring the digit suffix keeps the combined 'fitness_goals' column out of
## the list, so re-running this cell does not fold the previous result back in.
cols_fitness_goals = df.columns[df.columns.str.match(r'fitness_goals\d+$')]

## Clean each individual fitness goals col
for idx, col_current in enumerate(cols_fitness_goals):
    ## Convert to lowercase and remove 'other (please specify):' string
    df[col_current] = df[col_current].str.lower()
    df[col_current] = df[col_current].str.replace('other (please specify):', '', regex = False)

    ## Presumably the last column is the free text field (TODO confirm): replace
    ## its commas with periods so they are not confused with the ', ' separator.
    ## The index of the last column is len - 1; comparing against len never matched.
    if idx == len(cols_fitness_goals) - 1:
        df[col_current] = df[col_current].str.replace(',', '.', regex = False)

## Concatenate the goals of each respondent into one ', '-separated string.
## Only real, non-blank strings are joined, which:
##  - includes each column exactly once (the first goal used to be duplicated),
##  - skips missing values without deleting the letters 'nan' from genuine
##    words such as 'maintenance'.
df['fitness_goals'] = [
    ', '.join(goal.strip() for goal in row if isinstance(goal, str) and goal.strip())
    for row in df[cols_fitness_goals].itertuples(index = False, name = None)
]

In [585]:
# Clean num_gs_sessions_per_week

df['num_gs_sessions_per_week'] = (
    df['num_gs_sessions_per_week']
    ## Remove 'time a week' / 'times a week' string
    .str.replace(r'time.? a week', '', regex = True)
    ## Replace ' to ' with hyphen for consistency
    .str.replace(' to ', '-', regex = False)
    ## Trim leftover whitespace
    .str.strip()
)

In [586]:
# Clean workout_location

df['workout_location'] = (
    df['workout_location']
    ## Map '/' to ' or '
    .str.replace('/', ' or ', regex = False)
    ## Convert to lowercase, then remove 'other (please specify):' string
    .str.lower()
    .str.replace('other (please specify):', '', regex = False)
    ## Manual cleaning of values
    .str.replace('home gym- inside', 'inside house', regex = False)
    .str.replace('boat', 'inside house', regex = False)
)

In [587]:
# Clean uses_other_workouts

## Swap the 'Yes' / 'No' answers for booleans, one answer at a time
for raw_answer, bool_value in (('Yes', True), ('No', False)):
    df.loc[df['uses_other_workouts'] == raw_answer, 'uses_other_workouts'] = bool_value

In [588]:
# Clean uses_other_workouts_further_info

def clean_free_text(data, col):
    """Normalise a free-text survey column and return the cleaned Series.

    Steps: lowercase, expand a fixed set of contractions (straight or curly
    apostrophe), correct the standalone typo 'hitt' to 'hiit', replace
    punctuation with spaces, collapse whitespace (including newlines), trim,
    and turn empty strings into NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column. It is modified in place: `data[col]` is
        overwritten with the cleaned text.
    col : str
        Name of the free-text column to clean.

    Returns
    -------
    pandas.Series
        The cleaned column, `data[col]`.
    """
    ## Contractions to expand, written with a straight apostrophe
    contractions = {
        "i'll": 'i will',
        "i'm": 'i am',
        "it's": 'it is',
        "can't": 'can not',
        "don't": 'do not',
        "isn't": 'is not',
    }

    ## Convert to lowercase
    text = data[col].str.lower()
    ## Unify curly apostrophes so each contraction needs only one rule
    text = text.str.replace('’', "'", regex = False)
    for short_form, long_form in contractions.items():
        text = text.str.replace(short_form, long_form, regex = False)
    ## Fix the 'hitt' typo only as a whole word; a plain substring replace
    ## would also corrupt words such as 'hitting'
    text = text.str.replace(r'\bhitt\b', 'hiit', regex = True)
    ## Replace remaining punctuation with spaces
    text = text.str.replace(r'[^\w\s]+', ' ', regex = True)
    ## Collapse all whitespace runs (newlines included) and trim.
    ## Raw string: '\s' in a normal string literal is an invalid escape.
    text = text.str.replace(r'\s+', ' ', regex = True).str.strip()

    ## Write back (callers may rely on the in-place update) and blank out empties
    data[col] = text
    data.loc[data[col] == '', col] = np.nan
    return(data[col])

## Normalise the free-text answers with clean_free_text (empty strings become NaN)
df['uses_other_workouts_further_info'] = clean_free_text(df, 'uses_other_workouts_further_info')

In [589]:
# Clean gs_provision_suggestion

## Normalise the free-text answers with clean_free_text (empty strings become NaN)
df['gs_provision_suggestion'] = clean_free_text(df, 'gs_provision_suggestion')

In [590]:
# Clean has_joined_facebook_group

## Swap 'Yes' / 'No' for booleans and treat 'I do not have Facebook' as missing,
## one answer at a time
for raw_answer, clean_value in (
    ('Yes', True),
    ('No', False),
    ('I do not have Facebook', np.nan),
):
    df.loc[df['has_joined_facebook_group'] == raw_answer, 'has_joined_facebook_group'] = clean_value

In [593]:
# Clean gs_improvement_suggestion1

## Normalise the free-text answers with clean_free_text (empty strings become NaN)
df['gs_improvement_suggestion1'] = clean_free_text(df, 'gs_improvement_suggestion1')
## Display the distinct cleaned answers for a manual read-through
df['gs_improvement_suggestion1'].unique()

array([nan, 'no i enjoy the girls and everyone s encouragement',
       'i do not ever look at facebook',
       'i love everything no improvements needed',
       'nope i love it one of my favorite things about the group',
       'nothing i can think of i enjoy hearing from other members and gs',
       'i do not think so', 'everything has been great',
       'more technique help', 'no it is great',
       'i love it it is such a positive environment',
       'no i love the support and commerodery of the group',
       'it is great right now i cna t think of any improvement',
       'i think it seems a great community very supportive positive and encouraging',
       'the group is perfect you guys are great at responding and adding feedback',
       'no i should probably reach out to the community more often via the facebook group',
       'i really loved the snatch demo and snatch warm up it really helps that community feeling when you guys do iives and mini workshops like that',
   