# This notebook prepares data for modeling

## 1. Load packages & helper functions

In [221]:
import os
import pandas as pd
import numpy as np
import math
from datetime import datetime

def get_payroll_file_name(year, month):
    file_name = f"{month_names[month]}{year}"
    file_path = f"../data/raw/{file_name}.xlsx"
    return file_path


def load_file_excel(file_path, sheet_name=0):
    if not os.path.exists(file_path):
        return None
    
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    return df

month_names = {
    1:"Jan",
    2:"Feb",
    3:"March",
    4:"April",
    5:"May",
    6:"June",
    7:"July",
    8:"Aug",
    9:"Sept",
    10:"Oct",
    11:"Nov",
    12:"Dec",
}

## 2. Clean & merge payroll data

In [263]:
 # Keep only needed columns
def clean_payroll_data(df, year, month):
    df = df[["EECode", "EarnHours", "EarnRate",	"EarnAmount","Dist Position Desc"	]]
    df.is_copy = None
    df["Year"] = year
    df["Month"] = month
    return df


def merge_payroll_data():
    result = None
    for year in [2020, 2021, 2022]:
        for month in range(1, 13):
            name = get_payroll_file_name(year, month)
            df = load_file_excel(name)
            if df is not None:
                df = clean_payroll_data(df, year, month)
                if result is None:
                    result = df
                else:
                    result = pd.concat([result, df])
    return result

merged = merge_payroll_data()
merged.to_csv("../data/raw/merged_payroll.csv", index=False)
merged

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Month"] = month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the do

Unnamed: 0,EECode,EarnHours,EarnRate,EarnAmount,Dist Position Desc,Year,Month
0,938,12.19,8.56,104.35,WILD Steward,2020,12
1,938,14.69,8.80,129.28,WILD Steward,2020,12
2,939,32.73,8.56,280.17,WILD Steward,2020,12
3,939,24.00,8.80,211.20,WILD Steward,2020,12
4,1160,33.90,8.56,290.18,WILD Steward,2020,12
...,...,...,...,...,...,...,...
15,1483,55.83,13.00,725.79,WILD Steward,2022,12
16,1484,41.18,13.00,535.34,WILD Steward,2022,12
17,1423,98.43,23.89,2351.52,Adult Education Specialist,2022,12
18,1485,64.63,13.00,840.19,WILD Steward,2022,12


## 3. Merge applicant data

In [264]:
df_list = load_file_excel(f"../data/raw/WILD Applicant Data (2016-2022).xlsx", sheet_name=[-1,-2,-3])
applicant_2022 = df_list[-1]
applicant_2021 = df_list[-2]
applicant_2020 = df_list[-3]

applicant_2022 = applicant_2022.rename(columns = {
'Date of Birth (MM/DD/YYYY)':'Date of Birth',
'Current School (2021-2022)':'School',
'School for the 2022-2023 school year': 'Next Year School',
applicant_2022.columns[6]:'Zipcode',
applicant_2022.columns[7]:'Program',
applicant_2022.columns[8]:'First Paying Job',
applicant_2022.columns[9]:'Know anyone who has been employeed',
applicant_2022.columns[10]:'Applied Before',
applicant_2022.columns[11]:'Speaking in front of a crowd rate',
applicant_2022.columns[12]:'Unconfortable to work with',
applicant_2022.columns[13]:'Extra activities',
applicant_2022.columns[14]:'Commit Summer',
applicant_2022.columns[16]:'Commit Weekday',
applicant_2022.columns[18]:'Hear about source'
})

applicant_2022 = applicant_2022[['ID Number', 'Preferred Pronouns', 'Date of Birth', 'School',
       'Current Grade Level', 'Next Year School', 'Zipcode', 'Program',
       'First Paying Job', 'Know anyone who has been employeed',
       'Applied Before', 'Speaking in front of a crowd rate',
       'Unconfortable to work with', 'Extra activities', 'Commit Summer','Commit Weekday',
       'Hear about source']]

applicant_2022['Application Year'] = 2022


applicant_2021 = applicant_2021.rename(columns = {
'What is your preferred gender pronoun?':'Preferred Pronouns',
'Date of Birth (ie: 01/01/1999)':'Date of Birth',
'Current Grade level (i.e. Freshman, Sophomore, Junior, Senior)':'Current Grade Level',
'Current School (2021-2022)':'School',
'How did you hear about us?':'Hear about source',
applicant_2021.columns[6]: 'Hear about source other',
'School for the 2022-2023 school year': 'Next Year School',
'What Zip Code do you live in?':'Zipcode',
applicant_2021.columns[10]:'Program',
applicant_2021.columns[11]:'First Paying Job',
applicant_2021.columns[13]:'Know anyone who has been employeed',
applicant_2021.columns[14]:'Applied Before',
applicant_2021.columns[16]:'Unconfortable to work with',
applicant_2021.columns[17]:'Commit Summer',
applicant_2021.columns[19]:'Commit Weekday',
applicant_2021.columns[21]:'Speaking in front of a crowd rate',

})

# get from hear about source other
applicant_2021['Hear about source'] = np.where(applicant_2021['Hear about source'] == 'Other', applicant_2021['Hear about source other'], applicant_2021['Hear about source'])

applicant_2021 = applicant_2021[['ID Number', 'Preferred Pronouns', 'Date of Birth','Current Age', 'School',
       'Current Grade Level',  'Zipcode', 'Program',
       'First Paying Job', 'Know anyone who has been employeed',
       'Applied Before', 'Speaking in front of a crowd rate',
       'Unconfortable to work with',  'Commit Summer',
    'Commit Weekday',
       'Hear about source']]

applicant_2021['Next Year School'] = None
applicant_2021['Extra activities'] = None
applicant_2021['Application Year'] = 2021



applicant_2020 = applicant_2020.rename(columns = {
    'What is your preferred gender pronoun?':'Preferred Pronouns',
    applicant_2020.columns[2]:'Preferred Pronouns Alter 1',
    applicant_2020.columns[3]:'Preferred Pronouns Alter 2',
    'Date of Birth (ie: 01/01/1999)':'Date of Birth',
    'How did you hear about us?':'Hear about source',
    applicant_2020.columns[9]: 'Hear about source other',
    applicant_2020.columns[10]: 'Hear about source other_1',
    applicant_2020.columns[11]: 'Hear about source other_2',
    applicant_2020.columns[12]: 'Hear about source other_3',
    applicant_2020.columns[13]: 'Hear about source other_4',
    applicant_2020.columns[14]: 'Hear about source other_5',
    applicant_2020.columns[15]: 'Hear about source other_6',
    applicant_2020.columns[16]: 'Current Grade Level',
    applicant_2020.columns[17]: 'Zipcode',
    applicant_2020.columns[20]:  'Program',
    applicant_2020.columns[21]: 'First Paying Job',
    applicant_2020.columns[24]:  'Know anyone who has been employeed',
    applicant_2020.columns[26]: 'Applied Before',
    applicant_2020.columns[29]: 'Unconfortable to work with',
    applicant_2020.columns[30]: 'Unconfortable to work with 1',
    applicant_2020.columns[31]: 'Unconfortable to work with 2',
    applicant_2020.columns[32]: 'Unconfortable to work with 3',
    applicant_2020.columns[33]: 'Unconfortable to work with 4',
    applicant_2020.columns[34]:'Commit Summer',
    applicant_2020.columns[38]: 'Commit Weekday',
    applicant_2020.columns[42]: 'Speaking in front of a crowd rate',
})


# todo, get the data from the alter sources
applicant_2020['Preferred Pronouns'] = np.where(pd.isnull(applicant_2020['Preferred Pronouns']), applicant_2020['Preferred Pronouns Alter 1'], applicant_2020['Preferred Pronouns'])
applicant_2020['Preferred Pronouns'] = np.where(pd.isnull(applicant_2020['Preferred Pronouns']), applicant_2020['Preferred Pronouns Alter 2'], applicant_2020['Preferred Pronouns'])


applicant_2020['Hear about source'] = np.where(pd.isnull(applicant_2020['Hear about source']), applicant_2020['Hear about source other'], applicant_2020['Hear about source'])
applicant_2020['Hear about source'] = np.where(pd.isnull(applicant_2020['Hear about source']), applicant_2020['Hear about source other_1'], applicant_2020['Hear about source'])
applicant_2020['Hear about source'] = np.where(pd.isnull(applicant_2020['Hear about source']), applicant_2020['Hear about source other_2'], applicant_2020['Hear about source'])
applicant_2020['Hear about source'] = np.where(pd.isnull(applicant_2020['Hear about source']), applicant_2020['Hear about source other_3'], applicant_2020['Hear about source'])
applicant_2020['Hear about source'] = np.where(pd.isnull(applicant_2020['Hear about source']), applicant_2020['Hear about source other_4'], applicant_2020['Hear about source'])
applicant_2020['Hear about source'] = np.where(pd.isnull(applicant_2020['Hear about source']), applicant_2020['Hear about source other_5'], applicant_2020['Hear about source'])
applicant_2020['Hear about source'] = np.where(pd.isnull(applicant_2020['Hear about source']), applicant_2020['Hear about source other_6'], applicant_2020['Hear about source'])

applicant_2020['Unconfortable to work with'] = np.where(pd.isnull(applicant_2020['Unconfortable to work with']), applicant_2020['Unconfortable to work with 1'], applicant_2020['Unconfortable to work with'])
applicant_2020['Unconfortable to work with'] = np.where(pd.isnull(applicant_2020['Unconfortable to work with']), applicant_2020['Unconfortable to work with 2'], applicant_2020['Unconfortable to work with'])
applicant_2020['Unconfortable to work with'] = np.where(pd.isnull(applicant_2020['Unconfortable to work with']), applicant_2020['Unconfortable to work with 3'], applicant_2020['Unconfortable to work with'])
applicant_2020['Unconfortable to work with'] = np.where(pd.isnull(applicant_2020['Unconfortable to work with']), applicant_2020['Unconfortable to work with 4'], applicant_2020['Unconfortable to work with'])


applicant_2020 = applicant_2020[['ID Number', 'Preferred Pronouns', 'Date of Birth','Current Age', 'School',
       'Current Grade Level',  'Zipcode', 'Program',
       'First Paying Job', 'Know anyone who has been employeed',
       'Applied Before', 'Speaking in front of a crowd rate',
       'Unconfortable to work with',  'Commit Summer',
    'Commit Weekday',
       'Hear about source']]

applicant_2020['Next Year School'] = None
applicant_2020['Extra activities'] = None
applicant_2020['Application Year'] = 2020

def is_float(v):
       try:
              f = float(v)
              return True
       except Exception as ex:
              print(ex)
              return False

# merge files
applicant_data = pd.concat([applicant_2022, applicant_2021,applicant_2020])

# drop erroneous row
applicant_data = applicant_data[applicant_data['ID Number'] != '\xa0'].reset_index(drop = True)

# flag rejected candidates
applicant_data['Rejected'] = applicant_data['ID Number'].apply(lambda x: math.isnan(x))

# convert existing IDs to int
applicant_data.loc[applicant_data['Rejected'] == False, 'ID Number'] = applicant_data.loc[applicant_data['Rejected'] == False, 'ID Number'].astype(int)

#save
applicant_data.to_csv("../data/raw/applicants.csv", index=False)
applicant_data


Unnamed: 0,ID Number,Preferred Pronouns,Date of Birth,School,Current Grade Level,Next Year School,Zipcode,Program,First Paying Job,Know anyone who has been employeed,Applied Before,Speaking in front of a crowd rate,Unconfortable to work with,Extra activities,Commit Summer,Commit Weekday,Hear about source,Application Year,Current Age,Rejected
0,1489,He/Him/His (masculine pronouns),2006-12-13 00:00:00,First Coast High School,9th,First Coast High School,32218,Zoocamp,Yes,,,Very comfortable! I can confidently speak to a...,"Invertebrates (roaches, spiders, millipedes)",Nothing at this time,Yes,Yes,Jacksonville Zoo & Gardens website,2022,,False
1,,He/Him/His (masculine pronouns),2004-11-15 00:00:00,Jean Ribault High School,11th,Jean Ribault High School,,No,No,,No,"I can do it. I mean, I would rather speak to s...","Invertebrates (roaches, spiders, millipedes)",,Not Sure,Yes,Jamie Lankenau,2022,,True
2,1488,He/Him/His (masculine pronouns),2004-11-15 00:00:00,Jean Ribault High School,11th,Jean Ribault High School,,No,No,,,"I can do it. I mean, I would rather speak to s...","Invertebrates (roaches, spiders, millipedes)","I run track at Jean Ribault High School, I als...",Yes,Yes,Friend,2022,,False
3,,He/Him/His (masculine pronouns),2006-09-25 00:00:00,Sandalwood senior high school,9,Sandalwood senior high school,32206,No,No,,No,"I can do it. I mean, I would rather speak to s...",I am open to working with any of these animals,"Yes, Football wrestling and lacrosse",Yes,Yes,Vendor,2022,,True
4,,He/Him/His (masculine pronouns),2006-06-22 00:00:00,Andrew Jackson senior high school,10th,Andrew Jackson senior high school,32206,No,No,,,"I can do it, but I don't like it. It's tough f...",I am open to working with any of these animals,Football and soccer,Not Sure,Not Sure,Park vender,2022,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,,She/Her/Hers,2003-08-13 00:00:00,William m raines,11 junior,,32206,,,Yes,,,"Reptiles (snakes, lizards, turtles)",,Yes,Yes,School Career Fair,2020,16.0,True
314,,He/Him/His,2004-08-07 00:00:00,Edward H. White Senior High School,Sophomore,,32218,,Yes,,,Very comfortable! I can confidently speak to a...,I am open to working with any of these animals.,,Yes,Yes,Jacksonville Zoo and Gardens Website,2020,15.0,True
315,,She/Her/Hers,2004-08-05 00:00:00,Andrew Jackson High School,Sophmore,,32218,,Yes,,,Very comfortable! I can confidently speak to a...,"Invertebrates (roaches, spiders, millipedes)",,,Yes,School Career Fair,2020,15.0,True
316,,She/Her/Hers,2004-07-11 00:00:00,Andrew Jackson High School,sophomore,,32218,,Yes,Yes,,,"Invertebrates (roaches, spiders, millipedes)",,Yes,Yes,School Career Fair,2020,15.0,True


## Clean fields in merged applicant data

For each column in this section we:
1) check the unique values in the free response columns

2) map the common free responses to a standardized set of values

In [265]:
applicant_data = pd.read_csv("../data/raw/applicants.csv")

In [266]:
applicant_data['Preferred Pronouns'].unique()

array(['He/Him/His (masculine pronouns)',
       'She/Her/Hers (feminine pronouns)',
       'They/Them/Theirs (neutral pronouns)', 'She/They', nan,
       'Alll pronouns', 'She/Her/Hers', 'He/Him/His', 'They/Them/Their'],
      dtype=object)

In [267]:
applicant_data.loc[applicant_data['Preferred Pronouns'].str.contains(
    '(?i)masculine|him')==True, 'Preferred Pronouns'] = 'masculine'

applicant_data.loc[applicant_data['Preferred Pronouns'].str.contains(
    '(?i)feminine|she')==True, 'Preferred Pronouns'] = 'feminine'

applicant_data.loc[applicant_data['Preferred Pronouns'].str.contains(
    '(?i)neutral|they')==True, 'Preferred Pronouns'] = 'neutral'

applicant_data.loc[applicant_data['Preferred Pronouns'].str.contains(
    '(?i)masculine|feminine|neutral')==False, 'Preferred Pronouns'] = np.nan

applicant_data['Preferred Pronouns'].unique()

array(['masculine', 'feminine', 'neutral', nan], dtype=object)

In [268]:
applicant_data['Current Grade Level'].unique()

array(['9th', '11th', '9', '10th', '10', '11', '9th ', '8th', '8',
       '11th Grade', '12th', '10th ', '8th ', '12', '9th grade',
       '9th Grade', '12th ', 'Freshman', 'Sophomore', '8th Grade',
       'freshman', 'Sophomore ', 'I’m in 8th going into 9th/freshman ',
       '10th sophomore ', 'Grad ', 'Freshman ', 'Junior', 'senior ',
       'Juniod', 'Freshmen', nan, 'sophomore', 'UW', 'Freshman (9th)',
       'Senior', '8th grade', '10th grade sophomore ', 'Junior ',
       'Freshman/9th', 'junior', 'freshmen', 'Sophmore ',
       '10th(Sophomore)', '9th Freshman', 'sophomore ', '11tj ',
       '8th grade ', 'sophmore', 'Senior ', 'Sophmore', 'FreshmaN',
       'junior ', 'Sophomore (Early College)', 'Eighth', '11 junior'],
      dtype=object)

In [269]:
applicant_data.loc[applicant_data['Current Grade Level'].str.contains(
    '(?i)8|eigh')==True, 'Current Grade Level'] = '8'

applicant_data.loc[applicant_data['Current Grade Level'].str.contains(
    '(?i)9|fresh')==True, 'Current Grade Level'] = '9'

applicant_data.loc[applicant_data['Current Grade Level'].str.contains(
    '(?i)10|soph')==True, 'Current Grade Level'] = '10'

applicant_data.loc[applicant_data['Current Grade Level'].str.contains(
    '(?i)11|jun')==True, 'Current Grade Level'] = '11'

applicant_data.loc[applicant_data['Current Grade Level'].str.contains(
    '(?i)12|sen')==True, 'Current Grade Level'] = '12'

applicant_data.loc[applicant_data['Current Grade Level'].str.contains(
    '(?i)8|9|10|11|12')==False, 'Current Grade Level'] = np.nan

applicant_data['Current Grade Level'].unique()

array(['9', '11', '10', '8', '12', nan], dtype=object)

In [270]:
applicant_data['Program'].unique()

array(['Zoocamp', 'No', 'Other', 'Home School', nan, 'Zoo Camp', 'school',
       'Parent Academy', 'Homeschool ', 'If so, which one(s)?',
       'Yes I attended zoo camp back in 6th grade.'], dtype=object)

In [271]:
applicant_data.loc[applicant_data['Program'].str.contains(
    '(?i)camp')==True, 'Program'] = 'zoocamp'

applicant_data.loc[applicant_data['Program'].str.contains(
    '(?i)home')==True, 'Program'] = 'homeschool'

applicant_data.loc[applicant_data['Program'].str.contains(
    '(?i)home|camp')==False, 'Program'] = 'other'

applicant_data['Program'].unique()

array(['zoocamp', 'other', 'homeschool', nan], dtype=object)

In [272]:
applicant_data['Know anyone who has been employeed'].unique()

array([nan, 'Yes', 'N/a', 'yes', 'no', 'No'], dtype=object)

In [273]:
applicant_data.loc[applicant_data['Know anyone who has been employeed'].str.contains(
    '(?i)yes')==True, 'Know anyone who has been employeed'] = 'yes'

applicant_data.loc[applicant_data['Know anyone who has been employeed'].str.contains(
    '(?i)yes')==False, 'Know anyone who has been employeed'] = 'no'

applicant_data['Know anyone who has been employeed'].unique()

array([nan, 'yes', 'no'], dtype=object)

In [274]:
applicant_data['Applied Before'].unique()

array([nan, 'No', 'Im currently a ZooTeen! Also applied to WILD in 2021.',
       'The W.I.L.D. Program in September 2021', 'no',
       'Yes. I applied for the W.I.L.D program earlier this year.', 'no ',
       'No.', 'Yes, W.I.L.D. Program earlier in the year of 2021', 'No ',
       'I have not', 'w.i.l.d program ',
       'No, but I wish I had! This seems like an amazing opportunity, and I hope to have the chance to learn more. ',
       'yes ', 'N/a', 'W.I.L.D program  Last year ',
       'Yes, but i changed my number so i resubmit my application with a new number. it was for this exact progra. a few days ago',
       'Yes I applied to the W.I.L.D program last year (2021).',
       'Neither', 'Yes', ' - Yes'], dtype=object)

In [275]:
applicant_data.loc[applicant_data['Applied Before'].str.contains(
    '(?i)yes|last|applied|program')==True, 'Applied Before'] = 'yes'

applicant_data.loc[applicant_data['Applied Before'].str.contains(
    '(?i)no')==True, 'Applied Before'] = 'no'

applicant_data.loc[applicant_data['Applied Before'].str.contains(
    '(?i)yes|no')==False, 'Applied Before'] = 'no'

applicant_data['Applied Before'].unique()

array([nan, 'no', 'yes'], dtype=object)

In [276]:
applicant_data['Speaking in front of a crowd rate'].unique()

array(['Very comfortable! I can confidently speak to and interact with any large group.',
       'I can do it. I mean, I would rather speak to smaller groups instead of a large group.',
       "I can do it, but I don't like it. It's tough for me and I will need extra time to prepare.",
       'I avoid it at all costs.',
       'I can do it.  I mean, I would rather speak to multiple smaller groups instead of one large group.',
       "I do it, but don't like it. I mean, it's tough and I will need extra time to prepare.",
       "I avoid it. Yea, that just ain't my thing.", nan], dtype=object)

In [277]:
applicant_data.loc[applicant_data['Speaking in front of a crowd rate'].str.contains(
    '(?i)confidently|can')==True, 'Speaking in front of a crowd rate'] = 'comfortable'

applicant_data.loc[applicant_data['Speaking in front of a crowd rate'].str.contains(
    '(?i)tough|avoid')==True, 'Speaking in front of a crowd rate'] = 'uncomfortable'

applicant_data['Speaking in front of a crowd rate'].unique()

array(['comfortable', 'uncomfortable', nan], dtype=object)

In [278]:
applicant_data['Unconfortable to work with'].unique()

array(['Invertebrates (roaches, spiders, millipedes)',
       'I am open to working with any of these animals',
       'Reptiles (snakes, lizards, turtles)',
       'Mammals (rodents, armadillos, skunks)',
       'Birds (parrots, owls, doves)',
       'I am open to working with any of these animals.',
       'Birds (owls, doves, hawks)', nan,
       ' - Reptiles (snakes, lizards, turtles)'], dtype=object)

In [279]:
applicant_data.loc[applicant_data['Unconfortable to work with'].str.contains(
    '(?i)invertebrates')==True, 'Unconfortable to work with'] = 'invertebrates'

applicant_data.loc[applicant_data['Unconfortable to work with'].str.contains(
    '(?i)reptiles')==True, 'Unconfortable to work with'] = 'reptiles'

applicant_data.loc[applicant_data['Unconfortable to work with'].str.contains(
    '(?i)mammals')==True, 'Unconfortable to work with'] = 'mammals'

applicant_data.loc[applicant_data['Unconfortable to work with'].str.contains(
    '(?i)birds')==True, 'Unconfortable to work with'] = 'birds'

applicant_data.loc[applicant_data['Unconfortable to work with'].str.contains(
    '(?i)any')==True, 'Unconfortable to work with'] = 'none'

applicant_data['Unconfortable to work with'].unique()

array(['invertebrates', 'none', 'reptiles', 'mammals', 'birds', nan],
      dtype=object)

In [280]:
applicant_data['Extra activities'].unique()

array(['Nothing at this time', nan,
       'I run track at Jean Ribault High School, I also played soccer for them as well this past season as well. I also ran cross country(xc). So pretty much sports.',
       'Yes, Football wrestling and lacrosse ', 'Football and soccer',
       'The debate society, Black Student Union, Football, Wrestling, and Track',
       'I am involved in the Cross Country team at my school, although it is out of season right now. I also am a part of the Chinese, History, and National Honor societies at my school and I am a member of the Ethics Bowl and Mock Trial teams. Throughout the school year, I have been volunteering with the Communities Without Borders program with which I go to a local elementary school and work with the Fifth graders to create a social justice related project of their choosing which they will share with other Elementary schools.',
       'Girl Scouts', 'No', 'Volleyball, Girl Scouts', 'ZooTeens!',
       'I play soccer outside of school

In [281]:
applicant_data.loc[applicant_data['Extra activities'].str.contains(
    '(?i)nothing|none|no|N/a')==True, 'Extra activities'] = 'none'

applicant_data.loc[applicant_data['Extra activities'].str.contains(
    '(?i)none')==False, 'Extra activities'] = 'some'

applicant_data['Extra activities'].unique()

array(['none', nan, 'some'], dtype=object)

In [282]:
applicant_data['Commit Summer'].unique()

array(['Yes', 'Not Sure', nan, ' - Yes'], dtype=object)

In [283]:
applicant_data.loc[applicant_data['Commit Summer'].str.contains(
    '(?i)yes')==True, 'Commit Summer'] = 'yes'

applicant_data.loc[applicant_data['Commit Summer'].str.contains(
    '(?i)not')==True, 'Commit Summer'] = 'unsure'

applicant_data.loc[applicant_data['Commit Summer'].str.contains(
    '(?i)no')==True, 'Commit Summer'] = 'no'

applicant_data['Commit Summer'].unique()

array(['yes', 'unsure', nan], dtype=object)

In [284]:
applicant_data['Commit Weekday'].unique()

array(['Yes', 'Not Sure', 'No', ' - Yes', nan], dtype=object)

In [285]:
applicant_data.loc[applicant_data['Commit Weekday'].str.contains(
    '(?i)yes')==True, 'Commit Weekday'] = 'yes'

applicant_data.loc[applicant_data['Commit Weekday'].str.contains(
    '(?i)not')==True, 'Commit Weekday'] = 'unsure'

applicant_data.loc[applicant_data['Commit Weekday'].str.contains(
    '(?i)no')==True, 'Commit Weekday'] = 'no'

applicant_data['Commit Weekday'].unique()

array(['yes', 'unsure', 'no', nan], dtype=object)

In [286]:
applicant_data['Hear about source'].unique()

array(['Jacksonville Zoo & Gardens website', 'Jamie Lankenau', 'Friend',
       'Vendor', 'Park vender ', 'School Career Fair',
       'Volunteer at Jacksonville Zoo & Gardens',
       'Social Media (Facebook, Snapchat, Twitter, etc.)',
       'Teacher or Counselor',
       'Night Hike Tour Guide Spoke To Me About Program ', 'Sister',
       'A Friend', 'Teacher/Counselor',
       'Social Media: Facebook, SnapChat, Instagram,Twitter',
       'Jacksonville Zoo and Gardens Website', 'Poster in Hall',
       'School Advertisement ', 'Paint and Hiss event', 'Godfather',
       'Church ', 'My mom', 'Job Board', nan, 'Other', 'FSCJ Career Fair'],
      dtype=object)

In [287]:
applicant_data.loc[applicant_data['Hear about source'].str.contains(
    '(?i)parent|sister|mom|dad')==True, 'Hear about source'] = 'family'

applicant_data.loc[applicant_data['Hear about source'].str.contains(
    '(?i)friend')==True, 'Hear about source'] = 'friend'

applicant_data.loc[applicant_data['Hear about source'].str.contains(
    '(?i)social')==True, 'Hear about source'] = 'social media'

applicant_data.loc[applicant_data['Hear about source'].str.contains(
    '(?i)school|teacher')==True, 'Hear about source'] = 'school'

applicant_data.loc[applicant_data['Hear about source'].str.contains(
    '(?i)family|friend|social|school')==False, 'Hear about source'] = 'other'

applicant_data['Hear about source'].unique()

array(['other', 'friend', 'school', 'social media', 'family', nan],
      dtype=object)

In [288]:
applicant_data['Application Year'].unique()

array([2022, 2021, 2020], dtype=int64)

In [289]:
applicant_data['Current Age'].unique()

array([nan, 14., 15., 16., 13., 17., 18., 24., 19., 12.])

In [290]:
#drop cols we won't be able to use
applicant_data = applicant_data.drop('School', axis=1)
applicant_data = applicant_data.drop('Next Year School', axis=1)#save

#save
applicant_data.to_csv("../data/raw/applicants_clean.csv", index=False)
applicant_data

Unnamed: 0,ID Number,Preferred Pronouns,Date of Birth,Current Grade Level,Zipcode,Program,First Paying Job,Know anyone who has been employeed,Applied Before,Speaking in front of a crowd rate,Unconfortable to work with,Extra activities,Commit Summer,Commit Weekday,Hear about source,Application Year,Current Age,Rejected
0,1489.0,masculine,2006-12-13 00:00:00,9,32218,zoocamp,Yes,,,comfortable,invertebrates,none,yes,yes,other,2022,,False
1,,masculine,2004-11-15 00:00:00,11,,other,No,,no,comfortable,invertebrates,,unsure,yes,other,2022,,True
2,1488.0,masculine,2004-11-15 00:00:00,11,,other,No,,,comfortable,invertebrates,some,yes,yes,friend,2022,,False
3,,masculine,2006-09-25 00:00:00,9,32206,other,No,,no,comfortable,none,some,yes,yes,other,2022,,True
4,,masculine,2006-06-22 00:00:00,10,32206,other,No,,,comfortable,none,some,unsure,unsure,other,2022,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,,feminine,2003-08-13 00:00:00,11,32206,,,yes,,,reptiles,,yes,yes,school,2020,16.0,True
314,,masculine,2004-08-07 00:00:00,10,32218,,Yes,,,comfortable,none,,yes,yes,other,2020,15.0,True
315,,feminine,2004-08-05 00:00:00,10,32218,,Yes,,,comfortable,invertebrates,,,yes,school,2020,15.0,True
316,,feminine,2004-07-11 00:00:00,10,32218,,Yes,yes,,,invertebrates,,yes,yes,school,2020,15.0,True


## Merge the applicant & payroll datasets

In [291]:
applicant_data = pd.read_csv("../data/raw/applicants_clean.csv")
payroll_data = pd.read_csv("../data/raw/merged_payroll.csv")

In [292]:
payroll_data.head(5)

Unnamed: 0,EECode,EarnHours,EarnRate,EarnAmount,Dist Position Desc,Year,Month
0,938,12.19,8.56,104.35,WILD Steward,2020,12
1,938,14.69,8.8,129.28,WILD Steward,2020,12
2,939,32.73,8.56,280.17,WILD Steward,2020,12
3,939,24.0,8.8,211.2,WILD Steward,2020,12
4,1160,33.9,8.56,290.18,WILD Steward,2020,12


In [293]:
full_data = pd.merge(applicant_data, payroll_data, left_on='ID Number', right_on='EECode', how='inner')

full_data.to_csv("../data/clean/full_dataset.csv", index=False)

## Mapping categorical values to numerical values for model fitting preparations 

In [239]:
full_data = pd.read_csv("../data/clean/full_dataset.csv")
full_data.head()

Unnamed: 0,ID Number,Preferred Pronouns,Date of Birth,Current Grade Level,Zipcode,Program,First Paying Job,Know anyone who has been employeed,Applied Before,Speaking in front of a crowd rate,...,Application Year,Current Age,Rejected,EECode,EarnHours,EarnRate,EarnAmount,Dist Position Desc,Year,Month
0,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,Yes,,,comfortable,...,2022,,False,1489.0,5.5,13.0,71.5,WILD Steward,2022.0,4.0
1,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,Yes,,,comfortable,...,2022,,False,1489.0,14.47,13.0,188.11,WILD Steward,2022.0,5.0
2,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,Yes,,,comfortable,...,2022,,False,1489.0,7.13,13.0,92.69,WILD Steward,2022.0,5.0
3,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,Yes,,,comfortable,...,2022,,False,1489.0,130.38,13.0,1694.94,WILD Steward,2022.0,6.0
4,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,Yes,,,comfortable,...,2022,,False,1489.0,123.8,13.0,1609.4,WILD Steward,2022.0,7.0


In [240]:
full_data['First Paying Job'] = full_data['First Paying Job'].fillna('No').map({'Yes':1,'No':0})
full_data.head()

Unnamed: 0,ID Number,Preferred Pronouns,Date of Birth,Current Grade Level,Zipcode,Program,First Paying Job,Know anyone who has been employeed,Applied Before,Speaking in front of a crowd rate,...,Application Year,Current Age,Rejected,EECode,EarnHours,EarnRate,EarnAmount,Dist Position Desc,Year,Month
0,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,comfortable,...,2022,,False,1489.0,5.5,13.0,71.5,WILD Steward,2022.0,4.0
1,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,comfortable,...,2022,,False,1489.0,14.47,13.0,188.11,WILD Steward,2022.0,5.0
2,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,comfortable,...,2022,,False,1489.0,7.13,13.0,92.69,WILD Steward,2022.0,5.0
3,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,comfortable,...,2022,,False,1489.0,130.38,13.0,1694.94,WILD Steward,2022.0,6.0
4,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,comfortable,...,2022,,False,1489.0,123.8,13.0,1609.4,WILD Steward,2022.0,7.0


In [241]:
full_data['Speaking in front of a crowd rate'].fillna(2)
full_data['Speaking in front of a crowd rate'] = full_data['Speaking in front of a crowd rate'].map({'comfortable':1,'uncomfortable':0})
full_data.head()

Unnamed: 0,ID Number,Preferred Pronouns,Date of Birth,Current Grade Level,Zipcode,Program,First Paying Job,Know anyone who has been employeed,Applied Before,Speaking in front of a crowd rate,...,Application Year,Current Age,Rejected,EECode,EarnHours,EarnRate,EarnAmount,Dist Position Desc,Year,Month
0,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,1.0,...,2022,,False,1489.0,5.5,13.0,71.5,WILD Steward,2022.0,4.0
1,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,1.0,...,2022,,False,1489.0,14.47,13.0,188.11,WILD Steward,2022.0,5.0
2,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,1.0,...,2022,,False,1489.0,7.13,13.0,92.69,WILD Steward,2022.0,5.0
3,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,1.0,...,2022,,False,1489.0,130.38,13.0,1694.94,WILD Steward,2022.0,6.0
4,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,1.0,...,2022,,False,1489.0,123.8,13.0,1609.4,WILD Steward,2022.0,7.0


In [242]:
full_data['Rejected'] = full_data['Rejected'].map({True:1,False:0})
full_data.head()

Unnamed: 0,ID Number,Preferred Pronouns,Date of Birth,Current Grade Level,Zipcode,Program,First Paying Job,Know anyone who has been employeed,Applied Before,Speaking in front of a crowd rate,...,Application Year,Current Age,Rejected,EECode,EarnHours,EarnRate,EarnAmount,Dist Position Desc,Year,Month
0,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,1.0,...,2022,,0,1489.0,5.5,13.0,71.5,WILD Steward,2022.0,4.0
1,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,1.0,...,2022,,0,1489.0,14.47,13.0,188.11,WILD Steward,2022.0,5.0
2,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,1.0,...,2022,,0,1489.0,7.13,13.0,92.69,WILD Steward,2022.0,5.0
3,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,1.0,...,2022,,0,1489.0,130.38,13.0,1694.94,WILD Steward,2022.0,6.0
4,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,,,1.0,...,2022,,0,1489.0,123.8,13.0,1609.4,WILD Steward,2022.0,7.0


In [243]:
full_data['Know anyone who has been employeed'] = full_data['Know anyone who has been employeed'].fillna('no').map({'yes':1,'no':0})
full_data.head()

Unnamed: 0,ID Number,Preferred Pronouns,Date of Birth,Current Grade Level,Zipcode,Program,First Paying Job,Know anyone who has been employeed,Applied Before,Speaking in front of a crowd rate,...,Application Year,Current Age,Rejected,EECode,EarnHours,EarnRate,EarnAmount,Dist Position Desc,Year,Month
0,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,,1.0,...,2022,,0,1489.0,5.5,13.0,71.5,WILD Steward,2022.0,4.0
1,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,,1.0,...,2022,,0,1489.0,14.47,13.0,188.11,WILD Steward,2022.0,5.0
2,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,,1.0,...,2022,,0,1489.0,7.13,13.0,92.69,WILD Steward,2022.0,5.0
3,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,,1.0,...,2022,,0,1489.0,130.38,13.0,1694.94,WILD Steward,2022.0,6.0
4,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,,1.0,...,2022,,0,1489.0,123.8,13.0,1609.4,WILD Steward,2022.0,7.0


In [244]:
full_data['Applied Before'] = full_data['Applied Before'].fillna('no').map({'yes':1,'no':0})
full_data.head()

Unnamed: 0,ID Number,Preferred Pronouns,Date of Birth,Current Grade Level,Zipcode,Program,First Paying Job,Know anyone who has been employeed,Applied Before,Speaking in front of a crowd rate,...,Application Year,Current Age,Rejected,EECode,EarnHours,EarnRate,EarnAmount,Dist Position Desc,Year,Month
0,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,0,1.0,...,2022,,0,1489.0,5.5,13.0,71.5,WILD Steward,2022.0,4.0
1,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,0,1.0,...,2022,,0,1489.0,14.47,13.0,188.11,WILD Steward,2022.0,5.0
2,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,0,1.0,...,2022,,0,1489.0,7.13,13.0,92.69,WILD Steward,2022.0,5.0
3,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,0,1.0,...,2022,,0,1489.0,130.38,13.0,1694.94,WILD Steward,2022.0,6.0
4,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,0,1.0,...,2022,,0,1489.0,123.8,13.0,1609.4,WILD Steward,2022.0,7.0


In [245]:
full_data['Extra activities'] = full_data['Extra activities'].fillna('none').map({'some':1,'none':0})
full_data.head()

Unnamed: 0,ID Number,Preferred Pronouns,Date of Birth,Current Grade Level,Zipcode,Program,First Paying Job,Know anyone who has been employeed,Applied Before,Speaking in front of a crowd rate,...,Application Year,Current Age,Rejected,EECode,EarnHours,EarnRate,EarnAmount,Dist Position Desc,Year,Month
0,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,0,1.0,...,2022,,0,1489.0,5.5,13.0,71.5,WILD Steward,2022.0,4.0
1,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,0,1.0,...,2022,,0,1489.0,14.47,13.0,188.11,WILD Steward,2022.0,5.0
2,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,0,1.0,...,2022,,0,1489.0,7.13,13.0,92.69,WILD Steward,2022.0,5.0
3,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,0,1.0,...,2022,,0,1489.0,130.38,13.0,1694.94,WILD Steward,2022.0,6.0
4,1489.0,masculine,2006-12-13 00:00:00,9.0,32218,zoocamp,1,0,0,1.0,...,2022,,0,1489.0,123.8,13.0,1609.4,WILD Steward,2022.0,7.0


In [248]:
# rename columns with awkward names
full_data = full_data.rename(columns={"Know anyone who has been employeed": "Know employee",
                                     "Speaking in front of a crowd rate": "Comfortable speaking to crowd",
                                     "Unconfortable to work with": "Uncomfortable with"})

In [249]:
# drop PII columns requested by sponsor
full_data = full_data.drop(columns=['Date of Birth','Zipcode'])

In [250]:
full_data.to_csv("../data/clean/full_dataset.csv", index=False)