# Survey Monkey Data Transformation

Original data from https://drive.google.com/drive/folders/1ZQJ37FcW4W_tYbz5nEkYX4Y1fqZzfdiF

In [1]:
# import data analysis packages
import pandas as pd
import os


# select folder path strig to variable cwd (current working directory)
pwd = os.getcwd()

# import .xlsx data file dataset (using pwd directory)
data = pd.read_excel(pwd + "\Data Output1_Edited.xlsx",
                     sheet_name="edited_data")

# create a copy of the original data
data_mod = data.copy() 


# import survey questions sparately
questions = pd.read_excel(pwd + "\Data Output1_Edited.xlsx",
                     sheet_name="question")

# create copy of questions data
quest = questions.copy()


In [3]:
# 1) Clean up data frame and unpivot into tall format

# inspect data and remove unwanted columns
print(data_mod.columns) 
columns_to_drop = ['Start Date', 'End Date', 'Email Address', 'First Name', 'Last Name', 'Custom Data 1']

data_modified = data_mod.drop(columns = columns_to_drop)


# select id_vars (constant) and vars_columns (unpivot) columns 
id_columns = ['Respondent ID', 'Identify which division you work in. - ',       # Columns up to 'Question 1' are 'demographic info' columns
       'Identify which division you work in. - Other (please specify)',         # Keep those 8 unpivoted as 'id_columns'
       'Which of the following best describes your position level? - Response', 
       'Which generation are you apart of? - Response',
       'Please select the gender in which you identify. - Response',
       'Which duration range best aligns with your tenure at your company? - Response',
       'Which of the following best describes your employment type? - Response']

vars_columns = list(data_mod.columns[8:])

# unpivoting data using .melt() with id_columns and vars_columns
data_mod1 = data_modified.melt(id_vars=id_columns,
                          value_vars=vars_columns,
                          var_name="Question", value_name="Answer")

print(data_mod1)

Index(['Respondent ID', 'Start Date', 'End Date', 'Email Address',
       'First Name', 'Last Name', 'Custom Data 1',
       'Identify which division you work in. - ',
       'Identify which division you work in. - Other (please specify)',
       'Which of the following best describes your position level? - Response',
       'Which generation are you apart of? - Response',
       'Please select the gender in which you identify. - Response',
       'Which duration range best aligns with your tenure at your company? - Response',
       'Which of the following best describes your employment type? - Response',
       'Question 1 - Response', 'Question 2 - Response',
       'Question 3 - Open-Ended Response', 'Question 4 - Response',
       'Question 4 - Other (please specify)', 'Question 5 - Response 1',
       'Question 5 - Response 2', 'Question 5 - Response 3',
       'Question 5 - Response 4', 'Question 5 - Response 5',
       'Question 5 - Response 6', 'Question 6 - Response 1',
     

In [5]:
# 2) Clean up Questions data frame and .merge() questions into data_mod1 as 'data_merged'


# select question columns of interest
quest_final = pd.DataFrame(quest.loc[:, ["Question", "Question + SubQuestion"]])

# drop any null values
quest_final.dropna(inplace=True)  

# join quest_final to data_mod1, adding 'Question + SubQuestion' column
data_merged = pd.merge(left=data_mod1,
                       right=quest_final,
                       how="left",
                       left_on="Question",
                       right_on="Question + SubQuestion")

# double-check output columns
print(data_merged.columns)

print("Original Data", len(data_mod1))                                      
print("Merged Data", len(data_merged))     


Index(['Respondent ID', 'Identify which division you work in. - ',
       'Identify which division you work in. - Other (please specify)',
       'Which of the following best describes your position level? - Response',
       'Which generation are you apart of? - Response',
       'Please select the gender in which you identify. - Response',
       'Which duration range best aligns with your tenure at your company? - Response',
       'Which of the following best describes your employment type? - Response',
       'Question_x', 'Answer', 'Question_y', 'Question + SubQuestion'],
      dtype='object')
Original Data 17028
Merged Data 17028


In [15]:
# 3) Calculate number of respondents per question data frame and .merge() results to data as 'data_merged_2'


# remove null rows in Answer column
respondents = data_merged[data_merged["Answer"].notna()] 
print(respondents)                                          # Gone from 17,000 rows to just under 10,000

# calculate number of respondants per question - use .reset_index() if indexes dissapear
respondents_calc = respondents.groupby("Question_y")["Respondent ID"].nunique().reset_index()  

# rename 'Respondent ID' column 
respondents_calc.rename(columns={"Respondent ID":"Respondent Count"}, 
                   inplace=True)

# merge calculated column to data
data_merged_2 = pd.merge(left=data_merged,
                       right=respondents_calc,
                       how="left",
                       left_on="Question_y",
                       right_on="Question_y")

# double-check output columns
print(data_merged_2.columns)

print("Original Data", len(data_merged))
print("Merged Data", len(data_merged_2))

       Respondent ID Identify which division you work in. -   \
1         2658722536                                 Finance   
2         4044163394                          Infrastructure   
3         5535865599                          Infrastructure   
5         3399511781                          Infrastructure   
6         9860597462                          Infrastructure   
...              ...                                     ...   
17017     9735550076                                  People   
17020     7325851635                          Infrastructure   
17021     3370365802    Port Security & Emergency Operations   
17023     7940065082                          Infrastructure   
17024     5157705612                                 Finance   

      Identify which division you work in. - Other (please specify)  \
1                                                    NaN              
2                                                    NaN              
3                 

In [28]:
# 4) Calculate number of respondents giving the same answer per question, .merge() results to data as 'data_merged_3'


# use 'raw' data from above for calculations
same_answer = data_merged

# calculate unique number of respondants providing same answer to each question question & subquestion combo
same_answer = same_answer.groupby(["Question + SubQuestion", "Answer"])["Respondent ID"].nunique().reset_index() 

# rename 'Respondent ID' column
same_answer.rename(columns={"Respondent ID":"Same Answer Count"},   
                   inplace=True)

# merge calculated column to data
data_merged_3 = pd.merge(left=data_merged_2,
                       right=same_answer,
                       how="left",
                       left_on=["Question + SubQuestion", "Answer"],
                       right_on=["Question + SubQuestion", "Answer"])

# double-check output columns
print(data_merged_3.columns)

print("Original Data", len(data_merged))
print("Merged Data", len(data_merged_3))


Index(['Respondent ID', 'Identify which division you work in. - ',
       'Identify which division you work in. - Other (please specify)',
       'Which of the following best describes your position level? - Response',
       'Which generation are you apart of? - Response',
       'Please select the gender in which you identify. - Response',
       'Which duration range best aligns with your tenure at your company? - Response',
       'Which of the following best describes your employment type? - Response',
       'Question_x', 'Answer', 'Question_y', 'Question + SubQuestion',
       'Respondent Count', 'Same Answer Count'],
      dtype='object')
Original Data 17028
Merged Data 17028


In [29]:
# 5) Clean up merged data and create final output file


# replace null values with 0
data_merged_3["Same Answer Count"].fillna(0, inplace=True)

# check if any null values left
data_merged_3["Same Answer Count"].isna().sum()


# find columns to rename
print(data_merged_3.columns)

# make a copy to edit and use as output
output = data_merged_3.copy()               

# rename demographic questions columns to shorter names
output.rename(columns={'Identify which division you work in. - ':'Division Primary',
       'Identify which division you work in. - Other (please specify)':'Division Secondary',
       'Which of the following best describes your position level? - Response':'Position',
       'Which generation are you apart of? - Response': 'Generation',
       'Please select the gender in which you identify. - Response':'Gender',
       'Which duration range best aligns with your tenure at your company? - Response':'Tenure',
       'Which of the following best describes your employment type? - Response':'Employment Type'},
       inplace=True)

print(output)


Index(['Respondent ID', 'Identify which division you work in. - ',
       'Identify which division you work in. - Other (please specify)',
       'Which of the following best describes your position level? - Response',
       'Which generation are you apart of? - Response',
       'Please select the gender in which you identify. - Response',
       'Which duration range best aligns with your tenure at your company? - Response',
       'Which of the following best describes your employment type? - Response',
       'Question_x', 'Answer', 'Question_y', 'Question + SubQuestion',
       'Respondent Count', 'Same Answer Count'],
      dtype='object')
       Respondent ID        Division Primary Division Secondary  \
0         5379192392          Infrastructure                NaN   
1         2658722536                 Finance                NaN   
2         4044163394          Infrastructure                NaN   
3         5535865599          Infrastructure                NaN   
4         

In [38]:
# 6) Clean up output data frame and export final_output


# drop columns 'Question_x','Answer', 'Question_y'
output_clean = output.drop(columns=["Question_x", "Answer", "Question_y"]) 

# export final output file
writer = pd.ExcelWriter("Final_Output.xlsx", engine="xlsxwriter")
output_clean.to_excel(writer, sheet_name="data",
                      index=False) 
writer.close()
