In [2]:
import pandas as pd
import numpy as np

In [3]:
survey_results = pd.read_csv(filepath_or_buffer="data/survey_results_public.csv", header=0)

In [4]:
survey_results.columns

Index(['ResponseId', 'Q120', 'MainBranch', 'Age', 'Employment', 'RemoteWork',
       'CodingActivities', 'EdLevel', 'LearnCode', 'LearnCodeOnline',
       'LearnCodeCoursesCert', 'YearsCode', 'YearsCodePro', 'DevType',
       'OrgSize', 'PurchaseInfluence', 'TechList', 'BuyNewTool', 'Country',
       'Currency', 'CompTotal', 'LanguageHaveWorkedWith',
       'LanguageWantToWorkWith', 'DatabaseHaveWorkedWith',
       'DatabaseWantToWorkWith', 'PlatformHaveWorkedWith',
       'PlatformWantToWorkWith', 'WebframeHaveWorkedWith',
       'WebframeWantToWorkWith', 'MiscTechHaveWorkedWith',
       'MiscTechWantToWorkWith', 'ToolsTechHaveWorkedWith',
       'ToolsTechWantToWorkWith', 'NEWCollabToolsHaveWorkedWith',
       'NEWCollabToolsWantToWorkWith', 'OpSysPersonal use',
       'OpSysProfessional use', 'OfficeStackAsyncHaveWorkedWith',
       'OfficeStackAsyncWantToWorkWith', 'OfficeStackSyncHaveWorkedWith',
       'OfficeStackSyncWantToWorkWith', 'AISearchHaveWorkedWith',
       'AISearchWan

In [5]:
def unique_a_b(series_a, series_b, second_col=True):
    series_a_arr = series_a.str.split(";")
    a_b = exploded_series_a = series_a_arr.explode().unique()
    if second_col:
        series_b_arr = series_b.str.split(";")
        exploded_series_b = series_b_arr.explode().unique()
        a_b = np.concatenate([exploded_series_a, exploded_series_b])
    unique_a_b = np.array([e for e in a_b if str(e) != 'nan'])
    return unique_a_b

def get_boolean_cols(df: pd.DataFrame, col_name, col_2_name):
    """
    Given dataframe and 2 column names, it will find uniques across the 2 column names and then create
    extra columns in a copy of the df that are booleans for each unique value as to whether the entry has it or not.
    Note: Can also pass in just 1 column name
    """
    series_a_arr = df[col_name].str.split(";")
    new_df = df.copy(deep=True)
    new_df[col_name] = series_a_arr
    col_2 = df[col_2_name] if col_2_name else None
    col_2_bool = True if col_2_name else None
    unique_values = unique_a_b(df[col_name], col_2, col_2_bool)
    for val in unique_values:
        new_df[f"{col_name}={val}"] = new_df.apply(lambda x: 1 if str(x[col_name]) != 'nan' and val in x[col_name] else 0, axis=1)
    return new_df
        


In [6]:
# example usage
lang_cols = get_boolean_cols(survey_results, "LanguageHaveWorkedWith", "LanguageWantToWorkWith")
lang_cols["LanguageHaveWorkedWith=Python"]

0        0
1        1
2        0
3        0
4        0
        ..
89179    0
89180    1
89181    1
89182    1
89183    0
Name: LanguageHaveWorkedWith=Python, Length: 89184, dtype: int64

In [26]:
survey_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89184 entries, 0 to 89183
Data columns (total 81 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   ResponseId                           89184 non-null  int64  
 1   MainBranch                           89184 non-null  object 
 2   Age                                  89184 non-null  object 
 3   Employment                           87898 non-null  object 
 4   RemoteWork                           73810 non-null  object 
 5   CodingActivities                     73764 non-null  object 
 6   EdLevel                              87973 non-null  object 
 7   LearnCode                            87663 non-null  object 
 8   LearnCodeOnline                      70084 non-null  object 
 9   LearnCodeCoursesCert                 37076 non-null  object 
 10  YearsCode                            87435 non-null  object 
 11  YearsCodePro                

In [7]:
print(survey_results['Age'].unique())

['18-24 years old' '25-34 years old' '45-54 years old' '35-44 years old'
 'Under 18 years old' '55-64 years old' '65 years or older'
 'Prefer not to say']


In [8]:
survey_results = survey_results.drop('Q120', axis=1)

In [23]:
survey_results = survey_results.drop('SurveyLength', axis=1)

Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,LearnCodeCoursesCert,...,Knowledge_8,Frequency_1,Frequency_2,Frequency_3,TimeSearching,TimeAnswering,ProfessionalTech,Industry,SurveyEase,ConvertedCompYearly
0,1,None of these,18-24 years old,,,,,,,,...,,,,,,,,,,
1,2,I am a developer by profession,25-34 years old,"Employed, full-time",Remote,Hobby;Contribute to open-source projects;Boots...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;Friend or fam...,Formal documentation provided by the owner of ...,Other,...,Strongly agree,1-2 times a week,10+ times a week,Never,15-30 minutes a day,15-30 minutes a day,DevOps function;Microservices;Automated testin...,"Information Services, IT, Software Development...",Easy,285000.0
2,3,I am a developer by profession,45-54 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby;Professional development or self-paced l...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Formal documentation provided by the owner of ...,,...,Agree,6-10 times a week,6-10 times a week,3-5 times a week,30-60 minutes a day,30-60 minutes a day,DevOps function;Microservices;Automated testin...,"Information Services, IT, Software Development...",Easy,250000.0
3,4,I am a developer by profession,25-34 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Colleague;Friend or family member;Other online...,Formal documentation provided by the owner of ...,,...,Agree,1-2 times a week,10+ times a week,1-2 times a week,15-30 minutes a day,30-60 minutes a day,Automated testing;Continuous integration (CI) ...,,Easy,156000.0
4,5,I am a developer by profession,25-34 years old,"Employed, full-time;Independent contractor, fr...",Remote,Hobby;Contribute to open-source projects;Profe...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Online Courses or Certi...,Formal documentation provided by the owner of ...,Other;Codecademy;edX,...,Agree,1-2 times a week,1-2 times a week,3-5 times a week,60-120 minutes a day,30-60 minutes a day,Microservices;Automated testing;Observability ...,Other,Neither easy nor difficult,23456.0


In [9]:
survey_results = survey_results.drop("SurveyEase",axis=1)
survey_results.head()


Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,LearnCodeCoursesCert,...,Knowledge_8,Frequency_1,Frequency_2,Frequency_3,TimeSearching,TimeAnswering,ProfessionalTech,Industry,SurveyLength,ConvertedCompYearly
0,1,None of these,18-24 years old,,,,,,,,...,,,,,,,,,,
1,2,I am a developer by profession,25-34 years old,"Employed, full-time",Remote,Hobby;Contribute to open-source projects;Boots...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;Friend or fam...,Formal documentation provided by the owner of ...,Other,...,Strongly agree,1-2 times a week,10+ times a week,Never,15-30 minutes a day,15-30 minutes a day,DevOps function;Microservices;Automated testin...,"Information Services, IT, Software Development...",Appropriate in length,285000.0
2,3,I am a developer by profession,45-54 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby;Professional development or self-paced l...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Formal documentation provided by the owner of ...,,...,Agree,6-10 times a week,6-10 times a week,3-5 times a week,30-60 minutes a day,30-60 minutes a day,DevOps function;Microservices;Automated testin...,"Information Services, IT, Software Development...",Appropriate in length,250000.0
3,4,I am a developer by profession,25-34 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Colleague;Friend or family member;Other online...,Formal documentation provided by the owner of ...,,...,Agree,1-2 times a week,10+ times a week,1-2 times a week,15-30 minutes a day,30-60 minutes a day,Automated testing;Continuous integration (CI) ...,,Appropriate in length,156000.0
4,5,I am a developer by profession,25-34 years old,"Employed, full-time;Independent contractor, fr...",Remote,Hobby;Contribute to open-source projects;Profe...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Online Courses or Certi...,Formal documentation provided by the owner of ...,Other;Codecademy;edX,...,Agree,1-2 times a week,1-2 times a week,3-5 times a week,60-120 minutes a day,30-60 minutes a day,Microservices;Automated testing;Observability ...,Other,Appropriate in length,23456.0


In [11]:
onlineCourseAndCerts = get_boolean_cols(survey_results, "LearnCodeCoursesCert",col_2_name=False)


In [12]:
onlineCourseAndCerts.head()

Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,LearnCodeCoursesCert,...,SurveyLength,ConvertedCompYearly,LearnCodeCoursesCert=Other,LearnCodeCoursesCert=Codecademy,LearnCodeCoursesCert=edX,LearnCodeCoursesCert=Udemy,LearnCodeCoursesCert=Pluralsight,LearnCodeCoursesCert=Coursera,LearnCodeCoursesCert=Udacity,LearnCodeCoursesCert=Skillsoft
0,1,None of these,18-24 years old,,,,,,,,...,,,0,0,0,0,0,0,0,0
1,2,I am a developer by profession,25-34 years old,"Employed, full-time",Remote,Hobby;Contribute to open-source projects;Boots...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;Friend or fam...,Formal documentation provided by the owner of ...,[Other],...,Appropriate in length,285000.0,1,0,0,0,0,0,0,0
2,3,I am a developer by profession,45-54 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby;Professional development or self-paced l...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Formal documentation provided by the owner of ...,,...,Appropriate in length,250000.0,0,0,0,0,0,0,0,0
3,4,I am a developer by profession,25-34 years old,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Colleague;Friend or family member;Other online...,Formal documentation provided by the owner of ...,,...,Appropriate in length,156000.0,0,0,0,0,0,0,0,0
4,5,I am a developer by profession,25-34 years old,"Employed, full-time;Independent contractor, fr...",Remote,Hobby;Contribute to open-source projects;Profe...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Online Courses or Certi...,Formal documentation provided by the owner of ...,"[Other, Codecademy, edX]",...,Appropriate in length,23456.0,1,1,1,0,0,0,0,0


In [14]:

binary_cols = [col for col in survey_results.columns if set(survey_results[col]) == {0, 1}]

# Calculate the correlation between each binary column and 'Converted Comp'
correlations = {}
for col in binary_cols:
    correlations[col] = survey_results[col].corr(survey_results['ConvertedCompYearly'])
    
print("Correlation of each binary column with 'pay':")
for col, corr in correlations.items():
    print(f"{col}: {corr}")

Correlation of each binary column with 'pay':
