In [7]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import pandas as pd

In [8]:
# TODO: temporarily reads the filtered file Tanmay made; update this cell to use
# the script that generates the filtered dataset.
# NOTE(review): the path below is absolute and machine-specific.
train_df_filtered = pd.read_csv(
    filepath_or_buffer='/Users/jonah/Downloads/filtered_data.csv'
)

# Summary statistics for every column, numeric and non-numeric alike.
train_df_filtered.describe(include='all')

Unnamed: 0.1,Unnamed: 0,MainBranch,Employment,RemoteWork,EdLevel,YearsCode,YearsCodePro,DevType,OrgSize,Country,...,ToolsTechHaveWorkedWith,NEWCollabToolsHaveWorkedWith,OpSysProfessional use,VersionControlSystem,VCInteraction,OfficeStackAsyncHaveWorkedWith,Age,WorkExp,ICorPM,ConvertedCompYearly
count,8042.0,8042,8042,8042,8042,8042.0,8042.0,8042,8042,8042,...,6444,7998,7792,8042,7950,5959,8042,5610.0,5558,8042.0
unique,,2,9,3,9,52.0,51.0,2110,10,2,...,441,2422,40,16,15,398,8,,2,
top,,I am a developer by profession,"Employed, full-time",Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",10.0,5.0,"Developer, full-stack","10,000 or more employees",United States of America,...,Docker,Visual Studio Code,macOS,Git,Command-line,Confluence;Jira Work Management,25-34 years old,,Independent contributor,
freq,,7340,7239,4960,4601,612.0,560.0,1217,1589,6874,...,592,777,2287,7119,2177,1256,3413,,4799,
mean,36128.701069,,,,,,,,,,...,,,,,,,,13.129768,,352640.6
std,21238.664698,,,,,,,,,,...,,,,,,,,10.026063,,1169285.0
min,11.0,,,,,,,,,,...,,,,,,,,0.0,,36.0
25%,15715.0,,,,,,,,,,...,,,,,,,,5.0,,97605.0
50%,39136.0,,,,,,,,,,...,,,,,,,,10.0,,138000.0
75%,53352.0,,,,,,,,,,...,,,,,,,,19.0,,195157.5


In [9]:
def multianswer_col_trans(df, col_name):
    """Count-encode a column whose cells hold several ';'-separated answers.

    Each distinct answer becomes its own column named
    ``<col_name>_<answer>`` holding how many times that answer occurs in the
    row's cell. Missing cells are encoded as the single answer
    ``'unspecified'``. Answers are lower-cased by ``CountVectorizer``'s
    default settings, so the generated column names are lower-case.

    The input frame is NOT modified: the missing-value fill is applied to a
    local copy of the column instead of being written back into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``.
    col_name : str
        Name of the multi-answer column to encode.

    Returns
    -------
    pandas.DataFrame
        One column per distinct answer, indexed like ``df[col_name]``.
    """
    # fillna returns a new Series, so the caller's frame is left untouched.
    answers = df[col_name].fillna('unspecified')

    # token_pattern=None: the default regex is unused when a custom tokenizer
    # is supplied, and leaving it set makes scikit-learn emit a warning.
    vectorizer = CountVectorizer(
        tokenizer=lambda text: text.split(';'),
        token_pattern=None,
    )

    # Learn the vocabulary and encode in a single pass.
    counts = vectorizer.fit_transform(answers)

    encoded = pd.DataFrame(
        counts.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=answers.index,
    )
    return encoded.add_prefix(col_name + "_")

In [10]:
# Maps a years-of-experience survey answer to a float
def convert2float(x):
    """Convert a years answer to ``float``.

    The two textual answers are mapped to their boundary values
    ('Less than 1 year' -> 0.0, 'More than 50 years' -> 50.0); any other
    value is passed to ``float()`` unchanged.
    """
    if x == 'Less than 1 year':
        return 0.0
    if x == 'More than 50 years':
        return 50.0
    return float(x)

In [11]:

# Convert the string year values in both experience columns to floats.
for years_col in ('YearsCode', 'YearsCodePro'):
    train_df_filtered[years_col] = train_df_filtered[years_col].apply(convert2float)

In [12]:
# Category orderings for the ordinal columns. The position of an entry in
# each list is the rank it is given, lowest first.
education_order = [
    'Something else',
    'Primary/elementary school',
    'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
    'Some college/university study without earning a degree',
    'Associate degree (A.A., A.S., etc.)',
    "Bachelor’s degree (B.A., B.S., B.Eng., etc.)",
    "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",
    'Professional degree (JD, MD, etc.)',
    'Other doctoral degree (Ph.D., Ed.D., etc.)',
]

age_order = [
    'Prefer not to say',
    'Under 18 years old',
    '18-24 years old',
    '25-34 years old',
    '35-44 years old',
    '45-54 years old',
    '55-64 years old',
    '65 years or older',
]

In [13]:
# Column groups used by the preprocessing steps.

# Columns whose cells hold several ';'-separated answers.
multianswer_cols = [
    'DevType',
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
    'OpSysProfessional use',
    'VCInteraction',
    'VersionControlSystem',
    'OfficeStackAsyncHaveWorkedWith',
    'Employment',
]

numeric_cols = [
    'YearsCode',
    'YearsCodePro',
    'WorkExp',
]

ordinal_edu = ['EdLevel']
ordinal_age = ['Age']

binary_cols = [
    'MainBranch',
    'Country',
]

categorical_cols = [
    'OrgSize',
    'RemoteWork',
]

# Columns passed to the column transformer only to be dropped.
drop_cols = [
    'ICorPM',
    'Unnamed: 0',
]

# Every transformer is wrapped in a pipeline, including the single-step ones:
# the fitted transformers are later looked up by their auto-generated
# 'pipeline-N' names, so the wrappers and their order must stay as they are.

# Numeric columns: fill missing values with the most frequent value, then standardise.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    StandardScaler(),
)

# Ordinal columns: integer codes following the explicit category orderings.
ordinal_edu_transformer = make_pipeline(
    OrdinalEncoder(categories=[education_order], dtype=int),
)
ordinal_age_transformer = make_pipeline(
    OrdinalEncoder(categories=[age_order], dtype=int),
)

# Binary columns: one-hot encoding that drops a column for two-category features.
binary_transformer = make_pipeline(
    OneHotEncoder(drop='if_binary', dtype=int),
)

# Remaining categoricals: dense one-hot encoding that ignores unknown categories.
# NOTE(review): `sparse` is spelled `sparse_output` in newer scikit-learn
# releases - confirm against the installed version before upgrading.
categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown='ignore', sparse=False),
)

# Assemble the column transformer; the final entry drops the unused columns.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_cols),
    (ordinal_edu_transformer, ordinal_edu),
    (ordinal_age_transformer, ordinal_age),
    (binary_transformer, binary_cols),
    (categorical_transformer, categorical_cols),
    ('drop', drop_cols),
)

In [14]:
# Display the column transformer as this cell's output so its steps and
# column assignments can be inspected.
preprocessor

In [15]:
# columns that don't contain multiple answers per response
column_names = (numeric_cols + ordinal_edu + ordinal_age + binary_cols + categorical_cols + drop_cols)

# subset train df
non_multianswer_train_df = train_df_filtered[column_names]

# Fit the preprocessor. Only the fitted state is needed in this cell, so use
# fit() rather than fit_transform(), whose result was being discarded.
preprocessor.fit(non_multianswer_train_df)

# Collect the output column names from every fitted transformer, in the order
# the column transformer emits them. Entries whose "transformer" is a plain
# string (e.g. the 'drop' step) produce no output columns and are skipped.
# Walking `transformers_` avoids hard-coding how many pipelines exist and
# what their auto-generated names are.
transformed_column_names = [
    feature_name
    for _, fitted_transformer, _ in preprocessor.transformers_
    if not isinstance(fitted_transformer, str)
    for feature_name in fitted_transformer.get_feature_names_out().tolist()
]


In [16]:
# fit and transform the columns that don't contain multiple answers per response.
# The source frame's index is carried over so that a later column-wise concat
# with frames indexed like the training data lines up row for row, even when
# that index is not the default 0..n-1 range.
transformed_train_df = pd.DataFrame(
    preprocessor.fit_transform(non_multianswer_train_df),
    columns=transformed_column_names,
    index=non_multianswer_train_df.index,
)

transformed_train_df

Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",OrgSize_100 to 499 employees,OrgSize_2 to 9 employees,OrgSize_20 to 99 employees,"OrgSize_5,000 to 9,999 employees",OrgSize_500 to 999 employees,OrgSize_I don’t know,"OrgSize_Just me - I am a freelancer, sole proprietor, etc.",RemoteWork_Full in-person,RemoteWork_Fully remote,"RemoteWork_Hybrid (some remote, some in-person)"
0,-0.537364,-0.168082,-0.256988,5.0,4.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.287344,-1.115992,-0.256988,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.493859,1.095797,-0.256988,6.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.243839,0.569180,-0.256988,5.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.306364,0.885150,-0.256988,6.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8037,-0.162374,0.042564,0.920079,5.0,4.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8038,-1.006101,-0.800022,-0.963228,4.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8039,0.587606,0.042564,-0.021574,6.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8040,0.025121,-0.378729,0.213839,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [17]:
# creates final transformed train df

# count vectorizer encoding for each multi answer column
multianswer_frames = [
    multianswer_col_trans(train_df_filtered, column)
    for column in multianswer_cols
]

# Join everything with a single concat instead of growing the frame inside a
# loop. Starting from the preprocessor's own columns only makes the cell safe
# to re-run: the multi-answer encodings are not appended a second time.
transformed_train_df = pd.concat(
    [transformed_train_df[transformed_column_names], *multianswer_frames],
    axis=1,
)

transformed_train_df




Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",...,OfficeStackAsyncHaveWorkedWith_swit,OfficeStackAsyncHaveWorkedWith_trello,OfficeStackAsyncHaveWorkedWith_unspecified,OfficeStackAsyncHaveWorkedWith_wimi,OfficeStackAsyncHaveWorkedWith_workzone,OfficeStackAsyncHaveWorkedWith_wrike,"Employment_employed, full-time","Employment_employed, part-time","Employment_independent contractor, freelancer, or self-employed",Employment_retired
0,-0.537364,-0.168082,-0.256988,5.0,4.0,0.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,1,0,0,0
1,-1.287344,-1.115992,-0.256988,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,1,0,0,0
2,0.493859,1.095797,-0.256988,6.0,6.0,0.0,1.0,0.0,0.0,0.0,...,0,0,1,0,0,0,1,0,0,0
3,1.243839,0.569180,-0.256988,5.0,4.0,0.0,1.0,0.0,0.0,0.0,...,0,0,1,0,0,0,1,0,0,0
4,0.306364,0.885150,-0.256988,6.0,5.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8037,-0.162374,0.042564,0.920079,5.0,4.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
8038,-1.006101,-0.800022,-0.963228,4.0,4.0,0.0,1.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,0,0,0
8039,0.587606,0.042564,-0.021574,6.0,4.0,0.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
8040,0.025121,-0.378729,0.213839,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
