In [54]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import train_test_split

In [55]:
# Uses the file Tanmay made for now.
# TODO: update this cell to use the script that generates the filtered dataset.
df_filtered = pd.read_csv('data/processed/filtered_data.csv')

In [56]:
# Keep only the rows whose ConvertedCompYearly is below 1,000,000.
df_filtered = df_filtered[df_filtered['ConvertedCompYearly'] < 1000000]

In [57]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split reproducible.
train_df_filtered, test_df_filtered = train_test_split(
    df_filtered,
    test_size=0.10,
    random_state=123,
)

# Summary statistics for every column (numeric and non-numeric) of the training split.
train_df_filtered.describe(include='all')

Unnamed: 0.1,Unnamed: 0,MainBranch,Employment,RemoteWork,EdLevel,YearsCode,YearsCodePro,DevType,OrgSize,Country,...,ToolsTechHaveWorkedWith,NEWCollabToolsHaveWorkedWith,OpSysProfessional use,VersionControlSystem,VCInteraction,OfficeStackAsyncHaveWorkedWith,Age,WorkExp,ICorPM,ConvertedCompYearly
count,6869.0,6869,6869,6869,6869,6869.0,6869.0,6869,6869,6869,...,5501,6831,6666,6869,6782,5080,6869,4815.0,4770,6869.0
unique,,2,9,3,9,52.0,51.0,1849,10,2,...,415,2147,40,16,15,369,8,,2,
top,,I am a developer by profession,"Employed, full-time",Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",10.0,5.0,"Developer, full-stack",100 to 499 employees,United States of America,...,Docker,Visual Studio Code,macOS,Git,Command-line,Confluence;Jira Work Management,25-34 years old,,Independent contributor,
freq,,6269,6200,4248,3937,533.0,479.0,1058,1360,5855,...,503,678,1954,6078,1833,1078,2930,,4123,
mean,36025.545494,,,,,,,,,,...,,,,,,,,13.131464,,153164.559325
std,21272.728685,,,,,,,,,,...,,,,,,,,10.054626,,102034.358668
min,11.0,,,,,,,,,,...,,,,,,,,0.0,,49.0
25%,15574.0,,,,,,,,,,...,,,,,,,,5.0,,95000.0
50%,39144.0,,,,,,,,,,...,,,,,,,,10.0,,132000.0
75%,53348.0,,,,,,,,,,...,,,,,,,,19.0,,180000.0


In [58]:
def multianswer_col_trans(df, col_name):
    """Encode a ';'-separated multi-answer column as one count column per answer.

    Missing responses are treated as the single answer 'unspecified'. Each
    response is split on ';' and every distinct answer (lower-cased by
    CountVectorizer's default settings) becomes a column named
    '<col_name>_<answer>'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is read only; the filled values are NOT
        written back into it.
    col_name : str
        Name of the multi-answer column to encode.

    Returns
    -------
    pandas.DataFrame
        Answer counts with the same index as ``df``.
    """
    # Fill on a copy of the column: writing back into ``df`` would silently
    # change the caller's frame (and is a chained-assignment write when ``df``
    # is a slice such as the output of train_test_split).
    answers = df[col_name].fillna('unspecified')

    # token_pattern=None: the default pattern is ignored once a custom
    # tokenizer is supplied, and leaving it set only produces a warning.
    cv = CountVectorizer(tokenizer=lambda text: text.split(';'), token_pattern=None)
    counts = cv.fit_transform(answers)

    df_encoded = pd.DataFrame(counts.toarray(),
                              columns=cv.get_feature_names_out(),
                              index=answers.index)
    return df_encoded.add_prefix(col_name + "_")

In [59]:
# Returns float values for different string inputs
def convert2float(x):
    """Convert a years-of-experience survey answer to a float.

    The two textual answers map to fixed values ('More than 50 years' -> 50.0,
    'Less than 1 year' -> 0.0); anything else is passed to ``float``.
    """
    sentinel_values = {
        'More than 50 years': float(50),
        'Less than 1 year': float(0),
    }
    for label, years in sentinel_values.items():
        if x == label:
            return years
    return float(x)

In [60]:

# Convert the string-valued year columns to floats.
# `assign` returns a new frame instead of writing into the slice produced by
# train_test_split, so pandas has no chained-assignment write to warn about
# (SettingWithCopyWarning). convert2float is passed directly; no lambda needed.
train_df_filtered = train_df_filtered.assign(
    YearsCode=train_df_filtered['YearsCode'].apply(convert2float),
    YearsCodePro=train_df_filtered['YearsCodePro'].apply(convert2float),
)

In [61]:
# Category orderings for the ordinal columns, lowest rank first.
# The position of each label in its list becomes its integer code.
education_order = [
    'Something else',
    'Primary/elementary school',
    'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
    'Some college/university study without earning a degree',
    'Associate degree (A.A., A.S., etc.)',
    "Bachelor’s degree (B.A., B.S., B.Eng., etc.)",
    "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",
    'Professional degree (JD, MD, etc.)',
    'Other doctoral degree (Ph.D., Ed.D., etc.)',
]

age_order = [
    'Prefer not to say',
    'Under 18 years old',
    '18-24 years old',
    '25-34 years old',
    '35-44 years old',
    '45-54 years old',
    '55-64 years old',
    '65 years or older',
]

In [62]:
# Column groups, one per preprocessing treatment.

# Columns whose responses hold several ';'-separated answers.
multianswer_cols = [
    'DevType',
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
    'OpSysProfessional use',
    'VCInteraction',
    'VersionControlSystem',
    'OfficeStackAsyncHaveWorkedWith',
    'Employment',
]

# Numeric features.
numeric_cols = [
    'YearsCode',
    'YearsCodePro',
    'WorkExp',
]

# Ordinal features, each with its own category order.
ordinal_edu = ['EdLevel']
ordinal_age = ['Age']

# Features one-hot encoded with the binary transformer.
binary_cols = [
    'MainBranch',
    'Country',
]

# Features one-hot encoded with the categorical transformer.
categorical_cols = [
    'OrgSize',
    'RemoteWork',
]

# Columns kept as-is (the compensation target) and columns discarded.
passthrough_cols = ['ConvertedCompYearly']
drop_cols = [
    'ICorPM',
    'Unnamed: 0',
]

# Numeric columns: fill missing values with the most frequent value, then standardise.
numeric_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), StandardScaler())

# Education level: integer code taken from the label's position in `education_order`.
ordinal_edu_transformer = make_pipeline(OrdinalEncoder(categories=[education_order], dtype=int))

# Age bracket: integer code taken from the label's position in `age_order`.
ordinal_age_transformer = make_pipeline(OrdinalEncoder(categories=[age_order], dtype=int))

# One-hot encoding that keeps a single 0/1 column for features with exactly two categories.
binary_transformer = make_pipeline(OneHotEncoder(drop='if_binary', dtype=int))

# Dense one-hot encoding; categories unseen during fit are encoded as all zeros.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output` and later
# removed the old name - confirm which version this notebook is pinned to.
categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse=False))


# Combine the per-group transformers. make_column_transformer names the five
# pipelines 'pipeline-1' ... 'pipeline-5' in the order listed here, and a later
# cell looks them up by those names, so do not reorder these entries.
preprocessor = make_column_transformer(
               (numeric_transformer, numeric_cols),
               (ordinal_edu_transformer, ordinal_edu),
               (ordinal_age_transformer, ordinal_age),
               (binary_transformer, binary_cols),
               (categorical_transformer, categorical_cols),
               ('passthrough', passthrough_cols),
               ('drop', drop_cols)
               
)

In [63]:
# Display the preprocessor to inspect its transformers and their column assignments.
preprocessor

In [64]:
# All columns that hold a single answer per response (everything the
# preprocessor handles, i.e. every group except the multi-answer columns).
column_names = [
    *numeric_cols,
    *ordinal_edu,
    *ordinal_age,
    *binary_cols,
    *categorical_cols,
    *drop_cols,
    *passthrough_cols,
]

# Restrict the training frame to those columns.
non_multianswer_train_df = train_df_filtered[column_names]

# Fit the preprocessor so the fitted pipelines can report their output names;
# the transformed array returned here is not used in this cell.
preprocessor.fit_transform(non_multianswer_train_df)


# Collect the output column names of the five fitted pipelines, in order,
# followed by the passthrough column(s).
transformed_column_names = []

for pipeline_number in range(1, 6):
    fitted_pipeline = preprocessor.named_transformers_[f'pipeline-{pipeline_number}']
    transformed_column_names.extend(fitted_pipeline.get_feature_names_out().tolist())

transformed_column_names.extend(passthrough_cols)

In [65]:
# Fit and transform the single-answer columns, then label the resulting array
# with the output names collected from the fitted preprocessor.
transformed_values = preprocessor.fit_transform(non_multianswer_train_df)

transformed_train_df = pd.DataFrame(transformed_values, columns=transformed_column_names)

transformed_train_df

Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",...,OrgSize_2 to 9 employees,OrgSize_20 to 99 employees,"OrgSize_5,000 to 9,999 employees",OrgSize_500 to 999 employees,OrgSize_I don’t know,"OrgSize_Just me - I am a freelancer, sole proprietor, etc.",RemoteWork_Full in-person,RemoteWork_Fully remote,"RemoteWork_Hybrid (some remote, some in-person)",ConvertedCompYearly
0,1.238008,0.042183,0.094269,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,160000.0
1,-0.446855,-0.272881,-0.257080,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,130000.0
2,-0.072441,0.042183,-0.022847,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,115000.0
3,-0.072441,0.042183,-0.022847,3.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,100000.0
4,1.050801,1.617504,4.427577,8.0,7.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,120000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6864,-0.446855,-0.482924,-0.842662,5.0,3.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,130000.0
6865,1.050801,1.302439,-0.257080,5.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,130000.0
6866,-0.634062,-0.692967,-0.257080,5.0,3.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,62467.0
6867,-0.540458,-0.587945,-0.725546,5.0,3.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,230000.0


In [66]:
# Create the final transformed training frame: the preprocessed single-answer
# columns plus the count-vectorised multi-answer columns.

# Encode every multi-answer column first. The index is reset so the rows line
# up positionally with transformed_train_df, which has a fresh 0..n-1 index.
multianswer_frames = [
    multianswer_col_trans(train_df_filtered, column).reset_index(drop=True)
    for column in multianswer_cols
]

# Concatenate once rather than growing the frame inside a loop. Starting from
# the base columns in `transformed_column_names` keeps the cell idempotent:
# re-running it does not append the encoded columns a second time.
transformed_train_df = pd.concat(
    [transformed_train_df[transformed_column_names], *multianswer_frames],
    axis=1,
)

transformed_train_df




Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",...,OfficeStackAsyncHaveWorkedWith_swit,OfficeStackAsyncHaveWorkedWith_trello,OfficeStackAsyncHaveWorkedWith_unspecified,OfficeStackAsyncHaveWorkedWith_wimi,OfficeStackAsyncHaveWorkedWith_workzone,OfficeStackAsyncHaveWorkedWith_wrike,"Employment_employed, full-time","Employment_employed, part-time","Employment_independent contractor, freelancer, or self-employed",Employment_retired
0,1.238008,0.042183,0.094269,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,0,0,0
1,-0.446855,-0.272881,-0.257080,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2,-0.072441,0.042183,-0.022847,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,0,0,0
3,-0.072441,0.042183,-0.022847,3.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,1,0
4,1.050801,1.617504,4.427577,8.0,7.0,0.0,1.0,0.0,0.0,1.0,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6864,-0.446855,-0.482924,-0.842662,5.0,3.0,0.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,1,0,0,0
6865,1.050801,1.302439,-0.257080,5.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,1,0,0,0
6866,-0.634062,-0.692967,-0.257080,5.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
6867,-0.540458,-0.587945,-0.725546,5.0,3.0,0.0,1.0,0.0,0.0,1.0,...,0,0,1,0,0,0,1,0,0,0


In [67]:
# Separate the compensation target from the feature matrix.
y_train = transformed_train_df["ConvertedCompYearly"]
X_train = transformed_train_df.drop(columns=["ConvertedCompYearly"])

# Feature Selection

In [68]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

In [69]:
# X_train is already fully preprocessed, so this transformer only passes every
# column through unchanged.
preprocessor_fs = make_column_transformer(
    ('passthrough', X_train.columns)
)

# Model-based selection driven by a Ridge fit: SelectFromModel keeps the
# features whose importance is at least 0.8 x the mean importance.
select_lr = SelectFromModel(Ridge(), threshold="0.8*mean")

pipe_rf_model_based = make_pipeline(
    preprocessor_fs, select_lr, RandomForestRegressor(random_state=16)
)
# Fitting the pipeline fits preprocessor_fs as its first step, so no separate
# preprocessor_fs.fit_transform(X_train) call is needed beforehand.
pipe_rf_model_based.fit(X_train, y_train)

In [70]:
# Boolean mask from the fitted selector step: True for each feature it kept.
model_based_mask = pipe_rf_model_based.named_steps["selectfrommodel"].get_support()

# Names of the kept features, first as an Index and then as a plain list.
mb_selected_feats = X_train.columns[model_based_mask]
fs = list(mb_selected_feats)

# Model Selection

In [71]:
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score, cross_validate

In [72]:
# Per-model cross-validation summaries, keyed by model name.
cross_val_results_reg = {}

# Candidate regressors to compare.
models = {
    "Baseline": DummyRegressor(),
    "KNN Regressor": KNeighborsRegressor(),
    "Ridge": Ridge(),
    # Seeded so the cross-validation table is reproducible across runs
    # (same seed as the train/test split).
    "Random Forest Regressor": RandomForestRegressor(random_state=123),
    "Lasso": Lasso(),
}

# Scorers passed to cross_validate (short label -> scikit-learn scorer name).
score_types_reg = {
    #"neg_mean_squared_error": "neg_mean_squared_error",
    #"neg_root_mean_squared_error": "neg_root_mean_squared_error",
    "neg_mape": "neg_mean_absolute_percentage_error", 
    "r2": "r2",
}

In [73]:
def corss_validate_result(model_name, model_type):
    """Cross-validate one model and store its score summary.

    Wraps ``model_type`` in a pipeline, cross-validates it on the module-level
    ``X_train`` / ``y_train`` with the scorers in ``score_types_reg`` (train
    scores included), and stores the mean and standard deviation of every
    reported quantity, rounded to 3 decimals, in
    ``cross_val_results_reg[model_name]``. Nothing is returned.

    The misspelled function name is kept because existing cells call it.

    Parameters
    ----------
    model_name : str
        Key under which the summary is stored.
    model_type : estimator
        Unfitted scikit-learn estimator to evaluate.
    """
    estimator = make_pipeline(model_type)
    fold_scores = cross_validate(
        estimator,
        X_train,
        y_train,
        return_train_score=True,
        scoring=list(score_types_reg.values()),
    )
    # One row per reported quantity, with 'mean' and 'std' columns.
    summary = pd.DataFrame(fold_scores).agg(['mean', 'std']).round(3).T
    cross_val_results_reg[model_name] = summary

In [74]:
# Cross-validate every candidate model; results land in cross_val_results_reg.
for model_name, model_type in models.items():
    corss_validate_result(model_name, model_type)

In [75]:
# Side-by-side comparison table: one (mean, std) column pair per model.
# The stored values are already DataFrames, so the mapping is passed directly
# and its keys become the outer column level.
pd.concat(cross_val_results_reg, axis=1)

Unnamed: 0_level_0,Baseline,Baseline,KNN Regressor,KNN Regressor,Ridge,Ridge,Random Forest Regressor,Random Forest Regressor,Lasso,Lasso
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
fit_time,0.021,0.008,0.061,0.007,0.075,0.003,50.08,1.459,6.656,2.664
score_time,0.003,0.001,0.128,0.009,0.022,0.003,0.13,0.005,0.022,0.002
test_neg_mean_absolute_percentage_error,-3.328,1.748,-3.317,1.86,-3.404,1.821,-3.33,1.849,-3.412,1.822
train_neg_mean_absolute_percentage_error,-3.332,0.448,-2.999,0.409,-3.276,0.466,-1.214,0.161,-3.274,0.466
test_r2,-0.002,0.001,0.044,0.036,0.149,0.065,0.143,0.066,0.145,0.067
train_r2,0.0,0.0,0.364,0.014,0.24,0.016,0.876,0.004,0.241,0.016
