In [5]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
# TODO: this currently reads the pre-filtered CSV that Tanmay produced; update it
# to use the script that generates the filtered dataset.
filtered_data_path = 'data/processed/filtered_data.csv'
df_filtered = pd.read_csv(filtered_data_path)

# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
train_df_filtered, test_df_filtered = train_test_split(
    df_filtered,
    test_size=0.10,
    random_state=123,
)

# Summary statistics for every column (numeric and non-numeric) of the training split
train_df_filtered.describe(include='all')

Unnamed: 0.1,Unnamed: 0,MainBranch,Employment,RemoteWork,EdLevel,YearsCode,YearsCodePro,DevType,OrgSize,Country,...,ToolsTechHaveWorkedWith,NEWCollabToolsHaveWorkedWith,OpSysProfessional use,VersionControlSystem,VCInteraction,OfficeStackAsyncHaveWorkedWith,Age,WorkExp,ICorPM,ConvertedCompYearly
count,7237.0,7237,7237,7237,7237,7237.0,7237.0,7237,7237,7237,...,5782,7203,7015,7237,7153,5351,7237,5042.0,4997,7237.0
unique,,2,9,3,9,52.0,51.0,1937,10,2,...,413,2231,40,15,15,382,8,,2,
top,,I am a developer by profession,"Employed, full-time",Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",10.0,5.0,"Developer, full-stack","10,000 or more employees",United States of America,...,Docker,Visual Studio Code,macOS,Git,Command-line,Confluence;Jira Work Management,25-34 years old,,Independent contributor,
freq,,6601,6510,4448,4148,547.0,504.0,1110,1435,6186,...,532,692,2071,6410,1946,1149,3060,,4328,
mean,36105.992953,,,,,,,,,,...,,,,,,,,13.082309,,349265.1
std,21197.998856,,,,,,,,,,...,,,,,,,,9.994571,,1164343.0
min,11.0,,,,,,,,,,...,,,,,,,,0.0,,36.0
25%,15745.0,,,,,,,,,,...,,,,,,,,5.0,,97605.0
50%,39065.0,,,,,,,,,,...,,,,,,,,10.0,,140000.0
75%,53204.0,,,,,,,,,,...,,,,,,,,19.0,,195210.0


In [7]:
def multianswer_col_trans(df, col_name):
    """Count-encode a column whose cells hold several ';'-separated answers.

    Missing cells are treated as the single answer 'unspecified'. Every cell is
    split on ';' and passed through a CountVectorizer, so each distinct answer
    becomes one count column named '<col_name>_<answer>'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `col_name`. It is not modified.
    col_name : str
        Name of the multi-answer column to encode.

    Returns
    -------
    pandas.DataFrame
        One column per distinct answer, carrying the same index as `df`.
    """
    # Fill missing answers on a separate Series instead of writing the result
    # back into `df`: the caller's frame stays untouched, and no
    # SettingWithCopyWarning is raised when `df` is a train/test split.
    answers = df[col_name].fillna('unspecified')

    # token_pattern=None states explicitly that the default regex is unused
    # because a custom tokenizer is supplied (silences scikit-learn's warning).
    cv = CountVectorizer(tokenizer=lambda text: text.split(';'), token_pattern=None)
    counts = cv.fit_transform(answers)

    df_encoded = pd.DataFrame(counts.toarray(),
                              columns=cv.get_feature_names_out(),
                              index=answers.index)
    return df_encoded.add_prefix(col_name + "_")

In [8]:
# Maps a "years" survey answer (range label or numeric value) to a float
def convert2float(x):
    """Convert a years answer to a float.

    'More than 50 years' becomes 50.0 and 'Less than 1 year' becomes 0.0;
    any other value is passed to float().
    """
    boundary_labels = (
        ('More than 50 years', 50.0),
        ('Less than 1 year', 0.0),
    )
    for label, years in boundary_labels:
        if x == label:
            return years
    return float(x)

In [9]:

# Convert the string year values to floats.
# `assign` builds a new frame rather than writing columns into the object
# returned by train_test_split (which can raise pandas' SettingWithCopyWarning),
# and convert2float is passed directly instead of through a lambda wrapper.
train_df_filtered = train_df_filtered.assign(
    YearsCode=train_df_filtered['YearsCode'].apply(convert2float),
    YearsCodePro=train_df_filtered['YearsCodePro'].apply(convert2float),
)

In [10]:
# Category orderings handed to the ordinal encoders: a category's position in
# the list is the order used for encoding.
education_order = [
    'Something else',
    'Primary/elementary school',
    'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
    'Some college/university study without earning a degree',
    'Associate degree (A.A., A.S., etc.)',
    "Bachelor’s degree (B.A., B.S., B.Eng., etc.)",
    "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",
    'Professional degree (JD, MD, etc.)',
    'Other doctoral degree (Ph.D., Ed.D., etc.)',
]

age_order = [
    'Prefer not to say',
    'Under 18 years old',
    '18-24 years old',
    '25-34 years old',
    '35-44 years old',
    '45-54 years old',
    '55-64 years old',
    '65 years or older',
]

In [11]:
# Column groups for preprocessing

# Columns whose cells hold several ';'-separated answers (count-vectorized separately)
multianswer_cols = [
    'DevType',
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
    'OpSysProfessional use',
    'VCInteraction',
    'VersionControlSystem',
    'OfficeStackAsyncHaveWorkedWith',
    'Employment',
]

# Imputed and scaled
numeric_cols = ['YearsCode', 'YearsCodePro', 'WorkExp']

# Ordinal-encoded, each with its own category order
ordinal_edu = ['EdLevel']
ordinal_age = ['Age']

# One-hot encoded
binary_cols = ['MainBranch', 'Country']
categorical_cols = ['OrgSize', 'RemoteWork']

# Kept unchanged by the column transformer
passthrough_cols = ['ConvertedCompYearly']

# Removed by the column transformer
drop_cols = ['ICorPM', 'Unnamed: 0']

# Numeric columns: fill missing values with the most frequent value, then standardize.
numeric_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), StandardScaler())

# Education level: integer codes following the positions in education_order.
ordinal_edu_transformer = make_pipeline(OrdinalEncoder(categories=[education_order], dtype=int))

# Age bracket: integer codes following the positions in age_order.
ordinal_age_transformer = make_pipeline(OrdinalEncoder(categories=[age_order], dtype=int))

# Two-category columns: drop='if_binary' keeps a single indicator column per feature.
binary_transformer = make_pipeline(OneHotEncoder(drop='if_binary', dtype=int))

# Multi-category columns: one indicator column per category, returned dense.
# NOTE(review): newer scikit-learn releases rename `sparse=` to `sparse_output=`;
# confirm against the scikit-learn version this project pins before upgrading.
categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse=False))


# Every encoder above is wrapped in make_pipeline on purpose: make_column_transformer
# then names the steps 'pipeline-1' ... 'pipeline-5' in the order listed here, and the
# cell that builds transformed_column_names looks the steps up by exactly those names
# and appends the passthrough column names last. Keep the order and wrapping in sync
# with that cell.
preprocessor = make_column_transformer(
               (numeric_transformer, numeric_cols),
               (ordinal_edu_transformer, ordinal_edu),
               (ordinal_age_transformer, ordinal_age),
               (binary_transformer, binary_cols),
               (categorical_transformer, categorical_cols),
               ('passthrough', passthrough_cols),
               ('drop', drop_cols)
               
)

In [12]:
# Display the assembled column transformer to inspect its steps and column assignments
preprocessor

In [13]:
# Columns that don't contain multiple answers per response
column_names = (numeric_cols + ordinal_edu + ordinal_age + binary_cols + categorical_cols + drop_cols + passthrough_cols)

# Subset of the training frame handled by the column transformer
non_multianswer_train_df = train_df_filtered[column_names]

# Fit only: the transformed matrix is not used in this cell, so fit() replaces
# a fit_transform() call whose result was thrown away.
preprocessor.fit(non_multianswer_train_df)

# Collect the output column names of every fitted pipeline, in transformer order.
# Iterating over the fitted 'pipeline-*' entries (instead of a hard-coded count)
# keeps this cell correct if a pipeline is added to or removed from the preprocessor.
transformed_column_names = []

for step_name, fitted_step in preprocessor.named_transformers_.items():
    if step_name.startswith('pipeline-'):
        transformed_column_names += fitted_step.get_feature_names_out().tolist()

# Passthrough columns are emitted after the pipelines and keep their original names
transformed_column_names += passthrough_cols

In [14]:
# Transform the columns that don't contain multiple answers per response.
# The preprocessor is already fitted by the cell that collects
# transformed_column_names, so transform() is used instead of refitting it;
# the matrix and its column names then come from the same fitted state.
transformed_train_df = pd.DataFrame(
    preprocessor.transform(non_multianswer_train_df),
    columns=transformed_column_names,
)

transformed_train_df

Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",...,OrgSize_2 to 9 employees,OrgSize_20 to 99 employees,"OrgSize_5,000 to 9,999 employees",OrgSize_500 to 999 employees,OrgSize_I don’t know,"OrgSize_Just me - I am a freelancer, sole proprietor, etc.",RemoteWork_Full in-person,RemoteWork_Fully remote,"RemoteWork_Hybrid (some remote, some in-person)",ConvertedCompYearly
0,-0.064760,0.046266,-0.017426,4.0,3.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,71028.0
1,0.782739,0.892316,0.928093,5.0,4.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,97600.0
2,1.253571,1.315341,-0.253805,6.0,6.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,250000.0
3,2.101069,2.478659,2.582751,5.0,6.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,98000.0
4,-1.383091,-1.117053,-1.199324,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,45913.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7232,-1.100591,-1.011297,-1.081134,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,130000.0
7233,-0.535592,-0.165247,-0.253805,5.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,78084.0
7234,0.311906,0.469291,0.455334,5.0,4.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,130000.0
7235,-0.253093,-0.165247,-0.017426,6.0,3.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,200000.0


In [15]:
# Creates the final transformed train df.

# Count-vectorize every multi-answer column. Each encoded frame gets a fresh
# positional index so its rows line up with transformed_train_df, which was built
# from an array and therefore has a default RangeIndex.
encoded_multianswer_frames = [
    multianswer_col_trans(train_df_filtered, column).reset_index(drop=True)
    for column in multianswer_cols
]

# Concatenate once instead of growing the frame inside a loop, and start from the
# preprocessor's own columns so that re-running this cell does not append a second
# copy of the encoded columns.
transformed_train_df = pd.concat(
    [transformed_train_df[transformed_column_names], *encoded_multianswer_frames],
    axis=1,
)

transformed_train_df




Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",...,OfficeStackAsyncHaveWorkedWith_swit,OfficeStackAsyncHaveWorkedWith_trello,OfficeStackAsyncHaveWorkedWith_unspecified,OfficeStackAsyncHaveWorkedWith_wimi,OfficeStackAsyncHaveWorkedWith_workzone,OfficeStackAsyncHaveWorkedWith_wrike,"Employment_employed, full-time","Employment_employed, part-time","Employment_independent contractor, freelancer, or self-employed",Employment_retired
0,-0.064760,0.046266,-0.017426,4.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,0,0,0
1,0.782739,0.892316,0.928093,5.0,4.0,0.0,0.0,1.0,0.0,0.0,...,0,0,1,0,0,0,1,0,0,0
2,1.253571,1.315341,-0.253805,6.0,6.0,0.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,1,0,0,0
3,2.101069,2.478659,2.582751,5.0,6.0,0.0,1.0,0.0,1.0,0.0,...,0,0,1,0,0,0,1,0,0,0
4,-1.383091,-1.117053,-1.199324,3.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7232,-1.100591,-1.011297,-1.081134,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,0,0,0
7233,-0.535592,-0.165247,-0.253805,5.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,0,0,0
7234,0.311906,0.469291,0.455334,5.0,4.0,1.0,1.0,0.0,0.0,1.0,...,0,1,0,0,0,0,1,0,0,0
7235,-0.253093,-0.165247,-0.017426,6.0,3.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0


In [16]:
# Separate the regression target from the feature matrix
target_column = "ConvertedCompYearly"
y_train = transformed_train_df[target_column]
X_train = transformed_train_df.drop(columns=[target_column])

# Feature Selection

In [17]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

In [18]:
# Pass every feature through unchanged; X_train is already fully preprocessed.
preprocessor_fs = make_column_transformer(
    ('passthrough', X_train.columns)
)

# Model-based selection driven by a Ridge regression, using the "0.8*mean"
# importance threshold.
select_lr = SelectFromModel(Ridge(), threshold="0.8*mean")

pipe_rf_model_based = make_pipeline(
    preprocessor_fs, select_lr, RandomForestRegressor(random_state=16)
)

# Fitting the pipeline fits preprocessor_fs as its first step, so the separate
# preprocessor_fs.fit_transform(X_train) call (whose result was discarded) is
# not needed.
pipe_rf_model_based.fit(X_train, y_train)

In [19]:
# Names of the features kept by the SelectFromModel step of the fitted pipeline
selector_step = pipe_rf_model_based.named_steps["selectfrommodel"]
model_based_mask = selector_step.get_support()
mb_selected_feats = X_train.columns[model_based_mask]
fs = mb_selected_feats.tolist()

# Model Selection

In [20]:
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, cross_validate

In [35]:
# Per-model cross-validation summaries, filled in by corss_validate_result
cross_val_results_reg = {}

# Candidate regressors, keyed by the label shown in the results table
models = {
    "Baseline": DummyRegressor(),
    "KNN Regressor": KNeighborsRegressor(),
    "Ridge": Ridge(),
    # Seeded (same seed as the feature-selection forest) so the cross-validation
    # scores are reproducible between runs
    "Random Forest Regressor": RandomForestRegressor(random_state=16),
}

# scikit-learn scorer names; only the values are passed to cross_validate
score_types_reg = {
    "neg_mean_squared_error": "neg_mean_squared_error",
    "neg_root_mean_squared_error": "neg_root_mean_squared_error",
    "neg_mape": "neg_mean_absolute_percentage_error",
    "r2": "r2",
}

In [28]:
def corss_validate_result(model_name, model_type):
    """Cross-validate one estimator and store a mean/std summary of its scores.

    The estimator is wrapped in a single-step pipeline and evaluated with
    cross_validate on the notebook-level X_train / y_train, using every scorer
    listed in score_types_reg and including training scores. The per-fold
    results are reduced to their mean and standard deviation, rounded to three
    decimals, transposed so each metric is a row, and saved under
    cross_val_results_reg[model_name].

    Parameters
    ----------
    model_name : str
        Key under which the summary is stored.
    model_type : estimator
        Estimator instance to evaluate.

    Returns
    -------
    None
    """
    scorers = list(score_types_reg.values())
    estimator_pipe = make_pipeline(model_type)

    fold_scores = cross_validate(
        estimator_pipe,
        X_train,
        y_train,
        return_train_score=True,
        scoring=scorers,
    )

    summary = pd.DataFrame(fold_scores).agg(['mean', 'std']).round(3).T
    cross_val_results_reg[model_name] = summary

In [36]:
# Cross-validate every candidate model; each summary is stored in cross_val_results_reg
for model_name, model_type in models.items():
    corss_validate_result(model_name, model_type)

In [37]:
# Side-by-side comparison table: one top-level column group per model
summary_by_model = {
    name: pd.DataFrame(summary) for name, summary in cross_val_results_reg.items()
}
pd.concat(summary_by_model, axis=1)

Unnamed: 0_level_0,Baseline,Baseline,KNN Regressor,KNN Regressor,Ridge,Ridge,Random Forest Regressor,Random Forest Regressor
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
fit_time,0.017,0.001,0.064,0.002,0.076,0.004,68.78,2.164
score_time,0.004,0.0,0.146,0.01,0.024,0.003,0.154,0.008
test_neg_mean_squared_error,-1355838000000.0,239283600000.0,-1545849000000.0,262170700000.0,-1437829000000.0,256000000000.0,-1544693000000.0,281564400000.0
train_neg_mean_squared_error,-1355476000000.0,59826450000.0,-1014057000000.0,52278300000.0,-1277730000000.0,59035430000.0,-213380500000.0,12776090000.0
test_neg_root_mean_squared_error,-1160710.0,103622.2,-1239658.0,106636.6,-1195164.0,108465.3,-1238555.0,115516.7
train_neg_root_mean_squared_error,-1164022.0,25687.54,-1006739.0,25847.47,-1130127.0,26039.88,-461762.3,13970.41
test_neg_mean_absolute_percentage_error,-8.912,4.364,-5.768,2.105,-7.223,5.428,-11.468,7.912
train_neg_mean_absolute_percentage_error,-8.925,1.139,-4.509,0.667,-7.703,1.044,-4.136,1.044
test_r2,-0.001,0.0,-0.142,0.026,-0.061,0.017,-0.139,0.043
train_r2,0.0,0.0,0.252,0.019,0.057,0.005,0.843,0.006
