In [77]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import train_test_split

In [78]:
# TODO: for now this reads the filtered file Tanmay made; update this cell to
# use the script that generates the filtered dataset.
df_filtered = pd.read_csv('data/processed/filtered_data.csv')

In [79]:
# Keep only the rows whose yearly compensation is strictly below 500000.
df_filtered = df_filtered.loc[df_filtered['ConvertedCompYearly'] < 500000]

In [80]:
# Hold out 10% of the rows as a test set (fixed seed for reproducibility).
# train_test_split hands back slices of df_filtered; take explicit copies so
# that later cells can assign columns on the training frame without pandas
# raising SettingWithCopyWarning.
train_df_filtered, test_df_filtered = (
    split.copy()
    for split in train_test_split(df_filtered, test_size=0.10, random_state=123)
)

train_df_filtered.describe(include='all')

Unnamed: 0.1,Unnamed: 0,MainBranch,Employment,RemoteWork,EdLevel,YearsCode,YearsCodePro,DevType,OrgSize,Country,...,ToolsTechHaveWorkedWith,NEWCollabToolsHaveWorkedWith,OpSysProfessional use,VersionControlSystem,VCInteraction,OfficeStackAsyncHaveWorkedWith,Age,WorkExp,ICorPM,ConvertedCompYearly
count,6753.0,6753,6753,6753,6753,6753.0,6753.0,6753,6753,6753,...,5406,6716,6547,6753,6673,5017,6753,4731.0,4689,6753.0
unique,,2,9,3,9,52.0,51.0,1807,10,2,...,409,2114,38,15,15,373,8,,2,
top,,I am a developer by profession,"Employed, full-time",Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",10.0,5.0,"Developer, full-stack",100 to 499 employees,United States of America,...,Docker,Visual Studio Code,macOS,Git,Command-line,Confluence;Jira Work Management,25-34 years old,,Independent contributor,
freq,,6161,6086,4181,3851,516.0,475.0,1013,1343,5756,...,514,645,1917,5981,1823,1071,2895,,4044,
mean,36264.240338,,,,,,,,,,...,,,,,,,,13.147749,,143988.600622
std,21212.800625,,,,,,,,,,...,,,,,,,,10.081693,,72454.997649
min,11.0,,,,,,,,,,...,,,,,,,,0.0,,36.0
25%,15729.0,,,,,,,,,,...,,,,,,,,5.0,,94500.0
50%,39243.0,,,,,,,,,,...,,,,,,,,10.0,,130000.0
75%,53463.0,,,,,,,,,,...,,,,,,,,19.0,,177500.0


In [81]:
def multianswer_col_trans(df, col_name):
    """Count-encode a column whose cells hold several ';'-separated answers.

    Missing cells are treated as the single answer 'unspecified'. Each
    distinct answer (lower-cased by CountVectorizer's defaults) becomes one
    output column named ``<col_name>_<answer>`` holding how many times that
    answer occurs in the row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``. It is only read, never modified.
    col_name : str
        Name of the multi-answer column to encode.

    Returns
    -------
    pandas.DataFrame
        Encoded columns, indexed like ``df``.
    """
    # fillna returns a new Series, so the caller's frame is left untouched
    # (assigning back into df mutated it and warned on sliced frames).
    answers = df[col_name].fillna('unspecified')

    # token_pattern=None: the custom tokenizer replaces the default regex, and
    # leaving the default in place makes scikit-learn warn that it is unused.
    cv = CountVectorizer(tokenizer=lambda text: text.split(';'), token_pattern=None)
    counts = cv.fit_transform(answers)

    df_encoded = pd.DataFrame(counts.toarray(),
                              columns=cv.get_feature_names_out(),
                              index=answers.index)
    return df_encoded.add_prefix(col_name + "_")

In [82]:
# Returns float values for different string inputs
def convert2float(x):
    """Convert a years-of-experience survey answer to a float.

    'More than 50 years' maps to 50.0 and 'Less than 1 year' maps to 0.0;
    any other value is passed to ``float`` unchanged.
    """
    # The two textual answers are capped at the ends of the numeric range.
    if x == 'More than 50 years':
        return 50.0
    if x == 'Less than 1 year':
        return 0.0
    return float(x)

In [83]:

# converts string year values to float
# .assign builds a new frame instead of writing columns into the split slice
# (which raises SettingWithCopyWarning); convert2float is passed directly, so
# no lambda wrapper is needed.
train_df_filtered = train_df_filtered.assign(
    YearsCode=train_df_filtered['YearsCode'].apply(convert2float),
    YearsCodePro=train_df_filtered['YearsCodePro'].apply(convert2float),
)

In [84]:
# Level orderings for the ordinal columns, listed from lowest to highest
# encoded value (position in the list is the integer the encoder assigns).
education_order = [
    'Something else',
    'Primary/elementary school',
    'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
    'Some college/university study without earning a degree',
    'Associate degree (A.A., A.S., etc.)',
    "Bachelor’s degree (B.A., B.S., B.Eng., etc.)",
    "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",
    'Professional degree (JD, MD, etc.)',
    'Other doctoral degree (Ph.D., Ed.D., etc.)',
]

age_order = [
    'Prefer not to say',
    'Under 18 years old',
    '18-24 years old',
    '25-34 years old',
    '35-44 years old',
    '45-54 years old',
    '55-64 years old',
    '65 years or older',
]

In [85]:
# Column groups: each list is routed to its own preprocessing step.

# cells hold several ';'-separated answers -> count-vectorized separately
multianswer_cols = [
    'DevType',
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
    'OpSysProfessional use',
    'VCInteraction',
    'VersionControlSystem',
    'OfficeStackAsyncHaveWorkedWith',
    'Employment',
]

# numeric features
numeric_cols = ['YearsCode', 'YearsCodePro', 'WorkExp']

# ordered categories (each has its own level ordering)
ordinal_edu = ['EdLevel']
ordinal_age = ['Age']

# two-level and multi-level nominal features
binary_cols = ['MainBranch', 'Country']
categorical_cols = ['OrgSize', 'RemoteWork']

# target, carried through the preprocessor unchanged
passthrough_cols = ['ConvertedCompYearly']

# columns excluded from the model
drop_cols = ['ICorPM', 'Unnamed: 0']

# Numeric features: fill missing values with the most frequent value of the
# column, then standardize.
# NOTE(review): 'most_frequent' is unusual for continuous columns; confirm it
# is intended rather than 'median'/'mean'.
numeric_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), StandardScaler())

# Ordinal features: each level is encoded as its position in the explicit
# order list supplied via `categories`.
ordinal_edu_transformer = make_pipeline(OrdinalEncoder(categories=[education_order], dtype=int))

ordinal_age_transformer = make_pipeline(OrdinalEncoder(categories=[age_order], dtype=int))

# Two-level features: drop='if_binary' keeps one indicator column per
# two-category feature.
binary_transformer = make_pipeline(OneHotEncoder(drop='if_binary', dtype=int))

# Multi-level nominal features: dense one-hot output; categories not seen
# during fit are ignored at transform time instead of raising.
categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse=False))


# NOTE: every transformer above is wrapped in make_pipeline, so
# make_column_transformer auto-names them 'pipeline-1' .. 'pipeline-5' in the
# order listed below. The column-name extraction loop later in this notebook
# looks them up by those names, so do not reorder or unwrap them without
# updating that loop.
preprocessor = make_column_transformer(
               (numeric_transformer, numeric_cols),
               (ordinal_edu_transformer, ordinal_edu),
               (ordinal_age_transformer, ordinal_age),
               (binary_transformer, binary_cols),
               (categorical_transformer, categorical_cols),
               ('passthrough', passthrough_cols),  # target column is forwarded unchanged
               ('drop', drop_cols)
               
)

In [86]:
# Display the preprocessor so its structure can be inspected.
preprocessor

In [87]:
# columns that dont contain multiple answers per response
column_names = (numeric_cols + ordinal_edu + ordinal_age + binary_cols + categorical_cols + drop_cols + passthrough_cols)

# subset train df
non_multianswer_train_df = train_df_filtered[column_names]

# fit preprocessor; only the fitted state is needed in this cell, so the
# transformed matrix is not computed and thrown away here
preprocessor.fit(non_multianswer_train_df)


# creates list of new column names from preprocessing pipelines
# ('pipeline-1' .. 'pipeline-5' are the names make_column_transformer gives
# the five wrapped transformers, in the order they were listed)
transformed_column_names = [
    feature_name
    for i in range(1, 6)
    for feature_name in preprocessor.named_transformers_['pipeline-' + str(i)].get_feature_names_out().tolist()
]

# the passthrough block is emitted after the pipelines, so its names go last
transformed_column_names += passthrough_cols

In [88]:
# transform the columns that dont contain multiple answers per response
# (the preprocessor was already fitted on this same frame in the previous
# cell, so it is only applied here rather than fitted a second time)

transformed_train_df = pd.DataFrame(
                       preprocessor.transform(non_multianswer_train_df),
                       columns = transformed_column_names)

transformed_train_df

Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",...,OrgSize_2 to 9 employees,OrgSize_20 to 99 employees,"OrgSize_5,000 to 9,999 employees",OrgSize_500 to 999 employees,OrgSize_I don’t know,"OrgSize_Just me - I am a freelancer, sole proprietor, etc.",RemoteWork_Full in-person,RemoteWork_Fully remote,"RemoteWork_Hybrid (some remote, some in-person)",ConvertedCompYearly
0,0.310712,-0.374310,-0.491277,8.0,4.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,277000.0
1,0.216894,0.681060,-0.257627,6.0,5.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,160000.0
2,-0.439831,-0.268773,-0.374452,5.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117126.0
3,-0.158377,-0.163236,-0.257627,5.0,3.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,150000.0
4,0.123076,-0.163236,-0.257627,6.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,105000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6748,0.310712,0.364449,1.377922,6.0,5.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,160000.0
6749,-0.158377,-0.374310,-0.257627,8.0,4.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,175000.0
6750,-1.002738,-0.901995,-0.257627,5.0,3.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,105000.0
6751,-0.439831,-0.268773,-0.257627,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,130000.0


In [89]:
# creates final transformed train df

# Count-vectorize every multi-answer column first, then append all encodings
# with a single concat (concatenating inside the loop re-copies the growing
# frame on every iteration).
multianswer_frames = [
    # reset to a positional index so rows line up with transformed_train_df,
    # which was built from an array and therefore has a default RangeIndex
    multianswer_col_trans(train_df_filtered, column).reset_index(drop=True)
    for column in multianswer_cols
]
transformed_train_df = pd.concat([transformed_train_df, *multianswer_frames], axis=1)

transformed_train_df




Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",...,OfficeStackAsyncHaveWorkedWith_swit,OfficeStackAsyncHaveWorkedWith_trello,OfficeStackAsyncHaveWorkedWith_unspecified,OfficeStackAsyncHaveWorkedWith_wimi,OfficeStackAsyncHaveWorkedWith_workzone,OfficeStackAsyncHaveWorkedWith_wrike,"Employment_employed, full-time","Employment_employed, part-time","Employment_independent contractor, freelancer, or self-employed",Employment_retired
0,0.310712,-0.374310,-0.491277,8.0,4.0,0.0,1.0,0.0,0.0,0.0,...,0,1,0,0,0,0,1,0,0,0
1,0.216894,0.681060,-0.257627,6.0,5.0,0.0,1.0,1.0,0.0,0.0,...,0,1,0,0,0,0,1,0,0,0
2,-0.439831,-0.268773,-0.374452,5.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
3,-0.158377,-0.163236,-0.257627,5.0,3.0,0.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
4,0.123076,-0.163236,-0.257627,6.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6748,0.310712,0.364449,1.377922,6.0,5.0,0.0,1.0,0.0,0.0,0.0,...,0,0,1,0,0,0,1,0,0,0
6749,-0.158377,-0.374310,-0.257627,8.0,4.0,0.0,1.0,0.0,0.0,0.0,...,0,0,1,0,0,0,1,0,0,0
6750,-1.002738,-0.901995,-0.257627,5.0,3.0,0.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,1,0,0,0
6751,-0.439831,-0.268773,-0.257627,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0


In [90]:
# Separate the target column from the feature matrix.
y_train = transformed_train_df.loc[:, "ConvertedCompYearly"]
X_train = transformed_train_df.drop("ConvertedCompYearly", axis=1)

# Feature Selection

In [91]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

In [92]:
# Identity column transformer: forwards every column of X_train unchanged.
preprocessor_fs = make_column_transformer(
    ('passthrough', X_train.columns)
)

# Model-based selection: keep features whose Ridge importance is at least
# 0.8 x the mean importance.
select_lr = SelectFromModel(Ridge(), threshold="0.8*mean")

pipe_rf_model_based = make_pipeline(
    preprocessor_fs, select_lr, RandomForestRegressor(random_state=16)
)
# Fitting the pipeline fits preprocessor_fs as its first step, so a separate
# preprocessor_fs.fit_transform(X_train) call beforehand is unnecessary.
pipe_rf_model_based.fit(X_train, y_train)

In [93]:
# Boolean mask of the columns kept by the SelectFromModel step, then their names.
model_based_mask = pipe_rf_model_based.named_steps.selectfrommodel.get_support()
mb_selected_feats = X_train.columns[model_based_mask]
# plain Python list of the selected feature names
fs = mb_selected_feats.tolist()

# Model Selection

In [94]:
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score, cross_validate

In [95]:
# Per-model cross-validation summaries, keyed by display name.
cross_val_results_reg = {}

# Candidate regressors to compare.
models = {
    "Baseline": DummyRegressor(),
    "KNN Regressor": KNeighborsRegressor(),
    "Ridge": Ridge(),
    # fixed seed so repeated runs reproduce the same scores (same seed as the
    # forest used in the feature-selection pipeline)
    "Random Forest Regressor": RandomForestRegressor(random_state=16),
    "Lasso": Lasso(),
}

# Scorers passed to cross_validate (values are scikit-learn scoring names).
score_types_reg = {
    # uncomment to also report squared-error metrics:
    #"neg_mean_squared_error": "neg_mean_squared_error",
    #"neg_root_mean_squared_error": "neg_root_mean_squared_error",
    "neg_mape": "neg_mean_absolute_percentage_error", 
    "r2": "r2",
}

In [96]:
def corss_validate_result(model_name, model_type):
    """Cross-validate one estimator and record a mean/std summary of its scores.

    The estimator is wrapped in a pipeline and cross-validated on the
    module-level ``X_train`` / ``y_train`` with every scorer listed in
    ``score_types_reg``. The per-fold results are reduced to their mean and
    standard deviation (rounded to 3 decimals, one row per metric) and stored
    in ``cross_val_results_reg[model_name]``. Nothing is returned.
    """
    estimator_pipe = make_pipeline(model_type)
    scorers = list(score_types_reg.values())

    fold_scores = cross_validate(
        estimator_pipe,
        X_train,
        y_train,
        scoring=scorers,
        return_train_score=True,
    )

    # rows = metrics, columns = ('mean', 'std')
    summary = pd.DataFrame(fold_scores).agg(['mean', 'std']).round(3).T
    cross_val_results_reg[model_name] = summary

In [97]:
# Cross-validate every candidate model; results land in cross_val_results_reg.
for model_name, model_type in models.items():
    corss_validate_result(model_name, model_type)

In [98]:
# Side-by-side comparison: one (mean, std) column pair per model.
pd.concat(
    [pd.DataFrame(summary) for summary in cross_val_results_reg.values()],
    axis=1,
    keys=list(cross_val_results_reg.keys()),
)

Unnamed: 0_level_0,Baseline,Baseline,KNN Regressor,KNN Regressor,Ridge,Ridge,Random Forest Regressor,Random Forest Regressor,Lasso,Lasso
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
fit_time,0.018,0.009,0.063,0.006,0.078,0.002,47.773,1.894,2.209,0.955
score_time,0.002,0.001,0.125,0.022,0.022,0.002,0.135,0.008,0.024,0.003
test_neg_mean_absolute_percentage_error,-4.119,2.932,-4.306,3.396,-4.518,3.816,-4.098,3.127,-4.527,3.843
train_neg_mean_absolute_percentage_error,-4.114,0.724,-3.537,0.75,-4.283,0.847,-1.571,0.278,-4.292,0.856
test_r2,-0.001,0.001,0.17,0.02,0.303,0.026,0.325,0.012,0.301,0.027
train_r2,0.0,0.0,0.442,0.002,0.38,0.005,0.905,0.001,0.381,0.005
