In [157]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [158]:
cols_to_choose = [
    'MainBranch',
    'Employment',
    'RemoteWork',
    'EdLevel',
    'YearsCode',
    'YearsCodePro',
    'DevType',
    'OrgSize',
    'Country',
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
    'OpSysProfessional use',
    'VersionControlSystem',
    'VCInteraction',
    'OfficeStackAsyncHaveWorkedWith',
    'Age',
    'WorkExp',
    'ICorPM',
    'ConvertedCompYearly']

In [159]:
# order for ordinal columns
education_order = ['Something else', 'Primary/elementary school', 'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
                   'Some college/university study without earning a degree', 'Associate degree (A.A., A.S., etc.)', "Bachelor’s degree (B.A., B.S., B.Eng., etc.)", 
                   "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",'Professional degree (JD, MD, etc.)', 'Other doctoral degree (Ph.D., Ed.D., etc.)']

age_order = ['Prefer not to say', 'Under 18 years old', '18-24 years old','25-34 years old','35-44 years old','45-54 years old', '55-64 years old','65 years or older']

In [160]:
numeric_cols = ['YearsCode', 'YearsCodePro', 'WorkExp']

ordinal_edu = ['EdLevel']

ordinal_age = ['Age']

binary_cols = ['MainBranch', 'Country']

categorical_cols = ['OrgSize', 'RemoteWork']

multianswer_cols = [
    'DevType',
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
    'OpSysProfessional use',
    'VCInteraction',
    'VersionControlSystem',
    'OfficeStackAsyncHaveWorkedWith',
    'Employment']


passthrough_cols = ['ConvertedCompYearly']

drop_cols = ['ICorPM']

In [161]:
df_raw = pd.read_csv('data/raw/survey_results_public.csv')
north_america_data = df_raw.query("Country == 'United States of America' or Country == 'Canada'")
north_america_data = north_america_data[cols_to_choose]
north_america_data= north_america_data.query('ConvertedCompYearly < 500000')

In [162]:
# north_america_data['MainBranch'].unique()

In [163]:
df_filtered = north_america_data
df_filtered = write_na_values_for_cols(df_filtered, multianswer_cols)

In [164]:
train_df_filtered, test_df_filtered = train_test_split(df_filtered, test_size=0.10, random_state=522)

train_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8558 entries, 15874 to 13613
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   MainBranch                      8558 non-null   object 
 1   Employment                      8558 non-null   object 
 2   RemoteWork                      8544 non-null   object 
 3   EdLevel                         8558 non-null   object 
 4   YearsCode                       8543 non-null   object 
 5   YearsCodePro                    8534 non-null   object 
 6   DevType                         8558 non-null   object 
 7   OrgSize                         8555 non-null   object 
 8   Country                         8558 non-null   object 
 9   LanguageHaveWorkedWith          8558 non-null   object 
 10  DatabaseHaveWorkedWith          8558 non-null   object 
 11  PlatformHaveWorkedWith          8558 non-null   object 
 12  WebframeHaveWorkedWith       

In [165]:
# def multianswer_col_trans(df, col_name):
    
#     cv = CountVectorizer(tokenizer=lambda text: text.split(';'))
#     df[col_name] = df[col_name].fillna('unspecified')
#     sr_col_name = df[col_name]
#     cv.fit(sr_col_name)
#     columns = cv.get_feature_names_out()
#     df_encoded = pd.DataFrame(cv.transform(sr_col_name).toarray(),
#                         columns= columns,
#                         index = sr_col_name.index)
#     df_encoded = df_encoded.add_prefix(col_name + "_")
#     return df_encoded

In [166]:
def write_na_values_for_cols(df, list_of_cols, fillna_base_text = 'unspecified'):
    temp_df = df.copy()
    for col in list_of_cols:
        fillna_text_final = col + "_" + fillna_base_text
        temp_df[col] = temp_df[col].fillna(fillna_text_final)
        
    return temp_df

In [167]:
# Returns float values for different string inputs
def convert2float(x):
    if  x == 'More than 50 years' :
        return float(50)
    elif x == 'Less than 1 year':
        return float(0)
    else:
        return float(x)

In [193]:
def get_column_names_from_preporcessor(preprocessor):
    transformed_column_names = []
    for i in range(1,6):
        temp_names = preprocessor.named_transformers_['pipeline-'+str(i)].get_feature_names_out().tolist()
        transformed_column_names += temp_names

    for i in range(1,14):
        temp_names = preprocessor.named_transformers_['countvectorizer-'+str(i)].get_feature_names_out().tolist()
        # print(temp_names)
        for name in temp_names:
            name = multianswer_cols[i-1] + "_" + name
            transformed_column_names.append(name)

    transformed_column_names.append('ConvertedCompYearly')
    # print(transformed_column_names)

    return transformed_column_names

In [168]:
# train_df_filtered['YearsCode'] = train_df_filtered['YearsCode'].replace(np.nan, train_df_filtered['YearsCode'].mode()[0])
# train_df_filtered['YearsCodePro'] = train_df_filtered['YearsCodePro'].replace(np.nan, train_df_filtered['YearsCodePro'].mode()[0])
# test_df_filtered['YearsCode'] = test_df_filtered['YearsCode'].replace(np.nan, test_df_filtered['YearsCode'].mode()[0])
# test_df_filtered['YearsCodePro'] = test_df_filtered['YearsCodePro'].replace(np.nan, test_df_filtered['YearsCodePro'].mode()[0])

In [169]:
# converts string year values to float
train_df_filtered['YearsCode'] = train_df_filtered['YearsCode'].apply(lambda x: convert2float(x))
train_df_filtered['YearsCodePro'] = train_df_filtered['YearsCodePro'].apply(lambda x: convert2float(x))
test_df_filtered['YearsCode'] = test_df_filtered['YearsCode'].apply(lambda x: convert2float(x))
test_df_filtered['YearsCodePro'] = test_df_filtered['YearsCodePro'].apply(lambda x: convert2float(x))

In [170]:
# replace nan values from the multianswer_cols
# train_df_filtered = write_na_values_for_cols(train_df_filtered, multianswer_cols)

In [171]:
# train_df_filtered = write_na_values_for_cols(train_df_filtered, ['MainBranch', 'RemoteWork', 'OrgSize'])

In [172]:
# train_df_filtered.query('YearsCode.isnull()')
# train_df_filtered.describe(include='all')

In [191]:
len(train_df_filtered['EdLevel'].unique())

9

In [192]:
len(test_df_filtered['EdLevel'].unique())

9

In [174]:
numeric_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), StandardScaler())

ordinal_edu_transformer = make_pipeline(OrdinalEncoder(categories=[education_order], dtype=int))

ordinal_age_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), OrdinalEncoder(categories=[age_order], dtype=int))

binary_transformer = make_pipeline(OneHotEncoder(drop='if_binary', handle_unknown='ignore', dtype=int))

categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse=False))

preprocessor = make_column_transformer(
            (numeric_transformer, numeric_cols),
            (ordinal_edu_transformer, ordinal_edu),
            (ordinal_age_transformer, ordinal_age),
            (binary_transformer, binary_cols),
            (categorical_transformer, categorical_cols),
            ('passthrough', passthrough_cols),
            ('drop', drop_cols),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[0]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[1]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[2]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[3]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[4]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[5]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[6]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[7]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[8]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[9]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[10]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[11]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[12])
)

train_df_filtered_encode = preprocessor.fit_transform(train_df_filtered).todense()
train_df_filtered_encode

test_df_filtered_encode = preprocessor.fit_transform(test_df_filtered).todense()
test_df_filtered_encode

matrix([[ 0.61667321,  1.00024528, -0.48050733, ...,  0.        ,
          1.        ,  0.        ],
        [-0.28046037, -0.88064056, -0.93160553, ...,  0.        ,
          0.        ,  0.        ],
        [-1.17759396, -0.97963455, -0.48050733, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [-0.28046037, -0.18768262, -0.48050733, ...,  0.        ,
          0.        ,  0.        ],
        [ 1.15495336,  0.10929936,  0.19613996, ...,  0.        ,
          0.        ,  0.        ],
        [-1.0878806 , -1.17762253, -1.15715462, ...,  0.        ,
          0.        ,  0.        ]])

In [175]:
# numeric_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), StandardScaler())

# ordinal_edu_transformer = make_pipeline(OrdinalEncoder(categories=[education_order], dtype=int))

# ordinal_age_transformer = make_pipeline(OrdinalEncoder(categories=[age_order], dtype=int))

# binary_transformer = make_pipeline(OneHotEncoder(drop='if_binary', handle_unknown='ignore', dtype=int))

# categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse=False))


# preprocessor = make_column_transformer(
#                (numeric_transformer, numeric_cols),
#                (ordinal_edu_transformer, ordinal_edu),
#                (ordinal_age_transformer, ordinal_age),
#                (binary_transformer, binary_cols),
#                (categorical_transformer, categorical_cols),
#                # ('passthrough', passthrough_cols),
#                ('drop', drop_cols),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[0]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[1]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[2]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[3]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[4]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[5]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[6]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[7]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[8]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[9]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[10]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[11]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[12])
# )

In [176]:
# view preprocessor
preprocessor

In [177]:
X_train = train_df_filtered.drop(columns=["ConvertedCompYearly"])
y_train = train_df_filtered["ConvertedCompYearly"]

X_test = test_df_filtered.drop(columns=["ConvertedCompYearly"])
y_test = test_df_filtered["ConvertedCompYearly"]

In [178]:
# preprocessor.get_feature_names_out

In [194]:
# fit preprocessor with train df
train_df_filtered_encode = preprocessor.fit_transform(train_df_filtered).todense()

# creates list of new column names from preprocessing pipelines for train df
transformed_column_names_train = get_column_names_from_preporcessor(preprocessor)

In [195]:
print(len(transformed_column_names_train))
# transformed_column_names

266


In [196]:
train_enc = pd.DataFrame(
    data=train_df_filtered_encode, 
    index=train_df_filtered.index, 
    columns=transformed_column_names
)
train_enc

Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",...,OfficeStackAsyncHaveWorkedWith_trello,OfficeStackAsyncHaveWorkedWith_wimi,OfficeStackAsyncHaveWorkedWith_workzone,OfficeStackAsyncHaveWorkedWith_wrike,"Employment_employed, full-time","Employment_employed, part-time",Employment_i prefer not to say,"Employment_independent contractor, freelancer, or self-employed",Employment_retired,ConvertedCompYearly
15874,-0.909207,-1.010687,-1.195821,5.0,2.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3051,-0.628173,-0.694879,-0.261810,5.0,3.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
63736,-0.534495,-0.168532,-0.261810,5.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12895,-0.628173,-0.484340,-0.261810,5.0,3.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
30849,-0.815529,-0.694879,-0.962318,5.0,2.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39712,-0.628173,-0.905417,-0.845567,6.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
63757,-0.628173,-0.484340,-0.261810,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
38135,1.058033,2.147394,-0.261810,5.0,6.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
24543,-0.066104,-0.168532,-0.261810,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [197]:
# fit preprocessor with test df
test_df_filtered_encode = preprocessor.fit_transform(test_df_filtered).todense()

# creates list of new column names from preprocessing pipelines for test df
transformed_column_names_test = get_column_names_from_preporcessor(preprocessor)

In [199]:
test_enc = pd.DataFrame(
    data=test_df_filtered_encode, 
    index=test_df_filtered.index, 
    columns=transformed_column_names_test
)
test_enc

Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",...,OfficeStackAsyncHaveWorkedWith_smartsheet,OfficeStackAsyncHaveWorkedWith_stack overflow for teams,OfficeStackAsyncHaveWorkedWith_trello,OfficeStackAsyncHaveWorkedWith_workzone,OfficeStackAsyncHaveWorkedWith_wrike,"Employment_employed, full-time","Employment_employed, part-time","Employment_independent contractor, freelancer, or self-employed",Employment_retired,ConvertedCompYearly
26873,0.616673,1.000245,-0.480507,3.0,5.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
192,-0.280460,-0.880641,-0.931606,5.0,2.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
54295,-1.177594,-0.979635,-0.480507,5.0,2.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
26766,-0.818741,-0.781647,-0.818831,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
21538,-0.639314,-0.484665,-0.480507,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26943,0.616673,1.198233,-0.480507,5.0,5.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53816,0.885813,1.495215,1.774984,2.0,5.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14496,-0.280460,-0.187683,-0.480507,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
39042,1.154953,0.109299,0.196140,6.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [183]:
# # fit and transform the columns that dont contain multiple answers per response

# transformed_train_df = pd.DataFrame(
#                        preprocessor.fit_transform(non_multianswer_train_df),
#                        columns = transformed_column_names)

# transformed_train_df

In [184]:
# # creates final transformed train df

# for column in multianswer_cols:
#     # count vectorizer for multi answer columns
#     temp_col = multianswer_col_trans(train_df_filtered, column)
#     temp_col.reset_index(drop=True, inplace=True)
#     transformed_train_df = pd.concat([transformed_train_df, temp_col], axis=1)

# transformed_train_df


# Feature Selection

In [200]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

In [201]:
preprocessor = make_column_transformer(
            (numeric_transformer, numeric_cols),
            (ordinal_edu_transformer, ordinal_edu),
            (ordinal_age_transformer, ordinal_age),
            (binary_transformer, binary_cols),
            (categorical_transformer, categorical_cols),
            # ('passthrough', passthrough_cols),
            ('drop', drop_cols),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[0]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[1]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[2]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[3]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[4]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[5]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[6]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[7]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[8]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[9]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[10]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[11]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[12])
    )

In [205]:
# fit preprocessor with test df
X_train_encode = preprocessor.fit_transform(X_train).todense()

# creates list of new column names from preprocessing pipelines for test df
transformed_column_names = get_column_names_from_preporcessor(preprocessor)

X_train_enc = pd.DataFrame(
    data=X_train_encode, 
    index=X_train.index, 
    columns=transformed_column_names[:-1]
)
X_train_enc

Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",...,OfficeStackAsyncHaveWorkedWith_swit,OfficeStackAsyncHaveWorkedWith_trello,OfficeStackAsyncHaveWorkedWith_wimi,OfficeStackAsyncHaveWorkedWith_workzone,OfficeStackAsyncHaveWorkedWith_wrike,"Employment_employed, full-time","Employment_employed, part-time",Employment_i prefer not to say,"Employment_independent contractor, freelancer, or self-employed",Employment_retired
15874,-0.909207,-1.010687,-1.195821,5.0,2.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3051,-0.628173,-0.694879,-0.261810,5.0,3.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
63736,-0.534495,-0.168532,-0.261810,5.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12895,-0.628173,-0.484340,-0.261810,5.0,3.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
30849,-0.815529,-0.694879,-0.962318,5.0,2.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39712,-0.628173,-0.905417,-0.845567,6.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
63757,-0.628173,-0.484340,-0.261810,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
38135,1.058033,2.147394,-0.261810,5.0,6.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
24543,-0.066104,-0.168532,-0.261810,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [206]:
select_lr = SelectFromModel(Ridge(), threshold="0.8*mean")

pipe_rf_model_based = make_pipeline(
    preprocessor, select_lr, RandomForestRegressor(random_state=16)
)
pipe_rf_model_based.fit(X_train, y_train)

In [207]:
model_based_mask = pipe_rf_model_based.named_steps["selectfrommodel"].get_support()
mb_selected_feats = X_train_enc.columns[model_based_mask]
fs = mb_selected_feats.tolist()

In [208]:
fs

['YearsCode',
 'YearsCodePro',
 'Country_United States of America',
 'OrgSize_1,000 to 4,999 employees',
 'OrgSize_10 to 19 employees',
 'OrgSize_10,000 or more employees',
 'OrgSize_2 to 9 employees',
 'OrgSize_20 to 99 employees',
 'OrgSize_5,000 to 9,999 employees',
 'OrgSize_I don’t know',
 'OrgSize_Just me - I am a freelancer, sole proprietor, etc.',
 'OrgSize_nan',
 'RemoteWork_Full in-person',
 'RemoteWork_Fully remote',
 'RemoteWork_Hybrid (some remote, some in-person)',
 'RemoteWork_nan',
 'DevType_academic researcher',
 'DevType_blockchain',
 'DevType_data scientist or machine learning specialist',
 'DevType_designer',
 'DevType_developer, mobile',
 'DevType_developer, qa or test',
 'DevType_devops specialist',
 'DevType_devtype_unspecified',
 'DevType_educator',
 'DevType_engineer, site reliability',
 'DevType_engineering manager',
 'DevType_product manager',
 'DevType_project manager',
 'DevType_scientist',
 'DevType_senior executive (c-suite, vp, etc.)',
 'DevType_student'

# Model Selection

In [209]:
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score, cross_validate

In [210]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [211]:
cross_val_results_reg = {}
cross_val_results_reg_fs = {}

models = {
    "Baseline": DummyRegressor(),
    "KNN Regressor": KNeighborsRegressor(),
    "Ridge": Ridge(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Lasso": Lasso(),
}

score_types_reg = {
    #"neg_mean_squared_error": "neg_mean_squared_error",
    #"neg_root_mean_squared_error": "neg_root_mean_squared_error",
    "neg_mape": "neg_mean_absolute_percentage_error", 
    "r2": "r2",
}

In [212]:
def corss_validate_result(model_name, model_type, fs):
    pipe = make_pipeline(
        preprocessor,
        model_type
    )
    cross_val_results_reg[model_name] = pd.DataFrame(cross_validate(pipe,
                                                                    X_train,
                                                                    y_train, 
                                                                    return_train_score=True, 
                                                                    scoring=list(score_types_reg.values()))).agg(['mean', 'std']).round(3).T
    
    # cross_val_results_reg_fs[model_name] = pd.DataFrame(cross_validate(pipe,
    #                                                                    X_train[fs],
    #                                                                    y_train, 
    #                                                                    return_train_score=True, 
    #                                                                    scoring=list(score_types_reg.values()))).agg(['mean', 'std']).round(3).T

In [213]:
for model_item in models.items():
    model_name = model_item[0]
    model_type = model_item[1]
    corss_validate_result(model_name, model_type, fs)

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


In [214]:
pd.concat(
    {key: pd.DataFrame(value) for key, value in cross_val_results_reg.items()}, 
    axis=1
)

Unnamed: 0_level_0,Baseline,Baseline,KNN Regressor,KNN Regressor,Ridge,Ridge,Random Forest Regressor,Random Forest Regressor,Lasso,Lasso
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
fit_time,0.762,0.139,1.592,0.097,1.225,0.252,272.931,3.608,11.971,0.057
score_time,0.215,0.054,1.923,0.311,0.343,0.065,0.356,0.011,0.296,0.018
test_neg_mean_absolute_percentage_error,-23.073,36.162,-24.616,37.464,-22.684,37.116,-18.983,25.774,-22.502,36.674
train_neg_mean_absolute_percentage_error,-23.044,9.016,-16.648,5.903,-23.215,9.459,-7.262,2.476,-23.162,9.412
test_r2,-0.001,0.001,0.154,0.023,0.315,0.019,0.325,0.016,0.313,0.021
train_r2,0.0,0.0,0.436,0.01,0.377,0.004,0.906,0.001,0.377,0.004


In [215]:
# pd.concat(
#     {key: pd.DataFrame(value) for key, value in cross_val_results_reg_fs.items()}, 
#     axis=1
# )

In [216]:
mean_std_cross_val_scores(
    pipe_rf_model_based, X_train, y_train, return_train_score=True
)

fit_time       68.495 (+/- 4.179)
score_time      0.343 (+/- 0.053)
test_score      0.270 (+/- 0.024)
train_score     0.898 (+/- 0.008)
dtype: object