In [8]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [9]:
cols_to_choose = [
    'MainBranch',
    'Employment',
    'RemoteWork',
    'EdLevel',
    'YearsCode',
    'YearsCodePro',
    'DevType',
    'OrgSize',
    'Country',
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
    'OpSysProfessional use',
    'VersionControlSystem',
    'VCInteraction',
    'OfficeStackAsyncHaveWorkedWith',
    'Age',
    'WorkExp',
    'ICorPM',
    'ConvertedCompYearly']

In [10]:
# order for ordinal columns
education_order = ['Something else', 'Primary/elementary school', 'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)',
                   'Some college/university study without earning a degree', 'Associate degree (A.A., A.S., etc.)', "Bachelor’s degree (B.A., B.S., B.Eng., etc.)", 
                   "Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",'Professional degree (JD, MD, etc.)', 'Other doctoral degree (Ph.D., Ed.D., etc.)']

age_order = ['Prefer not to say', 'Under 18 years old', '18-24 years old','25-34 years old','35-44 years old','45-54 years old', '55-64 years old','65 years or older']

In [11]:
numeric_cols = ['YearsCode', 'YearsCodePro', 'WorkExp']

ordinal_edu = ['EdLevel']

ordinal_age = ['Age']

binary_cols = ['MainBranch', 'Country']

categorical_cols = ['OrgSize', 'RemoteWork']

multianswer_cols = [
    'DevType',
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
    'OpSysProfessional use',
    'VCInteraction',
    'VersionControlSystem',
    'OfficeStackAsyncHaveWorkedWith',
    'Employment']


passthrough_cols = ['ConvertedCompYearly']

drop_cols = ['ICorPM']

In [12]:
df_raw = pd.read_csv('data/raw/survey_results_public.csv')
north_america_data = df_raw.query("Country == 'United States of America' or Country == 'Canada'")
north_america_data = north_america_data[cols_to_choose]
north_america_data= north_america_data.query('ConvertedCompYearly < 500000')

In [13]:
# north_america_data['MainBranch'].unique()

In [14]:
def write_na_values_for_cols(df, list_of_cols, fillna_base_text = 'unspecified'):
    temp_df = df.copy()
    for col in list_of_cols:
        fillna_text_final = col + "_" + fillna_base_text
        temp_df[col] = temp_df[col].fillna(fillna_text_final)
        
    return temp_df

In [15]:
# Returns float values for different string inputs
def convert2float(x):
    if  x == 'More than 50 years' :
        return float(50)
    elif x == 'Less than 1 year':
        return float(0)
    else:
        return float(x)

In [16]:
def get_column_names_from_preporcessor(preprocessor):
    transformed_column_names = []
    for i in range(1,6):
        temp_names = preprocessor.named_transformers_['pipeline-'+str(i)].get_feature_names_out().tolist()
        transformed_column_names += temp_names

    for i in range(1,14):
        temp_names = preprocessor.named_transformers_['countvectorizer-'+str(i)].get_feature_names_out().tolist()
        # print(temp_names)
        for name in temp_names:
            name = multianswer_cols[i-1] + "_" + name
            transformed_column_names.append(name)

    transformed_column_names.append('ConvertedCompYearly')
    # print(transformed_column_names)

    return transformed_column_names

In [17]:
df_filtered = north_america_data
df_filtered = write_na_values_for_cols(df_filtered, multianswer_cols)

In [18]:
train_df_filtered, test_df_filtered = train_test_split(df_filtered, test_size=0.10, random_state=522)

train_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8558 entries, 15874 to 13613
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   MainBranch                      8558 non-null   object 
 1   Employment                      8558 non-null   object 
 2   RemoteWork                      8544 non-null   object 
 3   EdLevel                         8558 non-null   object 
 4   YearsCode                       8543 non-null   object 
 5   YearsCodePro                    8534 non-null   object 
 6   DevType                         8558 non-null   object 
 7   OrgSize                         8555 non-null   object 
 8   Country                         8558 non-null   object 
 9   LanguageHaveWorkedWith          8558 non-null   object 
 10  DatabaseHaveWorkedWith          8558 non-null   object 
 11  PlatformHaveWorkedWith          8558 non-null   object 
 12  WebframeHaveWorkedWith       

In [19]:
# train_df_filtered

In [20]:
# def multianswer_col_trans(df, col_name):
    
#     cv = CountVectorizer(tokenizer=lambda text: text.split(';'))
#     df[col_name] = df[col_name].fillna('unspecified')
#     sr_col_name = df[col_name]
#     cv.fit(sr_col_name)
#     columns = cv.get_feature_names_out()
#     df_encoded = pd.DataFrame(cv.transform(sr_col_name).toarray(),
#                         columns= columns,
#                         index = sr_col_name.index)
#     df_encoded = df_encoded.add_prefix(col_name + "_")
#     return df_encoded

In [21]:
# train_df_filtered['YearsCode'] = train_df_filtered['YearsCode'].replace(np.nan, train_df_filtered['YearsCode'].mode()[0])
# train_df_filtered['YearsCodePro'] = train_df_filtered['YearsCodePro'].replace(np.nan, train_df_filtered['YearsCodePro'].mode()[0])
# test_df_filtered['YearsCode'] = test_df_filtered['YearsCode'].replace(np.nan, test_df_filtered['YearsCode'].mode()[0])
# test_df_filtered['YearsCodePro'] = test_df_filtered['YearsCodePro'].replace(np.nan, test_df_filtered['YearsCodePro'].mode()[0])

In [22]:
# converts string year values to float
train_df_filtered['YearsCode'] = train_df_filtered['YearsCode'].apply(lambda x: convert2float(x))
train_df_filtered['YearsCodePro'] = train_df_filtered['YearsCodePro'].apply(lambda x: convert2float(x))
test_df_filtered['YearsCode'] = test_df_filtered['YearsCode'].apply(lambda x: convert2float(x))
test_df_filtered['YearsCodePro'] = test_df_filtered['YearsCodePro'].apply(lambda x: convert2float(x))

In [23]:
# replace nan values from the multianswer_cols
# train_df_filtered = write_na_values_for_cols(train_df_filtered, multianswer_cols)

In [24]:
# train_df_filtered = write_na_values_for_cols(train_df_filtered, ['MainBranch', 'RemoteWork', 'OrgSize'])

In [25]:
# train_df_filtered.query('YearsCode.isnull()')
# train_df_filtered.describe(include='all')

In [26]:
len(train_df_filtered['EdLevel'].unique())

9

In [27]:
len(test_df_filtered['EdLevel'].unique())

9

In [28]:
numeric_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), StandardScaler())

ordinal_edu_transformer = make_pipeline(OrdinalEncoder(categories=[education_order], dtype=int))

ordinal_age_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), OrdinalEncoder(categories=[age_order], dtype=int))

binary_transformer = make_pipeline(OneHotEncoder(drop='if_binary', handle_unknown='ignore', dtype=int))

categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse=False))

preprocessor = make_column_transformer(
            (numeric_transformer, numeric_cols),
            (ordinal_edu_transformer, ordinal_edu),
            (ordinal_age_transformer, ordinal_age),
            (binary_transformer, binary_cols),
            (categorical_transformer, categorical_cols),
            ('drop', drop_cols),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[0]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[1]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[2]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[3]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[4]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[5]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[6]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[7]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[8]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[9]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[10]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[11]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[12]),
            ('passthrough', passthrough_cols)
)

train_df_filtered_encode = preprocessor.fit_transform(train_df_filtered).todense()
train_df_filtered_encode

test_df_filtered_encode = preprocessor.fit_transform(test_df_filtered).todense()
test_df_filtered_encode

matrix([[ 6.16673213e-01,  1.00024528e+00, -4.80507331e-01, ...,
          1.00000000e+00,  0.00000000e+00,  2.00000000e+05],
        [-2.80460374e-01, -8.80640556e-01, -9.31605526e-01, ...,
          0.00000000e+00,  0.00000000e+00,  1.10000000e+05],
        [-1.17759396e+00, -9.79634548e-01, -4.80507331e-01, ...,
          0.00000000e+00,  0.00000000e+00,  1.00000000e+05],
        ...,
        [-2.80460374e-01, -1.87682615e-01, -4.80507331e-01, ...,
          0.00000000e+00,  0.00000000e+00,  9.00000000e+04],
        [ 1.15495336e+00,  1.09299360e-01,  1.96139962e-01, ...,
          0.00000000e+00,  0.00000000e+00,  1.40551000e+05],
        [-1.08788060e+00, -1.17762253e+00, -1.15715462e+00, ...,
          0.00000000e+00,  0.00000000e+00,  9.00000000e+04]])

In [29]:
# numeric_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), StandardScaler())

# ordinal_edu_transformer = make_pipeline(OrdinalEncoder(categories=[education_order], dtype=int))

# ordinal_age_transformer = make_pipeline(OrdinalEncoder(categories=[age_order], dtype=int))

# binary_transformer = make_pipeline(OneHotEncoder(drop='if_binary', handle_unknown='ignore', dtype=int))

# categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse=False))


# preprocessor = make_column_transformer(
#                (numeric_transformer, numeric_cols),
#                (ordinal_edu_transformer, ordinal_edu),
#                (ordinal_age_transformer, ordinal_age),
#                (binary_transformer, binary_cols),
#                (categorical_transformer, categorical_cols),
#                # ('passthrough', passthrough_cols),
#                ('drop', drop_cols),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[0]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[1]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[2]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[3]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[4]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[5]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[6]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[7]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[8]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[9]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[10]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[11]),
#                (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[12])
# )

In [30]:
# view preprocessor
preprocessor

In [31]:
X_train = train_df_filtered.drop(columns=["ConvertedCompYearly"])
y_train = train_df_filtered["ConvertedCompYearly"]

X_test = test_df_filtered.drop(columns=["ConvertedCompYearly"])
y_test = test_df_filtered["ConvertedCompYearly"]

In [32]:
# preprocessor.get_feature_names_out

In [33]:
# fit preprocessor with train df
train_df_filtered_encode = preprocessor.fit_transform(train_df_filtered).todense()

# creates list of new column names from preprocessing pipelines for train df
transformed_column_names_train = get_column_names_from_preporcessor(preprocessor)

In [34]:
print(len(transformed_column_names_train))
# transformed_column_names

266


In [36]:
train_enc = pd.DataFrame(
    data=train_df_filtered_encode, 
    index=train_df_filtered.index, 
    columns=transformed_column_names_train
)
train_enc

Unnamed: 0,YearsCode,YearsCodePro,WorkExp,EdLevel,Age,"MainBranch_I am not primarily a developer, but I write code sometimes as part of my work",Country_United States of America,"OrgSize_1,000 to 4,999 employees",OrgSize_10 to 19 employees,"OrgSize_10,000 or more employees",...,OfficeStackAsyncHaveWorkedWith_trello,OfficeStackAsyncHaveWorkedWith_wimi,OfficeStackAsyncHaveWorkedWith_workzone,OfficeStackAsyncHaveWorkedWith_wrike,"Employment_employed, full-time","Employment_employed, part-time",Employment_i prefer not to say,"Employment_independent contractor, freelancer, or self-employed",Employment_retired,ConvertedCompYearly
15874,-0.909207,-1.010687,-1.195821,5.0,2.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,130000.0
3051,-0.628173,-0.694879,-0.261810,5.0,3.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,148360.0
63736,-0.534495,-0.168532,-0.261810,5.0,3.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,78084.0
12895,-0.628173,-0.484340,-0.261810,5.0,3.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,74180.0
30849,-0.815529,-0.694879,-0.962318,5.0,2.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,75000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39712,-0.628173,-0.905417,-0.845567,6.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,95000.0
63757,-0.628173,-0.484340,-0.261810,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,80000.0
38135,1.058033,2.147394,-0.261810,5.0,6.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,130000.0
24543,-0.066104,-0.168532,-0.261810,5.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,300000.0


In [None]:
# fit preprocessor with test df
test_df_filtered_encode = preprocessor.fit_transform(test_df_filtered).todense()

# creates list of new column names from preprocessing pipelines for test df
transformed_column_names_test = get_column_names_from_preporcessor(preprocessor)

In [None]:
test_enc = pd.DataFrame(
    data=test_df_filtered_encode, 
    index=test_df_filtered.index, 
    columns=transformed_column_names_test
)
test_enc

In [None]:
# # fit and transform the columns that dont contain multiple answers per response

# transformed_train_df = pd.DataFrame(
#                        preprocessor.fit_transform(non_multianswer_train_df),
#                        columns = transformed_column_names)

# transformed_train_df

In [None]:
# # creates final transformed train df

# for column in multianswer_cols:
#     # count vectorizer for multi answer columns
#     temp_col = multianswer_col_trans(train_df_filtered, column)
#     temp_col.reset_index(drop=True, inplace=True)
#     transformed_train_df = pd.concat([transformed_train_df, temp_col], axis=1)

# transformed_train_df


# Feature Selection

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

In [None]:
xx

preprocessor = make_column_transformer(
            (numeric_transformer, numeric_cols),
            (ordinal_edu_transformer, ordinal_edu),
            (ordinal_age_transformer, ordinal_age),
            (binary_transformer, binary_cols),
            (categorical_transformer, categorical_cols),
            # ('passthrough', passthrough_cols),
            ('drop', drop_cols),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[0]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[1]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[2]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[3]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[4]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[5]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[6]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[7]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[8]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[9]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[10]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[11]),
            (CountVectorizer(tokenizer=lambda text: text.split(';')), multianswer_cols[12])
    )

In [None]:
# fit preprocessor with test df
X_train_encode = preprocessor.fit_transform(X_train).todense()

# creates list of new column names from preprocessing pipelines for test df
transformed_column_names = get_column_names_from_preporcessor(preprocessor)

X_train_enc = pd.DataFrame(
    data=X_train_encode, 
    index=X_train.index, 
    columns=transformed_column_names[:-1]
)
X_train_enc

In [None]:
select_lr = SelectFromModel(Ridge(), threshold="0.8*mean")

pipe_rf_model_based = make_pipeline(
    preprocessor, select_lr, RandomForestRegressor(random_state=16)
)
pipe_rf_model_based.fit(X_train, y_train)

In [None]:
model_based_mask = pipe_rf_model_based.named_steps["selectfrommodel"].get_support()
mb_selected_feats = X_train_enc.columns[model_based_mask]
fs = mb_selected_feats.tolist()

In [None]:
fs

# Model Selection

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score, cross_validate

In [None]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [None]:
cross_val_results_reg = {}
cross_val_results_reg_fs = {}

models = {
    "Baseline": DummyRegressor(),
    "KNN Regressor": KNeighborsRegressor(),
    "Ridge": Ridge(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Lasso": Lasso(),
}

score_types_reg = {
    #"neg_mean_squared_error": "neg_mean_squared_error",
    #"neg_root_mean_squared_error": "neg_root_mean_squared_error",
    "neg_mape": "neg_mean_absolute_percentage_error", 
    "r2": "r2",
}

In [None]:
def corss_validate_result(model_name, model_type, fs):
    pipe = make_pipeline(
        preprocessor,
        model_type
    )
    cross_val_results_reg[model_name] = pd.DataFrame(cross_validate(pipe,
                                                                    X_train,
                                                                    y_train, 
                                                                    return_train_score=True, 
                                                                    scoring=list(score_types_reg.values()))).agg(['mean', 'std']).round(3).T
    
    # cross_val_results_reg_fs[model_name] = pd.DataFrame(cross_validate(pipe,
    #                                                                    X_train[fs],
    #                                                                    y_train, 
    #                                                                    return_train_score=True, 
    #                                                                    scoring=list(score_types_reg.values()))).agg(['mean', 'std']).round(3).T

In [None]:
for model_item in models.items():
    model_name = model_item[0]
    model_type = model_item[1]
    corss_validate_result(model_name, model_type, fs)

In [None]:
pd.concat(
    {key: pd.DataFrame(value) for key, value in cross_val_results_reg.items()}, 
    axis=1
)

In [None]:
# pd.concat(
#     {key: pd.DataFrame(value) for key, value in cross_val_results_reg_fs.items()}, 
#     axis=1
# )

In [None]:
mean_std_cross_val_scores(
    pipe_rf_model_based, X_train, y_train, return_train_score=True
)