In [1]:
import os
import sys
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    average_precision_score, 
    auc
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.impute import SimpleImputer
from sklearn.metrics import ConfusionMatrixDisplay
from scipy.stats import expon, lognorm, loguniform, randint, uniform
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay
import altair as alt

In [2]:
# Code snippet copied from https://gist.github.com/jlln/338b4b0b55bd6984f883
def splitDataFrameList(df, target_column, separator):
    ''' Expand a delimited string column so that every element gets its own row.

    df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a new dataframe (fresh 0..n-1 index) with each entry for the target
    column separated, with each element moved into a new row.
    The values in the other columns are duplicated across the newly divided rows.

    Rows whose target value is not a string (e.g. NaN/None) are kept once,
    unchanged, instead of raising. An empty input returns an empty dataframe
    that still carries the columns of df.
    '''
    new_rows = []
    # Iterate over plain dict records rather than df.apply(axis=1): apply was
    # only being used for its side effect of filling the accumulator.
    for record in df.to_dict("records"):
        cell = record[target_column]
        if not isinstance(cell, str):
            # Missing / non-text value: nothing to split, keep the row as-is.
            new_rows.append(record)
            continue
        for piece in cell.split(separator):
            new_rows.append({**record, target_column: piece})

    if not new_rows:
        # Preserve the schema so downstream column selection still works.
        return pd.DataFrame(columns=df.columns)
    return pd.DataFrame(new_rows)

# Code copied from 573 lecture notes
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to cross_validate (e.g. cv, scoring,
        return_train_score)

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)", indexed by
        the keys returned by cross_validate
    """

    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair the values up by position with zip instead of mean_scores[i]:
    # integer keys on a string-labelled Series are treated as labels by
    # newer pandas, so positional [] access is deprecated there.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [25]:
# Load the raw public survey export; the following cells filter it down.
raw_survey_path = '../data/raw/survey_results_public.csv'
survey_data = pd.read_csv(raw_survey_path)

In [26]:
# Respondents in Canada who are employed full-time and are not students.
canada_full_time = survey_data.query(
    "Country == 'Canada' & Employment == 'Employed full-time' & Student == 'No'"
)

# Drop the two textual YearsCodePro buckets so the column can be cast to int.
numeric_experience = canada_full_time.query(
    "YearsCodePro != 'Less than 1 year' and YearsCodePro != 'More than 50 years'"
)

# Require experience, salary and developer type, then make experience an integer.
complete_rows = numeric_experience.dropna(
    subset=["YearsCodePro", "ConvertedComp", "DevType"]
).astype({"YearsCodePro": "int"})

# Trim the upper salary tail: keep rows strictly below the 92nd percentile.
salary_cutoff = complete_rows["ConvertedComp"].quantile(0.92)
below_cutoff = complete_rows.loc[complete_rows["ConvertedComp"] < salary_cutoff]

# Columns carried forward into modelling.
model_columns = ["DevType", "EdLevel", "YearsCodePro", "LanguageWorkedWith", "ConvertedComp"]

# One row per (respondent, DevType) pair; LanguageWorkedWith is left unsplit.
survey_data = splitDataFrameList(below_cutoff[model_columns], "DevType", ";")

In [27]:
# NOTE(review): survey_data holds one row per (respondent, DevType) after the
# DevType split in the preprocessing cell, so a plain row-wise split can put
# rows from the same respondent in both train and test -- confirm whether a
# grouped split is wanted.
train_df, test_df = train_test_split(survey_data, test_size=0.2, random_state=123)

# to_csv does not create parent directories; make sure the target exists so
# the cell also runs on a fresh checkout. Only the training split is saved.
os.makedirs("../data/preprocessed", exist_ok=True)
train_df.to_csv("../data/preprocessed/training.csv", index=False)

In [6]:
alt.data_transformers.enable('data_server')

# The salary column kept by the preprocessing cell is "ConvertedComp"; the
# "salary_log" column is no longer created (its assignment is commented out),
# so the charts must reference "ConvertedComp" to run on a fresh kernel.
density_plot = (
    alt.Chart(train_df)
    .transform_density(
        'ConvertedComp',
        as_=['ConvertedComp', 'density'],  # name the KDE output columns
    )
    .mark_area(interpolate='monotone')
    .encode(
        x=alt.X('ConvertedComp'),
        y='density:Q',
    )
)

# Mean salary for each number of years of professional coding.
mean_salary_by_experience = alt.Chart(train_df).mark_line().encode(
    x="YearsCodePro",
    y="mean(ConvertedComp)",
)

density_plot | mean_salary_by_experience

In [7]:
# The target is the "ConvertedComp" column kept by the preprocessing cell;
# train_df has no "salary_log" column, so referencing it raises KeyError on a
# fresh run. Dropping the target from X also keeps it out of the features.
X_train = train_df.drop(columns=["ConvertedComp"])
y_train = train_df["ConvertedComp"]
train_df.shape

(3748, 5)

In [8]:
# Peek at the first five target values.
y_train.head(n=5)

948     68705.0
4429    74048.0
1317    76339.0
80      51147.0
996     49600.0
Name: salary_log, dtype: float64

In [11]:
# Cross-validation summaries keyed by model name.
results = {}
numeric_features = ["YearsCodePro"]
categorical_features = ["DevType", "EdLevel", "LanguageWorkedWith"]

numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and 1.4 removed the old name. Try the new keyword first and
# fall back to the old one so the cell runs on either side of the rename.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    one_hot_encoder,
)

# NOTE(review): LanguageWorkedWith is one-hot encoded as a whole string here;
# presumably it is a ';'-joined list like DevType -- confirm this is intended.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features)
)

# Baseline: DummyRegressor behind the same preprocessing as the real models.
pipe = make_pipeline(preprocessor, DummyRegressor())
results["Dummy"] = mean_std_cross_val_scores(pipe, X_train, y_train, cv=5, return_train_score=True)

In [12]:
# Ridge regression behind the shared preprocessor, scored with the same
# 5-fold setup as the dummy baseline.
ridge_cv_options = dict(cv=5, return_train_score=True)
pipe_ridge = make_pipeline(preprocessor, Ridge())
results["Ridge"] = mean_std_cross_val_scores(
    pipe_ridge, X_train, y_train, **ridge_cv_options
)

In [13]:
# One column per model, one row per cross-validation metric.
pd.DataFrame.from_dict(results)

Unnamed: 0,Dummy,Ridge
fit_time,0.044 (+/- 0.028),0.081 (+/- 0.015)
score_time,0.015 (+/- 0.006),0.010 (+/- 0.001)
test_score,-0.002 (+/- 0.002),0.675 (+/- 0.030)
train_score,0.000 (+/- 0.000),0.815 (+/- 0.006)


### TODO
- Try different scoring metrics
- Try simple linear regression with one column
- Look for existing analyses of this survey by others online, e.g.
    https://towardsdatascience.com/lets-learn-from-the-stackoverflow-survey-7f3eaf7db4b6