In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_categorical
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, make_scorer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import lightgbm as lgm
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import scipy
from scipy.cluster import hierarchy as hc

In [3]:
# Helper functions used throughout the notebook

def mad(x, y):
    """Return the mean absolute deviation between ``x`` and ``y``.

    The element-wise absolute differences ``abs(x - y)`` are added up with the
    builtin ``sum`` and divided by ``len(x)``, so both arguments must support
    element-wise subtraction (e.g. numpy arrays or pandas Series).
    """
    absolute_deviations = abs(x - y)
    return sum(absolute_deviations) / len(x)

def handle_multi_string_columns(df, column, single_strings):
    '''
    Replaces a column whose fields contain several strings with new indicator
    columns. Each new column represents a single string and holds 1 where that
    string occurs in the row's field and 0 otherwise.

    The check is a plain, case-sensitive substring test on ``str(value)``, so a
    missing value is searched as the text 'nan' (or 'None').

    INPUT:
    df - the pandas dataframe you want to search (it is not modified)
    column - the column name you want to look through
    single_strings - a list of strings you want to search for in each row of df[column]

    OUTPUT:
    new_df - The dataframe without the multi-string column but with the newly created
             columns, named "<column>_<position of the string in single_strings>"
    col_dict - Dictionary translating names of the new columns to their corresponding string
    '''

    # Stringify every cell once and walk the column by position. The previous
    # df[column][idx] lookup was label-based and therefore only worked when the
    # index happened to be 0..n-1.
    cell_texts = [str(value) for value in df[column]]

    # new column name -> list of 0/1 flags, one per row
    new_columns = dict()

    # new column name -> string it stands for
    col_dict = dict()

    for counter, string in enumerate(single_strings):
        col_name = column + "_" + str(counter)
        new_columns[col_name] = [1 if string in text else 0 for text in cell_texts]
        col_dict[col_name] = string

    new_df = df.drop(column, axis=1)

    # Re-use df's index so the indicator columns line up row by row.
    new_df = pd.concat([new_df, pd.DataFrame(data=new_columns, index=df.index, dtype=int)], axis=1)

    return new_df, col_dict


def rf_feat_importance(m, df):
    """Rank the columns of ``df`` by the feature importances of the fitted model ``m``.

    Returns a DataFrame with the columns 'cols' (taken from ``df.columns``) and
    'imp' (taken from ``m.feature_importances_``), sorted by 'imp' in
    descending order.
    """
    importance_table = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importance_table.sort_values(by='imp', ascending=False)

def plot_fi(fi):
    """Draw a horizontal bar chart of feature importances.

    ``fi`` is expected to provide the columns 'cols' and 'imp' (as produced by
    ``rf_feat_importance``). The result of ``fi.plot`` is returned unchanged.
    """
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12, 7), legend=False)

def mean_absolute_error_2(y_true, y_pred):
    """Mean absolute error of ``y_pred`` against ``y_true`` after rounding.

    The predictions are rounded with ``np.round`` first, so the score measures
    the error of the nearest whole-number prediction.
    """
    rounded_predictions = np.round(y_pred)
    return mean_absolute_error(y_true, rounded_predictions)


def classification(y_train, y_test_preds):
    """Map continuous predictions onto the class labels 0-4.

    The range [min(y_train), max(y_train)] is cut into five consecutive
    intervals. The width of the interval for labels 0-3 is proportional to the
    share of that label in ``y_train``; label 4 takes everything up to
    max(y_train). Every prediction receives the label of the first interval
    whose upper bound it does not exceed.

    INPUT:
    y_train - training labels (array-like of the codes 0..4)
    y_test_preds - continuous predictions to convert (array-like)

    OUTPUT:
    A new numpy array holding the labels; ``y_test_preds`` itself is left
    untouched. If a prediction lies above max(y_train) (or is NaN), the message
    "An error occurred!" is printed and conversion stops: that element and all
    following ones keep their original value.
    """
    y_train = np.asarray(y_train)
    y_train_min = np.min(y_train)
    y_train_max = np.max(y_train)
    total_distance = y_train_max - y_train_min
    n_train = len(y_train)

    # Cumulative upper bounds of the intervals for labels 0..3, then the
    # training maximum as the bound for label 4.
    upper_bounds = []
    bound = y_train_min
    for label in range(4):
        bound = bound + total_distance * np.sum(y_train == label) / n_train
        upper_bounds.append(bound)
    upper_bounds.append(y_train_max)

    # Work on a copy so the caller's predictions are not overwritten.
    labels = np.array(y_test_preds)
    for i, value in enumerate(labels):
        for label, upper in enumerate(upper_bounds):
            if value <= upper:
                labels[i] = label
                break
        else:
            # No interval matched: the prediction is outside the training range.
            print("An error occurred!")
            break

    # Return once all predictions are processed. Previously the return sat
    # inside the loop, so only the first prediction was ever classified.
    return labels

In [5]:
# Survey answers; low_memory=False makes pandas parse the file in one pass
# instead of in chunks.
df_raw = pd.read_csv(
    'data/survey_results_public.csv',
    low_memory=False,
)

# Companion schema file (loaded for reference).
schema = pd.read_csv(
    'data/survey_results_schema.csv',
)

In [6]:
# Build the modelling frame from the raw answers in a single chain:
#   1. keep rows that have a JobSat answer (the prediction target),
#   2. keep full-time / part-time employees who are not students,
#   3. drop the columns that are not used as features,
#   4. keep rows with at least 45 non-missing values,
#   5. renumber the rows 0..n-1.
df = (
    df_raw
    .dropna(subset=['JobSat'])
    .loc[lambda d: d.Employment.isin(['Employed full-time', 'Employed part-time'])]
    .loc[lambda d: d.Student == 'No']
    .drop(columns=[
        'CareerSat', 'Respondent', 'ResumeUpdate', 'CurrencySymbol', 'CurrencyDesc', 'CompTotal',
        'SurveyEase', 'SurveyLength', 'SONewContent', 'WelcomeChange', 'SOComm', 'EntTeams',
        'SOVisit1st', 'SOVisitFreq', 'SOVisitTo', 'SOFindAnswer', 'SOTimeSaved', 'SOHowMuchTime',
        'SOAccount', 'SOPartFreq', 'SOJobs', 'LanguageWorkedWith', 'LanguageDesireNextYear',
        'DatabaseWorkedWith', 'DatabaseDesireNextYear', 'PlatformWorkedWith',
        'PlatformDesireNextYear', 'WebFrameWorkedWith', 'WebFrameDesireNextYear',
        'MiscTechWorkedWith', 'MiscTechDesireNextYear', 'DevEnviron', 'Student', 'Ethnicity',
        'JobSeek', 'MgrIdiot',
    ])
    .dropna(thresh=45)
    .reset_index(drop=True)
)

In [7]:
# Answer options to look for in each multi-answer survey column.
# handle_multi_string_columns tests every option with a plain, case-sensitive
# substring check against the cell text, so each option below has to be spelled
# exactly as it appears in the data.

# Options searched for in the 'EduOther' column.
EduOther_strings = ['Taken an online course in programming or software development (e.g. a MOOC)','Participated in a fulltime developer training program or bootcamp',
                    'Taken a part-time in-person course in programming or software development','Completed an industry certification program (e.g. MCPD)',
                    'Received on-the-job training in software development','Taught yourself a new language, framework, or tool without taking a formal course',
                    'Participated in online coding competitions (e.g. HackerRank, CodeChef, TopCoder)','Participated in a hackathon','Contributed to open source software',
                    'None of these']

# Options searched for in the 'DevType' column.
DevType_strings = ['Academic researcher','Data or business analyst','Data scientist or machine learning specialist','Database administrator','Designer','Developer, back-end',
                   'Developer, desktop or enterprise applications','Developer, embedded applications or devices','Developer, front-end','Developer, full-stack',
                   'Developer, game or graphics','Developer, mobile','Developer, QA or test','DevOps specialist','Educator','Engineer, data','Engineer, site reliability',
                   'Engineering manager','Marketing or sales professional','Product manager','Scientist',"Senior Executive (C-Suite, VP, etc.)", 'Student', 'System administrator']

# Options searched for in the 'LastInt' column.
LastInt_strings = ['Write any code','Write code by hand (e.g., on a whiteboard)','Complete a take-home project','Solve a brain-teaser style puzzle','Interview with people in peer roles',
                   'Interview with people in senior / management roles']

# Options searched for in the 'WorkChallenge' column.
WorkChallenge_strings = ['Distracting work environment','Being tasked with non-development work','Meetings','Time spent commuting','Not enough people for the workload',
                         'Toxic work environment','Inadequate access to necessary tools','Lack of support from management','Non-work commitments (parenting, school work, hobbies, etc.)']

# Options searched for in the 'JobFactors' column.
JobFactors_strings = ['Diversity of the company or organization',"Languages, frameworks, and other technologies I'd be working with","Industry that I'd be working in",
                      'How widely used or impactful my work output would be',"Specific department or team I'd be working on",'Flex time or a flexible schedule',
                      'Remote work options','Financial performance or funding status of the company or organization','Office environment or company culture',
                      'Opportunities for professional development']

# Replace each multi-answer column by one 0/1 column per option. The returned
# dict_* maps every new column name ("<column>_<n>") back to its option text.
df, dict_EduOther = handle_multi_string_columns(df,"EduOther",EduOther_strings)
df, dict_DevType = handle_multi_string_columns(df,"DevType",DevType_strings)
df, dict_LastInt = handle_multi_string_columns(df,"LastInt",LastInt_strings)
df, dict_WorkChallenge = handle_multi_string_columns(df,"WorkChallenge",WorkChallenge_strings)
df, dict_JobFactors = handle_multi_string_columns(df,"JobFactors",JobFactors_strings)

In [8]:
# Turn every string column into an ordered categorical. The default category
# order is alphabetical; the satisfaction scale is put into its real order below.
for n, c in df.items():
    if is_string_dtype(c):
        df[n] = c.astype('category').cat.as_ordered()

# Satisfaction scale from worst to best, so the category codes become 0..4.
satisfaction_levels = ['Very dissatisfied', 'Slightly dissatisfied', 'Neither satisfied nor dissatisfied',
                       'Slightly satisfied', 'Very satisfied']

# 'CareerSat' is removed in the column-dropping cell above, so it is only
# re-ordered when present (the unconditional df.CareerSat access raised
# AttributeError). The result is assigned back instead of using
# set_categories(..., inplace=True), which current pandas no longer accepts.
for sat_column in ('JobSat', 'CareerSat'):
    if sat_column in df.columns:
        df[sat_column] = df[sat_column].cat.set_categories(satisfaction_levels, ordered=True)
#df.JobSeek.cat.set_categories(['I am not interested in new job opportunities','I’m not actively looking, but I am open to new opportunities','I am actively looking for a job'], ordered=True, inplace=True)

In [9]:
# For every column add a 0/1 companion column "<column>_nan" that flags the rows
# in which the original value is missing.
df_indicator = (
    df.isna()
    .astype(int)
    .add_suffix('_nan')
)
df = pd.concat([df, df_indicator], axis='columns')

In [10]:
# Replace every categorical column by its integer category codes.
# NOTE(review): pandas encodes a missing category as code -1, so the mean fill
# below presumably only affects the originally numeric columns - confirm.
cat_columns = df.select_dtypes(include=['category']).columns
df[cat_columns] = df[cat_columns].apply(lambda column: column.cat.codes)


def fill_mean(col):
    """Return ``col`` with its missing values replaced by the column mean."""
    return col.fillna(col.mean())


# Fill the mean, column by column
df = df.apply(fill_mean)

In [11]:
#Drop columns with all NaN values
df = df.dropna(how='all', axis=0)
df = df.dropna(how='all', axis=1)
#delete columns that add up to 0
df = df.loc[:, (df != 0).any(axis=0)]

In [12]:
# Features are all columns except the target 'JobSat'.
X = df.drop(columns=['JobSat'])
y = df.loc[:, 'JobSat']

# First split: 75 % training data, 25 % held out.
X_train, X_test_valid, y_train, y_test_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Second split of the held-out part: 75 % of it becomes the test set and 25 %
# the validation set (the part sized by test_size is assigned to *_valid).
X_test, X_valid, y_test, y_valid = train_test_split(
    X_test_valid,
    y_test_valid,
    test_size=0.25,
    random_state=42,
)

In [86]:
# Baseline: gradient-boosted regressor with default settings, fitted directly on
# the JobSat codes.
model = lgm.LGBMRegressor()
model.fit(X=X_train, y=y_train)

ValueError: Should set group for ranking task

In [52]:
#Predict using your model
y_test_preds = model.predict(X_test)
y_train_preds = model.predict(X_train)
#Score using your model
test_r2 = r2_score(y_test, y_test_preds)
train_r2 = r2_score(y_train, y_train_preds)
# train_mad = mad(y_train, classification(y_train, y_train_preds))
# test_mad = mad(y_test, classification(y_train, y_test_preds))

# Linear re-calibration applied to the raw predictions before rounding.
correct = lambda x, multi, add: multi * x + add
# Constant offset used by the prediction-count cell further down.
corrector = 0

# Grid search for the multiplier (0.00 .. 1.99) and offset (-1.00 .. 0.99) that
# minimise the mean absolute deviation of the rounded training predictions.
# Start from "no correction" and an infinite error so that multi_final and
# add_final are always defined. With the previous start value train_mad = 1 they
# stayed undefined whenever no candidate got below 1.
multi_final, add_final = 1.0, 0.0
train_mad = np.inf
# Plain numpy array so that each of the 40,000 evaluations is vectorised instead
# of summing a Series element by element in Python. Targets (category codes) and
# rounded predictions are whole numbers, so the mean is the same value mad() gives.
y_train_values = np.asarray(y_train)
for add in range(-100, 100):
    for multi in range(0, 200):
        rounded_preds = np.round(correct(y_train_preds, multi / 100, add / 100))
        new_mad = np.abs(y_train_values - rounded_preds).mean()
        if new_mad < train_mad:
            train_mad = new_mad
            multi_final = multi / 100
            add_final = add / 100

# Apply the correction selected on the training data to the test predictions.
test_mad = mad(y_test, np.round(correct(y_test_preds, multi_final, add_final)))

print("Multiplier: {}".format(multi_final))
print("Scalar: {}".format(add_final))
print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_r2, test_r2))
print("The mad on the training data was {}.  The mad on the test data was {}.".format(train_mad, test_mad))

Multiplier: 1.42
Scalar: -1.0
The rsquared on the training data was 0.27753058024364674.  The rsquared on the test data was 0.16313177958100977.
The mad on the training data was 0.7786937048333471.  The mad on the test data was 0.8740520936366634.


In [15]:
# Number of test predictions that round to each label 0..4 (after adding `corrector`).
[np.sum(np.round(y_test_preds + corrector) == label) for label in range(5)]

[0, 22, 1499, 5979, 1599]

In [16]:
# Cumulative binary targets: y_g_k is 1 where the JobSat code is greater than k
# and 0 otherwise (k = 0..3).
y_g_0, y_g_1, y_g_2, y_g_3 = (
    y.apply(lambda k, threshold=threshold: 0 if k <= threshold else 1)
    for threshold in range(4)
)

# Restrict every target to the rows of the training split ...
y_g_0_train, y_g_1_train, y_g_2_train, y_g_3_train = (
    target.loc[y_train.index] for target in (y_g_0, y_g_1, y_g_2, y_g_3)
)

# ... the test split ...
y_g_0_test, y_g_1_test, y_g_2_test, y_g_3_test = (
    target.loc[y_test.index] for target in (y_g_0, y_g_1, y_g_2, y_g_3)
)

# ... and the validation split.
y_g_0_valid, y_g_1_valid, y_g_2_valid, y_g_3_valid = (
    target.loc[y_valid.index] for target in (y_g_0, y_g_1, y_g_2, y_g_3)
)

In [17]:
# Regress the binary indicator "JobSat code > 0".
model_0 = lgm.LGBMRegressor()
model_0.fit(X_train, y_g_0_train)

# Predictions on the training and test features
y_g_0_train_preds, y_g_0_test_preds = model_0.predict(X_train), model_0.predict(X_test)

# R^2 of the raw predictions against the 0/1 indicator
train_0_score, test_0_score = (
    r2_score(y_g_0_train, y_g_0_train_preds),
    r2_score(y_g_0_test, y_g_0_test_preds),
)

# Mean absolute deviation of the rounded predictions
train_0_mad, test_0_mad = (
    mad(y_g_0_train, np.round(y_g_0_train_preds)),
    mad(y_g_0_test, np.round(y_g_0_test_preds)),
)

print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_0_score, test_0_score))
print("The mad on the training data was {}.  The mad on the test data was {}.".format(train_0_mad, test_0_mad))

The rsquared on the training data was 0.22292569493738645.  The rsquared on the test data was 0.023159726953167592.
The mad on the training data was 0.06402330118429368.  The mad on the test data was 0.06726013847675569.


In [18]:
# Regress the binary indicator "JobSat code > 1".
model_1 = lgm.LGBMRegressor()
model_1.fit(X_train, y_g_1_train)

# Predictions on the training and test features
y_g_1_train_preds, y_g_1_test_preds = model_1.predict(X_train), model_1.predict(X_test)

# R^2 of the raw predictions against the 0/1 indicator
train_1_score, test_1_score = (
    r2_score(y_g_1_train, y_g_1_train_preds),
    r2_score(y_g_1_test, y_g_1_test_preds),
)

# Mean absolute deviation of the rounded predictions
train_1_mad, test_1_mad = (
    mad(y_g_1_train, np.round(y_g_1_train_preds)),
    mad(y_g_1_test, np.round(y_g_1_test_preds)),
)

print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_1_score, test_1_score))
print("The mad on the training data was {}.  The mad on the test data was {}.".format(train_1_mad, test_1_mad))

The rsquared on the training data was 0.22847372696399826.  The rsquared on the test data was 0.09556611754870326.
The mad on the training data was 0.18569505124611876.  The mad on the test data was 0.22035388504231235.


In [19]:
# Regress the binary indicator "JobSat code > 2".
model_2 = lgm.LGBMRegressor()
model_2.fit(X_train, y_g_2_train)

# Predictions on the training and test features
y_g_2_train_preds, y_g_2_test_preds = model_2.predict(X_train), model_2.predict(X_test)

# R^2 of the raw predictions against the 0/1 indicator
train_2_score, test_2_score = (
    r2_score(y_g_2_train, y_g_2_train_preds),
    r2_score(y_g_2_test, y_g_2_test_preds),
)

# Mean absolute deviation of the rounded predictions
train_2_mad, test_2_mad = (
    mad(y_g_2_train, np.round(y_g_2_train_preds)),
    mad(y_g_2_test, np.round(y_g_2_test_preds)),
)

print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_2_score, test_2_score))
print("The mad on the training data was {}.  The mad on the test data was {}.".format(train_2_mad, test_2_mad))

The rsquared on the training data was 0.24782716687712625.  The rsquared on the test data was 0.13208735024867757.
The mad on the training data was 0.2404308520869398.  The mad on the test data was 0.2919002088141554.


In [20]:
# Regress the binary indicator "JobSat code > 3".
model_3 = lgm.LGBMRegressor()
model_3.fit(X_train, y_g_3_train)

# Predictions on the training and test features
y_g_3_train_preds, y_g_3_test_preds = model_3.predict(X_train), model_3.predict(X_test)

# R^2 of the raw predictions against the 0/1 indicator
train_3_score, test_3_score = (
    r2_score(y_g_3_train, y_g_3_train_preds),
    r2_score(y_g_3_test, y_g_3_test_preds),
)

# Mean absolute deviation of the rounded predictions
train_3_mad, test_3_mad = (
    mad(y_g_3_train, np.round(y_g_3_train_preds)),
    mad(y_g_3_test, np.round(y_g_3_test_preds)),
)

print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_3_score, test_3_score))
print("The mad on the training data was {}.  The mad on the test data was {}.".format(train_3_mad, test_3_mad))

The rsquared on the training data was 0.2511245086581584.  The rsquared on the test data was 0.12992872439370173.
The mad on the training data was 0.24103536394361552.  The mad on the test data was 0.28552588196505113.


In [23]:
# Check which container type model_0.predict returned for the test features.
type(y_g_0_test_preds)

numpy.ndarray

In [55]:
def categorical_prob(y_g_0_preds, y_g_1_preds, y_g_2_preds, y_g_3_preds):
    """Turn four "greater than k" estimates into per-class probabilities.

    INPUT:
    y_g_k_preds - array-like estimates of P(label > k) for k = 0..3, all of the
                  same length. They may fall outside [0, 1] (e.g. regression
                  output).

    OUTPUT:
    Tuple (y_0, y_1, y_2, y_3, y_4) of numpy float arrays; entry i of y_c is the
    probability assigned to class c for sample i. For every sample the five
    values are non-negative and sum to 1.

    Steps:
    1. Difference the cumulative estimates: y_0 = 1 - P(>0), and each further
       y_c is whatever is left of 1 - P(>c) after the lower classes; y_4 is the
       remainder to 1.
    2. Going through the classes in order 0..4, wherever a class value is
       negative its absolute value is added to the other four classes of that
       sample and the class itself is set to 0.
    3. Divide by the per-sample total.

    The inputs are converted with np.asarray and are not modified, so pandas
    Series with any index are accepted as well. The computation is vectorised;
    it replaces per-element Python loops that indexed with ``[i]`` and therefore
    required numpy arrays or a 0..n-1 index.
    """
    g_0, g_1, g_2, g_3 = (
        np.asarray(preds, dtype=float)
        for preds in (y_g_0_preds, y_g_1_preds, y_g_2_preds, y_g_3_preds)
    )

    # Step 1: class values from the cumulative estimates (can be negative when
    # the estimates are not monotone or leave [0, 1]).
    y_0 = 1 - g_0
    y_1 = 1 - g_1 - y_0
    y_2 = 1 - g_2 - y_0 - y_1
    y_3 = 1 - g_3 - y_0 - y_1 - y_2
    y_4 = 1 - y_0 - y_1 - y_2 - y_3
    probs = [y_0, y_1, y_2, y_3, y_4]

    # Step 2: the order matters - a later class is checked only after the shifts
    # caused by the earlier classes have been applied to it.
    for k in range(len(probs)):
        negative = probs[k] < 0
        # Negative amount where the class is below zero, 0 elsewhere.
        deficit = np.where(negative, probs[k], 0.0)
        for j in range(len(probs)):
            if j != k:
                probs[j] = probs[j] - deficit
        probs[k] = np.where(negative, 0.0, probs[k])

    # Step 3: the total is at least 1 because step 2 only ever adds mass.
    y_normalize = probs[0] + probs[1] + probs[2] + probs[3] + probs[4]
    y_0, y_1, y_2, y_3, y_4 = (p / y_normalize for p in probs)
    return y_0, y_1, y_2, y_3, y_4

In [56]:
# Per-class probabilities for the test set, derived from the four threshold models.
(
    y_0_test,
    y_1_test,
    y_2_test,
    y_3_test,
    y_4_test,
) = categorical_prob(
    y_g_0_test_preds,
    y_g_1_test_preds,
    y_g_2_test_preds,
    y_g_3_test_preds,
)

In [85]:
# Output buffer shaped like the class-0 probabilities; every entry is set in the loop.
prediction = np.empty_like(y_0_test)

# Probability-weighted mean label (class 0 contributes nothing to the sum).
expected_values = 1 * y_1_test + 2 * y_2_test + 3 * y_3_test + 4 * y_4_test

for i in range(len(prediction)):
    # Most probable class; because of the strict '<' the lower label wins ties.
    heighest = y_0_test[i]
    prediction[i] = 0
    for label, class_prob in enumerate((y_1_test, y_2_test, y_3_test, y_4_test), start=1):
        if heighest < class_prob[i]:
            heighest = class_prob[i]
            prediction[i] = label
    # If no class reaches a probability of 0.5, use the rounded expected label instead.
    if heighest < 0.50:
        prediction[i] = np.round(expected_values[i])

# Mean absolute deviation of the combined prediction on the test set.
mad(prediction, y_test)

0.8763600395647874

In [79]:
# Number of test rows per true label 0..4.
[np.sum(y_test == label) for label in range(5)]

[614, 1452, 954, 3213, 2866]

In [80]:
# Number of test rows per predicted label 0..4.
[np.sum(prediction == label) for label in range(5)]

[2, 114, 2695, 4793, 1495]