# Importing important modules

In [1]:
import os
import sys
import lightgbm as lgb

sys.path.append('../')
from config import settings
from utils.eval_helpers import full_ranking_evaluation, compute_lr_feature_weights
from utils.feature_helpers import read_feature_names
from utils.feature_transformer import *
#import mlflow
from git import Repo
from datetime import datetime
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from collections import defaultdict
import joblib
from sklearn.preprocessing import StandardScaler
from utils.evaluation_utils import *
    
transformer_gbdt = FeatureTransformerGBDT()

  from .autonotebook import tqdm as notebook_tqdm


# Base feature generation for train and test sets

In [2]:
test_set_names = ['packet2start', 'packet2si', 'ms_si2start', 'ss_si2start', 'ms_search2start', 'ss_search2start']

The datasets used in this test run are taken from repo [gbdt_ranking_model](https://github.com/TuringEnterprises/gbdt_ranking_model) by Gultekin Gunduz. To prevent data duplication, I chose not to add the data to this repo, but you can access the above-mentioned repo to get it 

### Load and process train and test data set

In [5]:
data_path = "data/feature_set_march/"
train_set_path = f'{data_path}/feature_set_search2start.parquet'

if train_set_path.endswith('csv'):
    df_train = pd.read_csv(train_set_path)
else:
    df_train = pd.read_parquet(train_set_path)

transformer_gbdt = FeatureTransformerGBDT()
df_train = transformer_gbdt.further_feature_transformation(df_train)

In [6]:
dfs_test_sets = {}
for test_set_name in test_set_names:
    dfs_test_sets[test_set_name] = pd.read_parquet(
        os.path.join(f'{data_path}/feature_set_{test_set_name}.parquet')
    )
    dfs_test_sets[test_set_name] = transformer_gbdt.further_feature_transformation(dfs_test_sets[test_set_name])

In [76]:
allfeats  = ['atleast1_relevant_mcq',
 'atleast_1yoe_ratio',
 'atleast_2yoe_ratio',
 'atleast_3yoe_ratio',
 'avg_gte3_ti_score',
 'avg_log_skill_yoe',
 'final_score',
 'hourly_rate',
 'interview_score',
 'len_resume_raw',
 'max_mcq_pct',
 'max_relevant_mcq_pct',
 'max_skill_yoe',
 'mean_mcq_pct',
 'mean_relevant_mcq_pct',
 'min_relevant_mcq_pct',
 'num_mcq',
 'num_relevant_mcq',
 'num_relevant_projs',
#  'num_vetted',
 'relevant_mcq_take_rate',
 'seniority_avg',
 'starts_in_weeks',
 'sum_relevant_proj_words',
 'es_score',
 'num_mh_skills',
 'query_num_relevant_mcq']

mono_dict = {
    'leadsource':0,
    'atleast1_relevant_mcq': 1,
    'atleast_1yoe_ratio':1,
    'atleast_2yoe_ratio':1,
    'atleast_3yoe_ratio':1,
    'hourly_rate': -1,
    'interview_score':1,
    'len_resume':1,
    'len_resume_raw':1,
    'max_mcq_pct':1,
    'max_relevant_mcq_pct':1,
    'max_skill_yoe':1,
    'mean_mcq_pct':1,
    'min_relevant_mcq_pct':1,
    'num_relevant_mcq':1,
    'seniority_avg':1,
    'starts_in_weeks':-1,
    'num_relevant_projs':1,
    'sum_relevant_proj_words' :1,
    'not_selected_from_packet_count': -1 ,
    'relevant_mcq_take_rate':1,
    'mean_relevant_mcq_pct':1,
    'max_accept_rate_ratio_all':0,
    'max_accept_rate_ratio_all_sq':0,
    'region_1.Bay Area':0,
    'region_2.Rest of US':0,
    'region_4.Others':0,
    'client_category_1.Platinum':0,
    'client_category_4.Bronze':0,
    'acc_fire_count':0,
    'acc_opp_count':0,
    'acc_rc_count':0,
    'predicted_mcq_min':1,
    'predicted_mcq_max':1,
    'predicted_mcq_mean':1,
    'predicted_mcq_std':0,
    'dev_skill_yoe_min':1,
    'dev_skill_yoe_mean':1,
    'dev_skill_yoe_max' :1,
    'dev_selfDeclared_skill_yoe_min':1,
    'dev_selfDeclared_skill_yoe_mean':1,
    'dev_selfDeclared_skill_yoe_max':1,
    'dev_selfDeclared_skill_level_min':1,
    'dev_selfDeclared_skill_level_mean':1, 'dev_selfDeclared_skill_level_max':1,
    'ti_score':1, 'uni_rank_min_min':-1, 
    'correct_answer' :1, 'years_of_experience':1, 'salary_in_USD': -1,
    'country_citizenship_id': 0, 'seniority_4' :1, 'seniority_2' :1, 'seniority_1' :1,
    'seniority_7' :1, 'seniority_3' :1,
    'avg_gte3_ti_score':1,
    'avg_log_skill_yoe':1,
    'final_score':1,
    'interview_score':1,
    'latest_gte3_ti_score':1,
    'latest_ti_score':1,
    'len_resume_raw':1,
    'max_mcq_pct':1,
    'max_relevant_mcq_pct':1,
    'max_skill_yoe':1,
    'max_ti_score':1,
    'mean_mcq_pct':1,
    'mean_relevant_mcq_pct':1,
    'min_gte3_ti_score':1,
    'min_relevant_mcq_pct':1,
    'num_relevant_mcq':1,
    'num_relevant_projs':1,
    'num_ti_no_show':-1,
#     'num_vetted':1,
    'relevant_mcq_take_rate':1,
    'seniority_avg':1,
    'starts_in_weeks':-1,
    'sum_relevant_proj_words':1,
    'dev_rc_count':1,
    'dev_trial_count':1,
    'dev_start_count':1,
    'dev_fire_count':-1,
    'dev_complete_count':1,
    'dev_not_si_from_packet_count':-1,
    'dev_hard_reject_count':-1,
    'es_score':1,}


selected = ['mean_relevant_mcq_pct',
            'num_mcq',
            'num_relevant_projs',
            'starts_in_weeks',
            'atleast_2yoe_ratio',
            'avg_log_skill_yoe',
           'relevant_mcq_take_rate',
           'atleast_1yoe_ratio',
           'num_mh_skills',
           'min_relevant_mcq_pct',
           'hourly_rate',
           'num_relevant_mcq',
           'mean_mcq_pct',
           'interview_score',
           'seniority_avg']

In [8]:
target_column_map = {
'packet2start' : "is_start",
'packet2si' : "is_si",
'ms_search2start' : "is_start",
'ss_search2start' : "is_start",
'si2start' : "is_start",
'search2start_new' : 'is_start',
'search2start' : 'is_start',
'ms_si2start': 'is_start',
'ss_si2start': 'is_start',
    
}
test_label_cols = [target_column_map[test_set_name] for test_set_name in test_set_names]

In [9]:
#test_set_names = ['packet2start', 'packet2si', 'ms_si2start', 'ss_si2start', 'ms_search2start', 'ss_search2start']

unique_dev_job_pairs = []

for key, val in dfs_test_sets.items():
    
    for _, row in val[['job_id', 'developer_id']].iterrows():
        
        r = {}
        r['job_id'] = row['job_id']
        r['developer_id'] = row['developer_id']
        unique_dev_job_pairs.append(r.copy())
        
for _, row in df_train[['job_id', 'developer_id']].iterrows():
        
        r = {}
        r['job_id'] = row['job_id']
        r['developer_id'] = row['developer_id']
        unique_dev_job_pairs.append(r.copy())
        
unique_dev_job_pairs = pd.DataFrame(unique_dev_job_pairs)
print(unique_dev_job_pairs.shape)
unique_dev_job_pairs.drop_duplicates(inplace = True)
print(unique_dev_job_pairs.shape)

(920579, 2)
(328872, 2)


# MCQ Feature generation

In [10]:
unique_developers = tuple(unique_dev_job_pairs['developer_id'].unique().tolist())
len(unique_developers)

5651

In [11]:
def read_data_from_gbq(query, project_id='turing-230020'):
    return pd.io.gbq.read_gbq(query, project_id=project_id)

drop_list = list(set([
    'Work Experience Analysis',
    'English listening',
    'Designer',
    'QA',
    'Reading comprehension',
    'Personality Test',
    'MCQ Survey',
    'Automated Coding Challenge',
    'Golang-Brazil'] + ['Project Collaboration',
    'PHP (New Test)',
    'Work Experience [Advanced]',
    'Work Experience',
    'Work Experience [Basic]',
    'Mobile Development',
    'Algorithm (deprecated)',
    'Automated Seniority Assessment',
    'Web Development (deprecated)',
    'Practice Coding Challenge',
    'Automated Coding Challenge',
    'Product Manager',
    'Project Management',
    'Designer',
    'QA',
    'System Design',
    'Elasticsearch',
    'Team Collaboration',
]))


mcq_ = read_data_from_gbq(
f"""

with dev_mcq as (
  select
    dms.dev_id,
    dms.challenge_id,
    dsm.skill_id,
    bas4.skill_name,
    dms.dev_percentile,
    dms.last_updated_at,
    dc.challenge_name
  from 
    external_query("turing-230020.us.machine-learning",
      '''
      select * from prod.dev_mcq_score
      '''
    ) as dms
  left join devdb_mirror.dv2_skill_mcq as dsm
    on dms.challenge_id = dsm.mcq_id
  left join devdb_mirror.base_all_skills_v4 as bas4
    on dsm.skill_id = bas4.id
  left join devdb_mirror.dv2_challenge as dc
    on dms.challenge_id = dc.challenge_id
)

select * from dev_mcq
where dev_id in {unique_developers}
""")
mcq_.sort_values('last_updated_at', inplace=True)
mcq_ = mcq_[mcq_['challenge_name'].isin(drop_list)==False]




In [12]:
#INCLUDE HARDCODED SKILL -MCQ 
from utils.skill_utils import mcq_skill_id_mapping

In [13]:
dv2_work_experience_avg_score = read_data_from_gbq(
f"""
select grade_category_id,
avg_score, 
user_id as dev_id,
submit_time
from devdb_mirror.dv2_work_experience_avg_score d1
left join devdb_mirror.dv2_challenge_submit d2 
on d1.submit_id = d2.submit_id
where user_id in {unique_developers}
""")

dv2_work_experience_avg_score.sort_values('submit_time', inplace= True)
dv2_work_experience_avg_score.drop_duplicates(['dev_id','grade_category_id'], keep = 'last', inplace=True)
dv2_work_experience_avg_score

Unnamed: 0,grade_category_id,avg_score,dev_id,submit_time
11480,1,4.20000,318829,2020-07-12 10:42:32+00:00
11645,7,4.25000,318829,2020-07-12 10:42:32+00:00
11577,4,2.12500,318829,2020-07-12 10:42:32+00:00
11519,2,3.81250,318829,2020-07-12 10:42:32+00:00
11546,3,4.06667,318829,2020-07-12 10:42:32+00:00
...,...,...,...,...
9601,3,4.46667,259202,2022-04-20 11:27:33+00:00
9580,2,4.50000,259202,2022-04-20 11:27:33+00:00
9504,1,4.80000,259202,2022-04-20 11:27:33+00:00
9724,7,5.25000,259202,2022-04-20 11:27:33+00:00


In [14]:
skill_exp = read_data_from_gbq(
f"""
with skill_exp as (
SELECT
  t1.developer_id as dev_id,
  t1.skill_id,
  SUM(t1.project_years_of_experience) AS project_years_of_experience
FROM
  devdb_mirror.tpm_developer_skill_matchability t1
LEFT JOIN 
  devdb_mirror.tpm_developer_skill_matchability t2
ON 
  t1.developer_id = t2.developer_id
  AND t1.skill_id = t2.skill_id
  AND t1.updated_at < t2.updated_at
WHERE
  t2.updated_at IS NULL
  and t1.developer_id in {unique_developers}
GROUP BY 
  1, 2
)
select * from skill_exp
""")

In [15]:
tpm_developer_skill = read_data_from_gbq(
f"""
select developer_id as dev_id,
skill_id,
score, 
skill_level
from devdb_mirror.tpm_developer_skill
where developer_id in {unique_developers}
"""
)

In [16]:
mcq_.sort_values('last_updated_at',inplace=True)
mcq_.drop_duplicates(['dev_id', 'challenge_id'], keep='last', inplace=True)

In [17]:
tpm_dev_grading_result = read_data_from_gbq(
f'''
select * from devdb_mirror.tpm_dev_grading_result
where developer_id in {unique_developers}
''')

tpm_dev_grading_result.sort_values('updated_date', inplace=True)
temp =tpm_dev_grading_result[['developer_id', 'challenge_score']]
temp.drop_duplicates('developer_id', keep='last', inplace=True)
temp.columns = ['dev_id', 'ti_score']

mcq_ = pd.merge(mcq_, temp, on=['dev_id'], how='left')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp.drop_duplicates('developer_id', keep='last', inplace=True)


In [18]:
developer_detail = read_data_from_gbq(
f'''
select user_id as dev_id,
hourly_rate,
years_of_experience,
calculated_years_of_experience,
github_url,
salary_in_USD, 
country_citizenship_id
from devdb_mirror.developer_detail
where user_id in {unique_developers}
''')

In [19]:
def create_mcq_features(df,
                       prefix):
    
    
    ret = {}
    
    #Other MCQ percentiles
    ret[prefix + '_dev_remaining_pct_mean'] = df['dev_percentile'].mean()
            
    return ret

In [20]:
mcq_ = mcq_[pd.isnull(mcq_['dev_percentile'])==False]

In [21]:
developer_detail.drop_duplicates('dev_id', inplace=True)
mcq_ = pd.merge(mcq_, developer_detail, on =['dev_id'], how='left')

# Load MCQ prediction model

These models are trained by Gultekin Gunduz and are used as the examples. In the future, if you happen to train a better one, feel free to replace it with your own model

In [23]:
import pickle

model_dir = "example_models"
with open(f'{model_dir}/mcq_prediction_model_fold0.pkl', 'rb') as f:
    m1 = pickle.load(f)
with open(f'{model_dir}/mcq_prediction_model_fold1.pkl', 'rb') as f:
    m2 = pickle.load(f)
with open(f'{model_dir}/mcq_prediction_model_fold2.pkl', 'rb') as f:
    m3 = pickle.load(f)
with open(f'{model_dir}/mcq_prediction_model_fold3.pkl', 'rb') as f:
    m4 = pickle.load(f)

This pkl file is taken from [gbdt_ranking_model](https://github.com/TuringEnterprises/gbdt_ranking_model)


In [25]:
job_skills = pd.read_pickle('./data/feature_set_march/job_skills_dict.pkl')
job_skills

{121: {'mh_skills': [120, 114, 2031],
  'mh_skills_mcqs': [128, 173, 176],
  'op_skills': []},
 176: {'mh_skills': [54], 'mh_skills_mcqs': [142], 'op_skills': []},
 204: {'mh_skills': [120, 132, 60, 2031],
  'mh_skills_mcqs': [176, 172, 173, 167],
  'op_skills': []},
 291: {'mh_skills': [122, 165, 358],
  'mh_skills_mcqs': [208, 156, 125],
  'op_skills': []},
 354: {'mh_skills': [2031], 'mh_skills_mcqs': [176], 'op_skills': []},
 409: {'mh_skills': [1623], 'mh_skills_mcqs': [], 'op_skills': []},
 438: {'mh_skills': [25], 'mh_skills_mcqs': [163], 'op_skills': []},
 445: {'mh_skills': [568], 'mh_skills_mcqs': [141], 'op_skills': [2067]},
 450: {'mh_skills': [387, 107, 284, 93, 30],
  'mh_skills_mcqs': [130, 229, 230],
  'op_skills': []},
 491: {'mh_skills': [122, 165], 'mh_skills_mcqs': [208, 125], 'op_skills': []},
 503: {'mh_skills': [158, 406, 2031],
  'mh_skills_mcqs': [152, 155, 176],
  'op_skills': []},
 523: {'mh_skills': [1561],
  'mh_skills_mcqs': [],
  'op_skills': [1625, 89, 2

In [26]:
mcq_ = mcq_[mcq_['dev_id'].isin(unique_dev_job_pairs['developer_id'].unique())]
mcq_['salary_in_USD'] = mcq_['salary_in_USD'].astype(np.float32)

In [27]:
unique_dev_job_pairs

Unnamed: 0,job_id,developer_id
0,800,113809
1,872,411413
2,1019,166359
3,1102,461881
4,1177,418617
...,...,...
594441,2402,461079
594442,2402,350397
594443,2402,491447
594493,2402,343410


### Prepare data to run inference with MCQ prediction models

In [31]:
from tqdm import tqdm
import time

results = []

for dev_id, dev_df in tqdm(unique_dev_job_pairs.groupby('developer_id')):
    
    dev_previous_mcqs = mcq_[mcq_['dev_id']==dev_id]
    
    if dev_previous_mcqs.shape[0]>0:
        d = dev_previous_mcqs.iloc[0]
            
    dev_skill_exp = skill_exp[skill_exp['dev_id']==dev_id]
    dev_tpm_developer_skill = tpm_developer_skill[tpm_developer_skill['dev_id']==dev_id]
    
    r = {}
    r['developer_id'] = dev_id
    
    
    for c in ['ti_score', 'hourly_rate', 'salary_in_USD']:

        r[c] = d[c]

    r.update(create_mcq_features(dev_previous_mcqs,
               prefix='earlier'))
            
    for job_id in dev_df['job_id'].unique():
        
        r['job_id'] = job_id


        cur_job_mcqs = job_skills[job_id]['mh_skills_mcqs']
        cur_job_skills = job_skills[job_id]['mh_skills']
        
        #query relaxation

        parsed_skills = []
        total = 0
        for cur_job_mcq in cur_job_mcqs:

            if cur_job_mcq in mcq_skill_id_mapping:
                skill_ids = mcq_skill_id_mapping[cur_job_mcq]
                parsed_skills = parsed_skills + skill_ids.tolist()

            r['num_skill'] = total

            dev_skill_yoe = dev_skill_exp[dev_skill_exp['skill_id'].isin(skill_ids)]
            dev_skill_selfDeclared_yoe = dev_tpm_developer_skill[dev_tpm_developer_skill['skill_id'].isin(skill_ids)]

            r['dev_skill_yoe'] = dev_skill_yoe['project_years_of_experience'].max()
            r['dev_selfDeclared_skill_level'] = dev_skill_selfDeclared_yoe['skill_level'].max()

            results.append(r.copy())

            total += 1

        cur_job_skills = [x for x in cur_job_skills if x not in parsed_skills]

        for cur_job_skill in cur_job_skills:

            r['num_skill'] = total
            skill_ids = [cur_job_skill]

            dev_skill_yoe = dev_skill_exp[dev_skill_exp['skill_id'].isin(skill_ids)]
            dev_skill_selfDeclared_yoe = dev_tpm_developer_skill[dev_tpm_developer_skill['skill_id'].isin(skill_ids)]

            r['dev_skill_yoe'] = dev_skill_yoe['project_years_of_experience'].max()
            r['dev_selfDeclared_skill_level'] = dev_skill_selfDeclared_yoe['skill_level'].max()

            results.append(r.copy())

            total += 1
        

100%|██████████████████████████████████████████████████████████████████████████████| 5651/5651 [26:10<00:00,  3.60it/s]


In [32]:
results = pd.DataFrame(results)
results

Unnamed: 0,developer_id,ti_score,hourly_rate,salary_in_USD,earlier_dev_remaining_pct_mean,job_id,num_skill,dev_skill_yoe,dev_selfDeclared_skill_level
0,126,7.0,30.0,30000.0,68.327753,5227,0,2.0,
1,126,7.0,30.0,30000.0,68.327753,5227,1,,
2,126,7.0,30.0,30000.0,68.327753,5227,2,1.2,
3,126,7.0,30.0,30000.0,68.327753,5165,0,3.2,2
4,126,7.0,30.0,30000.0,68.327753,5165,1,,
...,...,...,...,...,...,...,...,...,...
636795,2147743,6.0,25.0,,91.382591,6771,1,3.4,4
636796,2147743,6.0,25.0,,91.382591,6771,2,0.6,1
636797,2163831,9.0,24.0,120000.0,76.571598,6906,0,5.0,4
636798,2163831,9.0,24.0,120000.0,76.571598,6906,1,4.7,4


### Run MCQ prediction with prepared features

In [42]:
type(results["dev_selfDeclared_skill_level"][0]).__name__

'NAType'

In [58]:
def replace_with_nan(value):
    if type(value).__name__ == "NAType":
        return np.nan
    else:
        return value

In [60]:
# The NaN values of this column are declared as type NAType (don't know why), 
# so they have to be replaced with proper nan type, otherwise will throw an error when fed into the models
results["dev_selfDeclared_skill_level"] = results["dev_selfDeclared_skill_level"].apply(replace_with_nan)

In [47]:
type(results["dev_skill_yoe"][1])

numpy.float64

In [62]:
selected_features = ['ti_score', 'hourly_rate', 'salary_in_USD', 'dev_selfDeclared_skill_level', 'earlier_dev_remaining_pct_mean', 'dev_skill_yoe']

results['preds1'] = m1.predict(results[selected_features])
results['preds2'] = m2.predict(results[selected_features])
results['preds3'] = m3.predict(results[selected_features])
results['preds4'] = m4.predict(results[selected_features])

In [63]:
results

Unnamed: 0,developer_id,ti_score,hourly_rate,salary_in_USD,earlier_dev_remaining_pct_mean,job_id,num_skill,dev_skill_yoe,dev_selfDeclared_skill_level,preds1,preds2,preds3,preds4
0,126,7.0,30.0,30000.0,68.327753,5227,0,2.0,,41.036709,36.038709,35.910522,33.175238
1,126,7.0,30.0,30000.0,68.327753,5227,1,,,43.838632,54.908840,58.376667,48.360803
2,126,7.0,30.0,30000.0,68.327753,5227,2,1.2,,42.711068,39.284570,35.736748,32.716541
3,126,7.0,30.0,30000.0,68.327753,5165,0,3.2,2.0,56.916171,56.672226,55.537864,58.380056
4,126,7.0,30.0,30000.0,68.327753,5165,1,,,43.838632,54.908840,58.376667,48.360803
...,...,...,...,...,...,...,...,...,...,...,...,...,...
636795,2147743,6.0,25.0,,91.382591,6771,1,3.4,4.0,86.477206,79.235271,81.622029,80.414639
636796,2147743,6.0,25.0,,91.382591,6771,2,0.6,1.0,76.744290,76.820317,71.472922,73.418769
636797,2163831,9.0,24.0,120000.0,76.571598,6906,0,5.0,4.0,80.654250,80.421319,77.333379,75.199335
636798,2163831,9.0,24.0,120000.0,76.571598,6906,1,4.7,4.0,80.654250,80.421319,77.333379,75.199335


In [64]:
results['mean_preds'] = results[['preds1', 'preds2', 'preds3', 'preds4']].mean(axis=1)

In [65]:
predicted_mcqs = results.groupby(['developer_id', 'job_id'])['mean_preds'].agg(['mean', 'max', 'min', 'std']).reset_index()

In [66]:
#all-skill
predicted_mcqs.columns = ['developer_id', 'job_id', 'predicted_mcqs_mean', 'predicted_mcqs_max', 'predicted_mcqs_min', 'predicted_mcqs_std']
predicted_mcqs

Unnamed: 0,developer_id,job_id,predicted_mcqs_mean,predicted_mcqs_max,predicted_mcqs_min,predicted_mcqs_std
0,126,560,51.865188,52.853092,51.371236,0.855550
1,126,576,51.371236,51.371236,51.371236,0.000000
2,126,656,37.612232,37.612232,37.612232,
3,126,687,51.371236,51.371236,51.371236,
4,126,707,56.876580,56.876580,56.876580,
...,...,...,...,...,...,...
328861,2128729,1109,77.566271,80.927706,74.342844,2.329572
328862,2131498,6967,56.907966,56.907966,56.907966,0.000000
328863,2142105,6695,65.167147,72.167633,51.166175,12.125198
328864,2147743,6771,79.496216,81.937286,74.614074,4.228058


In [67]:
temp = pd.merge(df_train, predicted_mcqs, on = ['developer_id', 'job_id'], how='left')
temp

Unnamed: 0,job_id,job_created_date,accountid,leadsource,job_value,max_acceptable_rate,developer_id,mjm_create_ts,si_date,chosen_date,...,max_accept_rate_ratio_all,max_accept_rate_ratio_all_sq,hourly_rate_notna,hourly_rate_ratio,hourly_rate_ratio_sq,weight,predicted_mcqs_mean,predicted_mcqs_max,predicted_mcqs_min,predicted_mcqs_std
0,800,2020-08-13 23:31:30+00:00,0011U00001R4VfGQAV,1,14.7,50.0,113809,2020-08-14 17:44:23+00:00,NaT,NaT,...,1.428571,2.040816,1,0.340680,0.116063,0.200000,58.748373,76.086778,41.409968,20.020666
1,1485,2021-01-12 19:13:20+00:00,0011U00001afIOIQA2,0,7.4,45.0,527012,2021-02-20 21:53:42+00:00,NaT,NaT,...,1.285714,1.653061,1,0.910349,0.828735,0.005747,84.056805,84.056805,84.056805,
2,4857,2021-12-20 19:06:44+00:00,0014y00002TlVJ5AAN,0,22.2,35.0,1501403,2022-03-08 04:51:13+00:00,NaT,NaT,...,1.000000,1.000000,1,1.142857,1.306122,0.166667,71.045994,80.604985,67.648551,4.713002
3,4853,2021-12-20 16:32:55+00:00,0014y00002YjlcyAAB,0,23.4,,1388198,2021-12-23 19:51:15+00:00,NaT,NaT,...,,,1,,,0.333333,62.648115,66.697995,51.001712,5.770044
4,4822,2021-12-17 18:19:02+00:00,0014y00002Yj1jzAAB,0,23.4,35.0,1513092,2021-12-20 12:43:47+00:00,NaT,NaT,...,1.000000,1.000000,1,1.022857,1.046237,0.009524,73.380962,79.538804,67.203778,6.167536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325899,2008,2021-03-19 18:16:44+00:00,0011U00001iBpl6QAC,1,0.9,30.0,723815,2021-03-22 15:02:58+00:00,NaT,NaT,...,0.857143,0.734694,1,0.600000,0.360000,0.004902,70.487033,72.740284,68.233781,3.186579
325900,2008,2021-03-19 18:16:44+00:00,0011U00001iBpl6QAC,1,0.9,30.0,425171,2021-03-22 15:02:58+00:00,NaT,NaT,...,0.857143,0.734694,1,1.166667,1.361111,0.004902,51.557281,52.273704,50.840857,1.013175
325901,2008,2021-03-19 18:16:44+00:00,0011U00001iBpl6QAC,1,0.9,30.0,139557,2021-03-22 15:02:58+00:00,NaT,NaT,...,0.857143,0.734694,1,1.000000,1.000000,0.004902,56.042467,59.241178,52.843757,4.523659
325902,2008,2021-03-19 18:16:44+00:00,0011U00001iBpl6QAC,1,0.9,30.0,507704,2021-03-22 15:02:58+00:00,NaT,NaT,...,0.857143,0.734694,1,1.000000,1.000000,0.004902,77.867801,80.128725,75.606876,3.197430


In [68]:
temp.corr()['is_start'].sort_values().iloc[-25:]

dev_selfserve_si_count     0.025684
atleast_3yoe_ratio         0.026951
es_score                   0.027275
atleast_2yoe_ratio         0.028012
predicted_mcqs_mean        0.028012
avg_log_skill_yoe          0.029151
sum_relevant_proj_words    0.031015
relevant_mcq_take_rate     0.031093
num_mh_skills              0.031212
predicted_mcqs_max         0.032687
atleast1_relevant_mcq      0.033210
max_skill_yoe              0.035030
num_relevant_projs         0.035339
developer_id               0.036181
num_relevant_mcq           0.037281
final_score                0.039783
min_relevant_mcq_pct       0.040494
mean_relevant_mcq_pct      0.040524
max_relevant_mcq_pct       0.041959
is_shortlisted             0.325519
is_si                      0.493750
weight                     0.913209
is_start                   1.000000
atleast1_mcq                    NaN
hourly_rate_notna               NaN
Name: is_start, dtype: float64

In [69]:
temp[['predicted_mcqs_max', 'max_relevant_mcq_pct', 'predicted_mcqs_mean', 'mean_relevant_mcq_pct']].corr()

Unnamed: 0,predicted_mcqs_max,max_relevant_mcq_pct,predicted_mcqs_mean,mean_relevant_mcq_pct
predicted_mcqs_max,1.0,0.451617,0.937359,0.394369
max_relevant_mcq_pct,0.451617,1.0,0.410171,0.880826
predicted_mcqs_mean,0.937359,0.410171,1.0,0.420163
mean_relevant_mcq_pct,0.394369,0.880826,0.420163,1.0


In [70]:
import gc
def gbdt_train_func(train_set, features, train_label_col, sample_weight_col=None):

        
    train_data = lgb.Dataset(data=train_set[features], label=train_set[train_label_col],)

    model = lgb.train(train_set=train_data,
                        params=hyp2,
                        num_boost_round=hyp['num_boost_round'],
                        verbose_eval=False
                        )
                
 
    return model


def gbdt_predict_func(model, test_features):
    
    ypreds = np.array(model.predict(test_features))
    del model
    gc.collect()
    
    return ypreds

In [73]:
df_train.columns

Index(['job_id', 'job_created_date', 'accountid', 'leadsource', 'job_value',
       'max_acceptable_rate', 'developer_id', 'mjm_create_ts', 'si_date',
       'chosen_date', 'start_date', 'is_shortlisted', 'serve_type', 'is_si',
       'is_start', 'job_is_selfserve', 'atleast1_mcq', 'atleast1_relevant_mcq',
       'atleast_1yoe_ratio', 'atleast_2yoe_ratio', 'atleast_3yoe_ratio',
       'avg_gte3_ti_score', 'avg_log_skill_yoe', 'dsa_pct', 'final_score',
       'hourly_rate', 'interview_score', 'latest_gte3_ti_score',
       'latest_ti_score', 'len_resume_raw', 'max_mcq_pct',
       'max_relevant_mcq_pct', 'max_skill_yoe', 'max_ti_score', 'mean_mcq_pct',
       'mean_relevant_mcq_pct', 'min_gte3_ti_score', 'min_relevant_mcq_pct',
       'num_mcq', 'num_relevant_mcq', 'num_relevant_projs', 'num_ti_no_show',
       'num_ti_score', 'relevant_mcq_take_rate', 'seniority_avg',
       'starts_in_weeks', 'sum_relevant_proj_words', 'days_since_last_ti',
       'days_since_qualified', 'is_latam', '

In [77]:
mean = df_train[allfeats].mean()
df_train.fillna(mean, inplace=True)

for n, test_set_name in enumerate(dfs_test_sets):
    dfs_test_sets[test_set_name].fillna(mean, inplace=True)

In [78]:
import warnings
warnings.filterwarnings("ignore")

In [79]:
test_set_names

['packet2start',
 'packet2si',
 'ms_si2start',
 'ss_si2start',
 'ms_search2start',
 'ss_search2start']

### Train models with generated features

In [81]:
feats = ['mean_relevant_mcq_pct',
            'starts_in_weeks',
           'atleast_1yoe_ratio',
           'relevant_mcq_take_rate',
           'mean_mcq_pct',
           'num_relevant_projs',
           'seniority_avg',
           'num_relevant_mcq',
           'hourly_rate',
           'interview_score',
#            'num_vetted',
           'atleast1_relevant_mcq',
           'avg_gte3_ti_score']

for feat in feats:
    
    temp_feat = [x for x in feats if x != feat]
    print(temp_feat)
    
    hyp2 = {'boosting': 'gbdt',
               'objective': 'binary',
               'learning_rate': 0.05,
               'num_leaves': 13,
               'feature_fraction': 1.0,
               'bagging_fraction': 1.0,
               'max_depth': 7,
               'max_bin': 50,
               'min_child_samples': 100,
               'num_boost_round': 120,
               'monotone_constraints_method': 'advanced',
               'deterministic': 'true',
               'verbose': -1,
               'seed': 8}


    hyp = hyp2.copy()

    mono_cons = [mono_dict[x] if x in mono_dict.keys() else 0 for x in temp_feat] 
    hyp2['monotone_constraints'] = mono_cons


    metric_dicts, eval_dfs = full_ranking_evaluation(
    df_train,
    temp_feat,
    list(dfs_test_sets.values()),
    test_set_names,
    eval_labels_cols=test_label_cols,
    sample_weight_col=None,
    fix_dev_leak_cols=['developer_id'],
    train_func=gbdt_train_func,
    pred_func=gbdt_predict_func,
    verbose= True,
    )
    
    scores = pd.DataFrame(metric_dicts)
    curscore = scores['win_loss_top_10'].iloc[-2:].sum()
    print(curscore)

['starts_in_weeks', 'atleast_1yoe_ratio', 'relevant_mcq_take_rate', 'mean_mcq_pct', 'num_relevant_projs', 'seniority_avg', 'num_relevant_mcq', 'hourly_rate', 'interview_score', 'atleast1_relevant_mcq', 'avg_gte3_ti_score']


100%|████████████████████████████████████████████████████████████████████████████| 1304/1304 [2:02:56<00:00,  5.66s/it]


Unnamed: 0,test_sets,pairwise_ranking_score,raw_pairwise_ranking_score,win_loss_top_1,win_loss_top_3,win_loss_top_5,win_loss_top_10,interview_win_portion,interview_win_loss_1,interview_win_loss_0.5,...,interview_win_loss_0.2,avg_started_ranks,10pct_started_ranks,25pct_started_ranks,50pct_started_ranks,75pct_started_ranks,90pct_started_ranks,95pct_started_ranks,99pct_started_ranks,max_started_ranks
0,packet2start,0.583437,0.569742,0.388889,0.33642,0.329136,0.324093,0.730731,36.0,119.5,...,169.6,2.473498,1.0,1.0,2.0,3.0,4.0,6.0,12.0,20.0
1,packet2si,0.526863,0.514006,0.48652,0.4643,0.463668,0.461772,,,,...,,3.947638,1.0,2.0,3.0,5.0,8.0,11.0,20.0,37.0
2,ms_si2start,0.62054,0.595376,0.530435,0.407971,0.40442,0.402298,0.730654,51.0,146.0,...,203.0,1.929461,1.0,1.0,1.0,2.0,3.0,4.0,10.2,14.0
3,ss_si2start,0.544321,0.581776,0.361905,0.301587,0.291429,0.288175,0.584302,-71.0,17.5,...,70.6,2.698113,1.0,1.0,2.0,3.0,5.0,6.0,10.0,17.0
4,ms_search2start,0.753381,0.797571,0.132486,0.093769,0.084997,0.075073,0.719945,25.0,103.5,...,150.6,27.797872,1.0,3.0,10.0,32.0,73.7,103.65,223.37,362.0
5,ss_search2start,0.742146,0.770319,0.126464,0.091725,0.081011,0.068089,0.68105,-42.0,132.5,...,237.2,35.690149,1.0,3.0,10.0,36.0,89.0,161.6,363.96,911.0


0.1431620064050136
['mean_relevant_mcq_pct', 'atleast_1yoe_ratio', 'relevant_mcq_take_rate', 'mean_mcq_pct', 'num_relevant_projs', 'seniority_avg', 'num_relevant_mcq', 'hourly_rate', 'interview_score', 'atleast1_relevant_mcq', 'avg_gte3_ti_score']


  0%|▏                                                                              | 4/1304 [00:27<2:31:25,  6.99s/it]


KeyboardInterrupt: 

In [None]:
#!pip install optuna
import optuna




def objective(trial):
  
    feats = ['mean_relevant_mcq_pct',
            'starts_in_weeks',
           'atleast_1yoe_ratio',
           'relevant_mcq_take_rate',
           'mean_mcq_pct',
           'num_relevant_projs',
           'seniority_avg',
           'num_relevant_mcq',
           'hourly_rate',
           'interview_score',
#            'num_vetted',
           'atleast1_relevant_mcq',
           'avg_gte3_ti_score']

    mono_cons = [mono_dict[x] if x in mono_dict.keys() else 0 for x in feats] 
 
    global hyp2
    hyp2 = {'boosting': 'gbdt',
            'objective': 'binary',
            'learning_rate': 0.05,
            "num_leaves": trial.suggest_int("num_leaves", 13, 53),
            'feature_fraction': 1.0,
            'bagging_fraction': 1.0,
            "min_child_samples": trial.suggest_int("min_child_samples", 30, 200),
            "max_depth": trial.suggest_int("max_depth", 5, 11),
            "max_bin": trial.suggest_int("max_bin", 40, 80),
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "num_boost_round": trial.suggest_int("num_boost_round", 50, 300),
            #'monotone_constraints' : mono_cons,
            #'monotone_constraints_method': 'advanced',
            'deterministic': 'true',
            'verbose': -1,
           'seed': 8}
    
    hyp2['monotone_constraints'] = mono_cons

    print(hyp2)
    metric_dicts, eval_dfs = full_ranking_evaluation(
    df_train,
    feats,
    list(dfs_test_sets.values()),
    test_set_names,
    eval_labels_cols=test_label_cols,
    sample_weight_col=None,
    fix_dev_leak_cols=['developer_id'],
    train_func=gbdt_train_func,
    pred_func=gbdt_predict_func,
    verbose= True,
    )
    

    scores = pd.DataFrame(metric_dicts)
    score = scores['win_loss_top_10'].iloc[-2:].sum()
    
    return score


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-06-02 16:28:55,899][0m A new study created in memory with name: no-name-7f36b5cf-caad-42ad-942f-0380ce66a86c[0m


{'boosting': 'gbdt', 'objective': 'binary', 'learning_rate': 0.05, 'num_leaves': 20, 'feature_fraction': 1.0, 'bagging_fraction': 1.0, 'min_child_samples': 169, 'max_depth': 9, 'max_bin': 40, 'lambda_l1': 1.4952757309482632, 'lambda_l2': 0.0019638249745907474, 'num_boost_round': 70, 'deterministic': 'true', 'verbose': -1, 'seed': 8, 'monotone_constraints': [1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1]}


100%|████████████████████████████████████████████████████████████████████████████| 1304/1304 [1:55:28<00:00,  5.31s/it]


Unnamed: 0,test_sets,pairwise_ranking_score,raw_pairwise_ranking_score,win_loss_top_1,win_loss_top_3,win_loss_top_5,win_loss_top_10,interview_win_portion,interview_win_loss_1,interview_win_loss_0.5,...,interview_win_loss_0.2,avg_started_ranks,10pct_started_ranks,25pct_started_ranks,50pct_started_ranks,75pct_started_ranks,90pct_started_ranks,95pct_started_ranks,99pct_started_ranks,max_started_ranks
0,packet2start,0.59685,0.582618,0.414815,0.340123,0.328395,0.324464,0.742212,42.0,122.5,...,170.8,2.431095,1.0,1.0,2.0,3.0,4.0,6.0,11.18,21.0
1,packet2si,0.52066,0.515134,0.485924,0.461036,0.462322,0.462384,,,,...,,3.944223,1.0,2.0,3.0,5.0,8.0,10.0,20.44,37.0
2,ms_si2start,0.638497,0.60501,0.552174,0.407971,0.40442,0.402733,0.743499,57.0,149.0,...,204.2,1.908714,1.0,1.0,1.0,2.0,3.0,4.0,9.6,15.0
3,ss_si2start,0.515882,0.570093,0.304762,0.292063,0.293333,0.287222,0.555928,-74.0,16.0,...,70.0,2.745283,1.0,1.0,2.0,3.0,5.0,6.0,10.95,16.0
4,ms_search2start,0.758554,0.801908,0.132486,0.098609,0.090442,0.075255,0.729291,28.0,105.0,...,151.2,27.224291,1.0,3.0,9.5,31.0,72.7,103.7,225.74,372.0
5,ss_search2start,0.745239,0.777886,0.120609,0.092116,0.084055,0.067724,0.676236,-37.0,135.0,...,238.2,34.54811,1.0,3.0,10.0,34.0,83.0,149.4,368.56,733.0


[32m[I 2022-06-02 18:24:30,937][0m Trial 0 finished with value: 0.14297919588332592 and parameters: {'num_leaves': 20, 'min_child_samples': 169, 'max_depth': 9, 'max_bin': 40, 'lambda_l1': 1.4952757309482632, 'lambda_l2': 0.0019638249745907474, 'num_boost_round': 70}. Best is trial 0 with value: 0.14297919588332592.[0m


{'boosting': 'gbdt', 'objective': 'binary', 'learning_rate': 0.05, 'num_leaves': 21, 'feature_fraction': 1.0, 'bagging_fraction': 1.0, 'min_child_samples': 200, 'max_depth': 8, 'max_bin': 44, 'lambda_l1': 0.04446056442066039, 'lambda_l2': 0.019975699914073178, 'num_boost_round': 114, 'deterministic': 'true', 'verbose': -1, 'seed': 8, 'monotone_constraints': [1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1]}


 29%|██████████████████████▍                                                      | 381/1304 [47:31<3:46:52, 14.75s/it]

In [None]:
pd.DataFrame(metric_dicts)['pairwise_ranking_score'].sum()