In [1]:
import pandas as pd
import numpy as np

# Tab-separated interaction logs, one file per split.
train_path = "./dataset/dataset-train.inter"
valid_path = "./dataset/dataset-valid.inter"
test_path = "./dataset/dataset-test.inter"
# Side tables: user documents (udoc) and job documents (idoc).
udoc_path = "./dataset/zhilian.udoc"
idoc_path = "./dataset/zhilian.idoc"

# Read the three splits in order; the generator expression keeps its loop
# variable out of the notebook namespace.
train_interact_df, valid_interact_df, test_interact_df = (
    pd.read_csv(split_path, sep="\t")
    for split_path in (train_path, valid_path, test_path)
)

udoc_df = pd.read_csv(udoc_path, sep="\t")
# Malformed lines in the job-document file are reported as warnings and skipped.
idoc_df = pd.read_csv(idoc_path, sep="\t", on_bad_lines="warn")

In [2]:
# Attach the job document and then the user document to every interaction.
# Both joins are inner joins, so an interaction is kept only when its job id
# and its user id are both found in the document tables.
train_df, valid_df, test_df = (
    pd.merge(
        pd.merge(interact_df, idoc_df, how="inner", on="job_id:token"),
        udoc_df,
        how="inner",
        on="user_id:token",
    )
    for interact_df in (train_interact_df, valid_interact_df, test_interact_df)
)

# Summary statistics of the numeric columns, one table per split.
display(train_df.describe())
display(valid_df.describe())
display(test_df.describe())

Unnamed: 0,browsed:label,delivered:label,satisfied:label
count,511515.0,511515.0,511515.0
mean,0.216297,0.094996,0.044278
std,0.411719,0.29321,0.205713
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


Unnamed: 0,browsed:label,delivered:label,satisfied:label
count,63950.0,63950.0,63950.0
mean,0.214167,0.093917,0.043096
std,0.410247,0.291716,0.203075
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


Unnamed: 0,browsed:label,delivered:label,satisfied:label
count,63941.0,63941.0,63941.0
mean,0.212821,0.093258,0.044056
std,0.409305,0.290796,0.205222
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


In [4]:
# Positive (1) vs. negative (0) sample ratios for browsed:label, delivered:label and satisfied:label
from typing import List
def pos_neg_sample_ratio(df, label_fields: List[str]):
    """
    Compute positive/negative sample ratios for label columns.

    The positive count of a column is its sum, and the negative count is the
    number of rows minus that sum.

    Args:
        df: DataFrame holding the label columns (e.g. train_df, valid_df,
            test_df).
        label_fields: a single column name or a list of column names, e.g.
            ["browsed:label", "delivered:label", "satisfied:label"].

    Returns:
        DataFrame indexed by label field with the columns "pos/neg",
        "pos/total" and "neg/total". A ratio with a zero denominator is
        reported as inf (non-zero numerator) or NaN (0/0) without emitting a
        numpy RuntimeWarning.

    Raises:
        KeyError: if a requested label column is not present in ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(label_fields, str):
        label_fields = [label_fields]

    def _ratio(numerator, denominator):
        """Divide, mapping x/0 to inf and 0/0 to NaN explicitly."""
        if denominator == 0:
            return float("nan") if numerator == 0 else float("inf")
        return numerator / denominator

    total = len(df)
    ans = dict()
    for label_field in label_fields:
        pos = df[label_field].sum()
        neg = total - pos
        ans[label_field] = {
            "pos/neg": _ratio(pos, neg),
            "pos/total": _ratio(pos, total),
            "neg/total": _ratio(neg, total),
        }
    # from_dict is a classmethod; call it on the class rather than through
    # whatever instance happened to be passed in as ``df``.
    return pd.DataFrame.from_dict(ans, orient="index")

# Print the ratio table of every split under its own heading.
for heading, split_frame in (
    ("train_df pos/neg sample ratio:\n", train_df),
    ("valid_df pos/neg sample ratio:\n", valid_df),
    ("test_df pos/neg sample ratio:\n", test_df),
):
    print(
        heading,
        pos_neg_sample_ratio(
            split_frame, ["browsed:label", "delivered:label", "satisfied:label"]
        ),
    )

train_df pos/neg sample ratio:
                   pos/neg  pos/total  neg/total
browsed:label    0.275993   0.216297   0.783703
delivered:label  0.104968   0.094996   0.905004
satisfied:label  0.046330   0.044278   0.955722
valid_df pos/neg sample ratio:
                   pos/neg  pos/total  neg/total
browsed:label    0.272536   0.214167   0.785833
delivered:label  0.103652   0.093917   0.906083
satisfied:label  0.045037   0.043096   0.956904
test_df pos/neg sample ratio:
                   pos/neg  pos/total  neg/total
browsed:label    0.270359   0.212821   0.787179
delivered:label  0.102849   0.093258   0.906742
satisfied:label  0.046087   0.044056   0.955944


In [31]:
# Per-user funnel: sum of each label column over all interactions of a user.
user_funnel_train, user_funnel_valid, user_funnel_test = (
    split_frame.groupby("user_id:token")[
        ['browsed:label', 'delivered:label', 'satisfied:label']
    ].sum()
    for split_frame in (train_df, valid_df, test_df)
)

# Per-job funnel: sum of satisfied:label over all interactions of a job.
job_funnel_train, job_funnel_valid, job_funnel_test = (
    split_frame.groupby("job_id:token")[['satisfied:label']].sum()
    for split_frame in (train_df, valid_df, test_df)
)

# Cell output: the distinct per-job satisfied totals in the training split.
job_funnel_train['satisfied:label'].unique()

array([ 0,  1,  3,  6,  4,  2,  5,  7, 15, 12, 11,  9, 10, 20, 16,  8, 13,
       14, 17, 24, 19])

In [17]:
def gen_udoc_idoc_label(df, job_field='job_doc:token_seq'):
    """
    Build the (cv, jd) text pair plus the label columns from a merged frame.

    The cv text starts with "个人简历: " and then appends, for every résumé
    column that is present in ``df``, its header followed by the column value;
    a missing value is rendered as "无 ". All parts are joined with a single
    space. The jd text is the ``job_field`` column converted to a string, with
    a missing value rendered as "".

    Args:
        df: Input dataframe. It must contain ``job_field`` and the columns
            "browsed:label", "delivered:label" and "satisfied:label". It is
            not modified.
        job_field: Column name containing the job description.

    Returns:
        New DataFrame with the columns cv, jd, browsed:label, delivered:label
        and satisfied:label, sharing the index of ``df``. An empty input
        yields an empty frame with the same columns.

    Raises:
        KeyError: if ``job_field`` or one of the label columns is missing.
    """
    # (candidate column names, header) in output order; the first candidate
    # found in ``df`` is used and a section with no candidate is skipped.
    cv_sections = [
        (("experience:token_seq",), "【工作经历】 "),
        (("desire_jd_industry_id:token_seq",), "【期望行业】: "),
        (("cur_industry_id:token_seq",), "【当前行业】"),
        # The original key "jd_type_id:token_seq" never matches a frame whose
        # job-type columns are "desire_jd_type_id:token_seq" and
        # "cur_jd_type:token_seq", so this section was silently dropped.
        # NOTE(review): falling back to the desired job type is an assumption
        # based on the closest column name; confirm the intended column.
        (("jd_type_id:token_seq", "desire_jd_type_id:token_seq"), "【职位标签:】"),
    ]
    label_fields = ["browsed:label", "delivered:label", "satisfied:label"]

    def _as_text(column, missing):
        """Stringify a column, substituting ``missing`` for null entries."""
        return column.astype(str).where(column.notna(), missing)

    # Column-wise concatenation reproduces " ".join(parts) for every row and,
    # unlike a row-wise apply, also works on an empty frame.
    cv = pd.Series("个人简历: ", index=df.index, dtype=object)
    for candidates, header in cv_sections:
        field = next((name for name in candidates if name in df.columns), None)
        if field is None:
            continue
        cv = cv + " " + header + " " + _as_text(df[field], "无 ")

    # Copy only the label columns instead of the whole (wide) input frame.
    result = df[label_fields].copy()
    result.insert(0, "cv", cv)
    result.insert(1, "jd", _as_text(df[job_field], ""))
    return result
# Build the cv/jd/label frame for each split with the default job-text column.
processed_df_train, processed_df_valid, processed_df_test = (
    gen_udoc_idoc_label(split_frame)
    for split_frame in (train_df, valid_df, test_df)
)

In [None]:
# Fraction of missing values per column: job documents first, then user documents.
display(idoc_df.isna().mean())
display(udoc_df.isna().mean())

job_id:token         0.0
job_doc:token_seq    0.0
dtype: float64

user_id:token                      0.000000
desire_jd_industry_id:token_seq    0.000000
desire_jd_type_id:token_seq        0.000000
cur_industry_id:token_seq          0.004253
cur_jd_type:token_seq              0.376539
experience:token_seq               0.000000
dtype: float64

In [5]:
# Missing-value fraction per column after left-joining job documents onto the
# training interactions (unmatched job ids show up as a null job_doc).
pd.merge(train_interact_df, idoc_df, how="left", on="job_id:token").isna().mean()

user_id:token        0.00000
job_id:token         0.00000
browsed:label        0.00000
delivered:label      0.00000
satisfied:label      0.00000
job_doc:token_seq    0.00907
dtype: float64

In [6]:
# Missing-value fraction per column after left-joining user documents onto the
# training interactions.
pd.merge(train_interact_df, udoc_df, how="left", on="user_id:token").isna().mean()

user_id:token                      0.000000
job_id:token                       0.000000
browsed:label                      0.000000
delivered:label                    0.000000
satisfied:label                    0.000000
desire_jd_industry_id:token_seq    0.000000
desire_jd_type_id:token_seq        0.000000
cur_industry_id:token_seq          0.003464
cur_jd_type:token_seq              0.370341
experience:token_seq               0.000000
dtype: float64

In [7]:
# Preview of the training interactions left-joined with their job documents.
train_interact_df.merge(idoc_df, how="left", on="job_id:token")

Unnamed: 0,user_id:token,job_id:token,browsed:label,delivered:label,satisfied:label,job_doc:token_seq
0,125f691de44821d5d170b0ef3f59de57,ed2a4741f3254dfc6020b0f0be715927,0,0,0,1、法学专业本科；2、能够熟练操作办公软件；草拟法律文书，资料整理、数据汇总3、工作踏实勤奋...
1,25c0d95add228a30a6453448b6e2b296,f99825d37c3c7d853b0f4f31264d0b25,0,0,0,1、所负责区域内教育主管单位及学校的关系维护；2、基于产品及用户开展培训及运营工作（拓校、活...
2,05f5d36ed65cc7204ac8a6edfa6df36f,3ee1f23ca787d127fc86ee5bd8a3ace3,0,0,0,岗位职责:1、负责经销商及经销商业务人员的管理与培训； 2、负责公司各项销售活动在终端的推...
3,0849ae1528bd1f22c0e32d6a3469cec7,3e2e7dd3c55a9e5799b8865c04896135,0,0,0,1.20-25岁，160cm以上，性格开朗随和，有亲和力，衣着整齐大方得体。2.普通话标准。...
4,07068262c856c973916a6401d0a7669e,addde24d60b96765233eece119d4e8d0,0,0,0,岗位职责：1.负责部门文档（资料、档案等）的收集、整理、归档、借阅、盘点等工作；2.负责将档...
...,...,...,...,...,...,...
516192,005ee1b3219a6e71301a7d8ec9399d3f,d0429566419c44b72141b8b4daa9ec05,0,0,0,岗位职责:1、根据市场情况合理分配目标，完成公司下达的销售目标2、根据公司的市场发展策略，制...
516193,0ac95ab0a4278eddda27abdb0c5ea5ca,9febf46d62766a53609dae78d8dc02b6,0,0,0,深圳市盐田区政府行政服务大厅招聘:22名窗口服务人员，从事盐田辖区行政服务大厅窗口工作，具体...
516194,0247bbd12fb01609d049abe3d818f604,d8591738135318317be0b2a8f8efee53,1,0,0,岗位职责简述：1、负责编制项目《工程策划报告》，布置现场总平面，工程施工总体进度计划；2、负...
516195,16505bedd29a4acfdbe53736ffdd08ff,1958b10e859b753b71d111dff568730a,0,0,0,岗位职责： 1.经销商管理：业务沟通和谈判，销售活动组织，供应链管理2.销售目标达成：达成公...


In [9]:
# Fraction of missing values per column in the merged training frame.
train_df.isna().mean()

user_id:token                      0.000000
job_id:token                       0.000000
browsed:label                      0.000000
delivered:label                    0.000000
satisfied:label                    0.000000
job_doc:token_seq                  0.000000
desire_jd_industry_id:token_seq    0.000000
desire_jd_type_id:token_seq        0.000000
cur_industry_id:token_seq          0.003478
cur_jd_type:token_seq              0.370478
experience:token_seq               0.000000
dtype: float64

In [10]:
# Mean of each label column over the merged training frame.
display(
    train_df.loc[:, ["browsed:label", "delivered:label", "satisfied:label"]].mean()
)
# Cell output: the first merged training row, kept as a one-row DataFrame.
train_df.iloc[[0]]

browsed:label      0.216297
delivered:label    0.094996
satisfied:label    0.044278
dtype: float64

Unnamed: 0,user_id:token,job_id:token,browsed:label,delivered:label,satisfied:label,job_doc:token_seq,desire_jd_industry_id:token_seq,desire_jd_type_id:token_seq,cur_industry_id:token_seq,cur_jd_type:token_seq,experience:token_seq
0,125f691de44821d5d170b0ef3f59de57,ed2a4741f3254dfc6020b0f0be715927,0,0,0,1、法学专业本科；2、能够熟练操作办公软件；草拟法律文书，资料整理、数据汇总3、工作踏实勤奋...,"专业服务/咨询(财会/法律/人力资源等),保险,教育/培训/院校",兼职,保险,,法务|调查|自我评价|财险|合规|律师


{'user_id:token': '07552f602eeeff09f20db4bc3c32f398',
 'job_id:token': 'ea71367aaaf79e0057ea7f804b76f142',
 'browsed:label': 1,
 'delivered:label': 1,
 'satisfied:label': 0,
 'job_doc:token_seq': '工作职责1.协助培训经理进行培训需求调研，培训课程的开发及运营标准化建设。2.根据公司总部培训计划，实施影院部及各影城单店的培训授课。3.及时对培训效果进行测评及追踪，出具培训效果评估报告。4.负责企业培训文化氛围的建设与完善，宣导企业文化、价值观。5.完成领导交办的其他工作。任职条件1.酒店管理、工商管理等专业专科以上学历。2.具有三年以上影院运营相关工作经验，有大型连锁影院行业培训工作经验优先。3.熟悉影院各岗位培训管理。4.具有良好的职业道德，踏实稳重，责任心强，有较强的沟通能力。',
 'desire_jd_industry_id:token_seq': '保险,教育/培训/院校,其他',
 'desire_jd_type_id:token_seq': '培训经理/主管,企业培训师/讲师',
 'cur_industry_id:token_seq': '教育/培训/院校',
 'cur_jd_type:token_seq': '教育/培训',
 'experience:token_seq': '教育规划|立项|体系|银行|辅导|计算|教育培训|演讲|教材|基金|教学|安排|领导|线上|行销|协助|企业|编辑|金融机构|制度|销售|推进|维护|电话|档案|挖掘|需求|反馈|调查|国家|采购|沟通|客户关系|资料整理|研究员|理财师|包装|人事主管|班主任|完善|教育|计划|综合|机构|讲师|员工|处理|录制|人寿|儿童|组织|责任心|考核|视频|投资理财|运作|建行|方案|课件|设计|保险公司|主题|学员|爱心|消费|内训|培训班|资料|商学院|业务|业务部门|金融|主管|理财|配置|实施|定制|后期|课程|内容|客户|培训|规划|市场营销|校园|课程体系|撰写|流程|养老|制作|编制|亲和力|资产|产品|员工关系|评估|总经理助理'}

In [18]:
# Preview of the cur_jd_type column as a one-column DataFrame.
train_df.loc[:, ["cur_jd_type:token_seq"]]

Unnamed: 0,cur_jd_type:token_seq
0,
1,
2,房地产开发/经纪/中介
3,
4,医院/医疗/护理
...,...
511510,土木/建筑/装修/市政工程
511511,公关/媒介
511512,房地产开发/经纪/中介
511513,销售管理


In [19]:
# Reload the processed cv/jd/label tables from disk.
# NOTE(review): the cell that writes these CSV files is not visible in this
# notebook view; confirm they are produced from the processed_df_* frames.
processed_train, processed_valid, processed_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./dataset/processed_train.csv",
        "./dataset/processed_valid.csv",
        "./dataset/processed_test.csv",
    )
)

# Cell output: preview of the processed training table.
processed_train

Unnamed: 0,cv,jd,browsed:label,delivered:label,satisfied:label
0,个人简历: 【工作经历】 法务|调查|自我评价|财险|合规|律师 【期望行业】: 专业...,1、法学专业本科；2、能够熟练操作办公软件；草拟法律文书，资料整理、数据汇总3、工作踏实勤奋...,0,0,0
1,个人简历: 【工作经历】 讲解|英语老师|行政|辅导|生产|检查|办公室主任|教育培训|...,1、所负责区域内教育主管单位及学校的关系维护；2、基于产品及用户开展培训及运营工作（拓校、活...,0,0,0
2,个人简历: 【工作经历】 计划|预算|综合|银行|按揭|处理|office|接待|房地产...,岗位职责:1、负责经销商及经销商业务人员的管理与培训； 2、负责公司各项销售活动在终端的推...,0,0,0
3,个人简历: 【工作经历】 活动|教师资格|普通话|外国语言文学|教师|辅导|老师|前台|...,1.20-25岁，160cm以上，性格开朗随和，有亲和力，衣着整齐大方得体。2.普通话标准。...,0,0,0
4,个人简历: 【工作经历】 档案管理|事务管理|接待|组织|保证|公文|促销|协调|药房|...,岗位职责：1.负责部门文档（资料、档案等）的收集、整理、归档、借阅、盘点等工作；2.负责将档...,0,0,0
...,...,...,...,...,...
511510,个人简历: 【工作经历】 互联网参考模型osi七层|现场|施工|材料|cad|项目预算|...,岗位职责:1、根据市场情况合理分配目标，完成公司下达的销售目标2、根据公司的市场发展策略，制...,0,0,0
511511,个人简历: 【工作经历】 赛事|图片|现场|传播|脚本撰写|机构|导师|完美|处理|媒体...,深圳市盐田区政府行政服务大厅招聘:22名窗口服务人员，从事盐田辖区行政服务大厅窗口工作，具体...,0,0,0
511512,个人简历: 【工作经历】 体系|模式|建设|开发流程|安置|设计规划|规划设计|经理|乡...,岗位职责简述：1、负责编制项目《工程策划报告》，布置现场总平面，工程施工总体进度计划；2、负...,1,0,0
511513,个人简历: 【工作经历】 计划|互联网参考模型osi七层|安排|计算机等级|店长|人力|...,岗位职责： 1.经销商管理：业务沟通和谈判，销售活动组织，供应链管理2.销售目标达成：达成公...,0,0,0


In [21]:
# Fraction of missing values per column in each reloaded processed table.
display(processed_train.isna().mean())
display(processed_valid.isna().mean())
display(processed_test.isna().mean())

cv                 0.0
jd                 0.0
browsed:label      0.0
delivered:label    0.0
satisfied:label    0.0
dtype: float64

cv                 0.0
jd                 0.0
browsed:label      0.0
delivered:label    0.0
satisfied:label    0.0
dtype: float64

cv                 0.0
jd                 0.0
browsed:label      0.0
delivered:label    0.0
satisfied:label    0.0
dtype: float64