# 字节跳动岗位分析部分

> 20190105 曾一凡
>
> 爬虫大作业
>
> 2022年6月23日

引入画图库 `lets-plot`。在 Linux 环境下直接执行 `pip install lets-plot` 即可安装，详情请参考 `https://lets-plot.org/`

In [None]:
# Plotting API, geocoding helpers and the `as_discrete` mapping helper from lets-plot.
from lets_plot import *
from lets_plot.geo_data import *
from lets_plot.mapping import as_discrete
# Configure lets-plot's HTML output for this notebook.
# NOTE(review): offline=True presumably embeds the JS library instead of
# loading it from a CDN — confirm against the lets-plot documentation.
LetsPlot.setup_html(offline=True)

## 测试数据库读取

本次数据全部存储在 `MongoDB` 数据库中，并利用 `MongoDB` 自带的数据处理功能（聚合管道）进行部分预处理

In [None]:
from pymongo import MongoClient
import pandas as pd
import numpy as np

读取一条进行测试

In [None]:
# Open a connection to the MongoDB server on localhost:27017.
client = MongoClient(host='localhost', port=27017)

# Fetch a single raw posting to confirm the collection can be read.
bytedance_jobs = client['jobspider_raw']['bytedance']
bytedance_jobs.find_one()

## 岗位类型及就职要求分析

### 数据预处理

去除 `job_category` 或 `city_list` 为 `None`、以及 `requirement` 或 `description` 为空字符串的职位，并对 `job_category` 值进行拆分，提取大类（`categories_l1`）与子类（`categories_l2`）字段。

In [None]:
# (rejected value, field path) pairs: a posting is kept only when every listed
# field differs from its rejected value.
rejected_field_values = [
    ('', '$requirement'),
    ('', '$description'),
    (None, '$job_category'),
    (None, '$city_list'),
]

# Stage 1: filter postings using the pairs above.
match_stage = {
    '$match': {
        '$expr': {
            '$and': [
                {'$ne': [rejected, field_path]}
                for rejected, field_path in rejected_field_values
            ]
        }
    }
}

# Stage 2: keep the text fields and flatten the nested category / recruit-type
# names; the first element of `city_list` is parked in a temporary field.
project_stage = {
    '$project': {
        'id': 1,
        'title': 1,
        'description': 1,
        'requirement': 1,
        'categories_l2': '$job_category.name',
        'categories_l1': '$job_category.parent.name',
        'job_type': '$recruit_type.name',
        'recruit_type': '$recruit_type.parent.name',
        'job_hot_flag': 1,
        'workLocations_tmp': {'$arrayElemAt': ['$city_list', 0]},
    }
}

# Stages 3-4: expose the name of that first city as `workLocations`, then drop
# the temporary field.
add_location_stage = {'$addFields': {'workLocations': '$workLocations_tmp.name'}}
drop_tmp_stage = {'$unset': 'workLocations_tmp'}

result = client['jobspider_raw']['bytedance'].aggregate(
    [match_stage, project_stage, add_location_stage, drop_tmp_stage]
)

In [None]:
# Drain the aggregation cursor into memory, then load it as a DataFrame
# (one row per posting).
job_records = list(result)
categories_df = pd.DataFrame(job_records)

# Preview the first rows.
categories_df.head()

In [None]:
# Backfill a missing top-level category with the posting's own category name.
# (Presumably these are categories that have no parent — confirm against the data.)
# Done as one vectorised, label-based assignment: the previous row loop indexed
# with `iloc[row.Index]`, which is only correct while the frame keeps its
# default 0..n-1 index.
missing_l1 = categories_df['categories_l1'].isna()
categories_df.loc[missing_l1, 'categories_l1'] = categories_df.loc[missing_l1, 'categories_l2']

### 职位类型分析

In [None]:
# Stacked bar chart: one bar per top-level category, segments coloured by
# sub-category, both ordered by count. The legend is hidden.
job_category_chart = (
    ggplot(categories_df)
    + geom_bar(
        aes(
            x=as_discrete('categories_l1', order_by='..count..'),
            fill=as_discrete('categories_l2', order_by='..count..'),
        ),
        size=0.5,
        show_legend=False,
    )
    + ggsize(1200, 700)
    + ggtitle("字节跳动职位类型")
)
job_category_chart

### 职位招聘类型分析

可以看出网站上列出的全部职位均为社招

In [None]:
# Dodged bar chart of posting counts per recruit type, split by top-level category.
recruit_type_chart = (
    ggplot(categories_df)
    + geom_bar(
        aes(
            x='recruit_type',
            fill=as_discrete('categories_l1', order_by='..count..'),
        ),
        size=0.5,
        position='dodge',
        show_legend=True,
    )
    + ggsize(600, 300)
    + ggtitle("字节跳动职位招聘类型分布")
)
recruit_type_chart

可以看出大部分职位都是正式岗位，少量为外包和第三方派遣

In [None]:
# Dodged bar chart of posting counts per job type, split by top-level category.
job_type_chart = (
    ggplot(categories_df)
    + geom_bar(
        aes(
            x='job_type',
            fill=as_discrete('categories_l1', order_by='..count..'),
        ),
        size=0.5,
        position='dodge',
        show_legend=True,
    )
    + ggsize(600, 300)
    + ggtitle("字节跳动职位任职类型要求")
)
job_type_chart

## 工作地点及部门分析

In [None]:
# Count postings per (city, top-level category); after `count()` every
# remaining column (e.g. `_id`) holds the number of non-null values per group.
locations_df = categories_df.groupby(["workLocations", "categories_l1"]).count()

# Cities left out of the map.
# NOTE(review): the notebook does not say why — presumably they could not be
# geocoded; confirm.
excluded_cities = ["斯德哥尔摩", "宜宾"]
# errors="ignore": a fresh crawl may not contain these cities at all, and a
# missing label must not abort the analysis with a KeyError.
locations_df.drop(index=excluded_cities, inplace=True, errors="ignore")

In [None]:
# City part of every (city, category) index entry, one per row of locations_df
# (so a city appears once per category it has postings in).
raw_city_list = [index_key[0] for index_key in locations_df.index]

# Place names that the geocoder cannot find when queried in Chinese.
english_dic = {
    "中国台湾": "Taiwan",
    "中国澳门": "Macao",
    "中国香港": "Hongkong",
    "伦敦": "London",
    "华沙": "Warsaw",
    "圣地亚哥": "San Diego",
    "圣彼得堡": "Saint Petersburg",
    "米兰": "Milan",
    "卡萨布兰卡": "Casablanca",
    "古尔冈": "Gurugram",
    "圣保罗": "São Paulo",
    "好莱坞": "West Hollywood",
    "约翰内斯堡": "Johannesburg",
    "西雅图": "Seattle",
}

# Cities that share their name with a county; the suffixed form is queried instead.
city_dic = {
    "中山": "中山市",
    "乌鲁木齐": "乌鲁木齐市",
    "九江": "九江市",
    "南京": "南京市",
    "南昌": "南昌市",
    "南通": "南通市",
    "大连": "大连市",
    "岳阳": "岳阳市",
    "新乡": "新乡市",
    "江门": "江门市",
    "洛阳": "洛阳市",
    "淮安": "淮安市",
    "眉山": "眉山市",
    "石家庄": "石家庄市",
    "南平": "南平市",
    "南阳": "南阳市",
    "唐山": "唐山市",
    "安庆": "安庆市",
    "宜宾": "宜宾市",
    "昆山": "昆山市",
    "桂林": "桂林市",
    "泰安": "泰安市",
    "湘潭": "湘潭市",
    "濮阳": "濮阳市",
    "红河": "蒙自市",
    "舟山": "舟山市",
    "资阳": "资阳市",
    "镇江": "镇江市",
    "长春": "长春市",
    "阳江": "阳江市",
}

# Rewrite each raw name: the English override wins, then the disambiguated
# Chinese name, otherwise the name is passed through unchanged.
process_city_list = [
    english_dic.get(city_name, city_dic.get(city_name, city_name))
    for city_name in raw_city_list
]

# Geocode the rewritten names and keep the centroid of each match.
centroids = geocode_cities(process_city_list).get_centroids()

In [None]:
# Join key for the map layer. `centroids` was geocoded from `process_city_list`
# (the rewritten query names, built row-for-row from this same index), and
# lets-plot's geocoder result is keyed by the *requested* name. Using the raw
# index names here left every renamed city (e.g. "伦敦" -> "London",
# "南京" -> "南京市") without a matching centroid, so the key must be the
# rewritten name. Tooltips that show `@city` will display the query name.
locations_df["city"] = process_city_list
# Materialise the category index level as a regular column for aesthetics/tooltips.
locations_df["categories_l1"] = [index_key[1] for index_key in locations_df.index]

In [None]:
# Tooltip shown for each point: city, posting count and department.
job_tooltips = (
    layer_tooltips()
    .line('@city')
    .line('Job Count|@_id')
    .line('Department|@categories_l1')
)

# Interactive map: one point per (city, category) row, sized by the posting
# count in `_id` and coloured by category; coordinates come from `centroids`
# joined on `city`.
job_map = (
    ggplot()
    + geom_livemap(zoom=5)
    + geom_point(
        aes(size='_id', color="categories_l1"),
        alpha=0.9,
        data=locations_df,
        map=centroids,
        map_join='city',
        tooltips=job_tooltips,
    )
    + scale_color_brewer(type='qual', palette='Set1')
    + ggsize(1000, 600)
)
job_map

## 工作内容及岗位要求描述语言分析

### 按词性分析总体情况

In [None]:
# Chinese word segmentation with part-of-speech tagging (jieba), word-cloud
# rendering (wordcloud) and progress bars (tqdm).
import jieba
import jieba.posseg as pseg
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import numpy as np
from PIL import Image
from tqdm.notebook import tqdm
import paddle
# Paddle is switched to static mode before jieba's paddle mode is enabled.
paddle.enable_static()
jieba.enable_paddle() # Enable paddle mode; supported from jieba 0.40 onwards, not by earlier versions

# Matplotlib, configured for 'retina' inline figures.
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('retina')
import matplotlib.font_manager as font_manager

# Register every font file found under the local `fonts` directory so that
# matplotlib can resolve the family selected below.
font_dir = ['fonts']
for font in font_manager.findSystemFonts(font_dir):
    font_manager.fontManager.addfont(font)

# Default font family for all figures.
# NOTE(review): presumably chosen so Chinese titles render — confirm the
# family name matches a file shipped in `fonts`.
plt.rcParams['font.family'] = 'FZLanTingHeiS-R-GB'

from collections import OrderedDict

In [None]:
# Flat lists of every (word, flag) token produced for all postings.
requirement_seg_list = []
description_seg_list = []

# Walk the two text columns in lockstep, segmenting the requirement text and
# then the description text of each posting in paddle mode.
text_columns = zip(categories_df['requirement'], categories_df['description'])
for requirement_text, description_text in tqdm(text_columns, total=len(categories_df)):
    requirement_seg_list.extend(pseg.cut(requirement_text, use_paddle=True))
    description_seg_list.extend(pseg.cut(description_text, use_paddle=True))

In [None]:
# Table of part-of-speech tags; the `pseg` column is the join key, and the
# remaining columns (e.g. `desc`) are merged into the statistics below.
word_pseg_df = pd.read_csv("word_pseg.csv")

# source -> tag -> list of words, with one empty bucket per tag listed in the CSV.
word_dict = {
    "requirement": OrderedDict((tag, []) for tag in word_pseg_df["pseg"]),
    "description": OrderedDict((tag, []) for tag in word_pseg_df["pseg"]),
}

# Drop every token into the bucket for its tag; tokens whose tag is not listed
# in the CSV are skipped.
token_sources = (
    ("requirement", requirement_seg_list),
    ("description", description_seg_list),
)
for source_name, tokens in token_sources:
    tag_buckets = word_dict[source_name]
    for token in tokens:
        bucket = tag_buckets.get(token.flag)
        if bucket is not None:
            bucket.append(token.word)

# One row per (source, tag) holding the number of collected words.
word_stat = [
    {"source": source_name, "pseg": tag, "count": len(words)}
    for source_name, tag_buckets in word_dict.items()
    for tag, words in tag_buckets.items()
]

# Attach the CSV's extra columns to each row of the statistics.
word_stat_df = pd.DataFrame(word_stat).merge(word_pseg_df, on='pseg', how='left')
word_stat_df.head()

In [None]:
# Tooltip: tag code, tag description and the word count. The last line used to
# be a bare '@', which names no variable; it now references the `count` column.
pseg_tooltips = (
    layer_tooltips()
    .line('@pseg')
    .line('@desc')
    .line('@count')
)

# Dodged bars: word count per part-of-speech description, one bar per text source.
pseg_count_chart = (
    ggplot(word_stat_df, aes(x='desc', y='count', fill='source'))
    + geom_bar(stat='identity', position='dodge', tooltips=pseg_tooltips)
    + ggsize(1200, 700)
)
pseg_count_chart

In [None]:
# Word-cloud renderer shared by the per-part-of-speech grid (1000x500 canvas).
wc = WordCloud(
    width=1000,
    height=500,
    background_color='white',  # canvas background colour
    font_path='fonts/FZLTHJW.TTF',  # font file used to draw the words
    max_words=500,  # upper bound on the number of words drawn
    max_font_size=100,  # largest font size used
    stopwords=STOPWORDS,  # stop-word set shipped with wordcloud
    random_state=50,  # fixed random_state (seed), not a count of colour schemes
)

In [None]:
# Grid of word clouds: one row per part-of-speech tag, one column per text
# source. The grid shape is derived from `word_dict` instead of a hard-coded
# 28 rows, so a different tag list in word_pseg.csv does not break indexing.
n_rows = max(len(pseg_dict) for pseg_dict in word_dict.values())
n_cols = len(word_dict)

# tag -> description, keeping the first description when a tag is repeated.
pseg_desc = word_pseg_df.drop_duplicates('pseg').set_index('pseg')['desc'].to_dict()

# The height scales with the row count (170 for the original 28 rows).
# squeeze=False keeps `ax` two-dimensional even for a single row.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 170 * n_rows / 28), squeeze=False)
for id_part, (part_name, pseg_dict) in enumerate(word_dict.items()):
    for id_pseg, (p, wl) in tqdm(enumerate(pseg_dict.items()), total=len(pseg_dict)):
        cell = ax[id_pseg][id_part]
        cell.axis('off')
        cell.set_title(f"{part_name} {pseg_desc[p]}")
        # A tag with no collected words has nothing to draw; skip it and leave
        # the titled panel blank, as the per-category grid does.
        if not wl:
            continue
        cell.imshow(wc.generate(' '.join(wl)))

### 分析不同类别的工作描述与工作要求

In [None]:
# Top-level categories ordered from most to fewest postings.
postings_per_category = categories_df.groupby(["categories_l1"])["_id"].count()
cat_val = list(postings_per_category.sort_values(ascending=False).index)

# source -> category -> flat list of segmented tokens, initially empty.
seg_cat_dic = {
    "requirement": OrderedDict((category, []) for category in cat_val),
    "description": OrderedDict((category, []) for category in cat_val),
}

# Segment both texts of every posting (paddle mode) and file the tokens under
# the posting's top-level category.
posting_fields = zip(
    categories_df['categories_l1'],
    categories_df['requirement'],
    categories_df['description'],
)
for category, requirement_text, description_text in tqdm(posting_fields, total=len(categories_df)):
    seg_cat_dic['requirement'][category].extend(pseg.cut(requirement_text, use_paddle=True))
    seg_cat_dic['description'][category].extend(pseg.cut(description_text, use_paddle=True))

In [None]:
# Part-of-speech tags that get a word cloud in the per-category analysis.
care_pseg = ["n", "nz", "v", "vn", "a"]

# source -> category -> tag -> list of words, initially empty.
word_dict = {
    part: OrderedDict(
        (category, OrderedDict((tag, []) for tag in care_pseg))
        for category in cat_val
    )
    for part in ("requirement", "description")
}

# Distribute the segmented tokens by tag; tokens carrying any other tag are dropped.
for part, content in seg_cat_dic.items():
    for cat, tokens in content.items():
        tag_buckets = word_dict[part][cat]
        for token in tokens:
            bucket = tag_buckets.get(token.flag)
            if bucket is not None:
                bucket.append(token.word)

In [None]:
# Word-cloud renderer for the per-category grids (square 700x700 canvas).
wc = WordCloud(
    width=700,
    height=700,
    background_color='white',  # canvas background colour
    font_path='fonts/FZLTHJW.TTF',  # font file used to draw the words
    max_words=500,  # upper bound on the number of words drawn
    max_font_size=100,  # largest font size used
    stopwords=STOPWORDS,  # stop-word set shipped with wordcloud
    random_state=50,  # fixed random_state (seed), not a count of colour schemes
)

In [None]:
# One figure per text source: rows are categories, columns are the selected
# part-of-speech tags. Every panel gets a title; panels without words stay blank.
for part_name, cat_dict in word_dict.items():
    fig, ax = plt.subplots(len(cat_val), len(care_pseg), figsize=(23, 40))
    for row_idx, (cat_name, pseg_dict) in tqdm(enumerate(cat_dict.items()), total=len(cat_val)):
        for col_idx, (tag, words) in enumerate(pseg_dict.items()):
            panel = ax[row_idx][col_idx]
            # Description of the tag, taken from the first matching CSV row.
            tag_desc = word_pseg_df[word_pseg_df['pseg']==tag]['desc'].values[0]
            panel.axis('off')
            panel.set_title(f"{part_name} {cat_name} {tag_desc}")
            if words:
                panel.imshow(wc.generate(' '.join(words)))