# 腾讯岗位分析部分

> 20190105 曾一凡
>
> 爬虫大作业
>
> 2022年6月25日

引入画图库，在 Linux 环境下直接 `pip install lets-plot` 可以安装，详情请参考 `https://lets-plot.org/`

In [None]:
# Plotting library used by every chart in this notebook. Later cells call
# names such as ggplot, aes, geom_bar and geocode_cities unqualified, so they
# are expected to come from these star imports.
from lets_plot import *
from lets_plot.geo_data import *
from lets_plot.mapping import as_discrete
# Set up lets-plot HTML rendering for the notebook with offline=True.
# NOTE(review): presumably this avoids fetching the JS library over the
# network when rendering — confirm against the lets-plot documentation.
LetsPlot.setup_html(offline=True)

## 测试数据库读取

本次数据全部存储在 `MongoDB` 数据库中，并利用 `MongoDB` 自带的聚合（aggregation）功能完成部分预处理。

In [None]:
from pymongo import MongoClient
import pandas as pd

读取一条进行测试

In [None]:
# Connect to the MongoDB server on localhost:27017. `client` is reused by the
# aggregation cells further down.
client = MongoClient(host='localhost', port=27017)

# Fetch a single document from jobspider_raw.tencent as a connectivity check;
# being the last expression of the cell, the document is displayed.
tencent_collection = client['jobspider_raw']['tencent']
tencent_collection.find_one()

## 岗位类型及就职要求分析

### 数据预处理

去除 `BGName`、`Responsibility`、`CategoryName` 为空字符串的岗位，并只保留后续分析所需的字段（`Id`、`CategoryName`、`BGName`、`Responsibility`、`RecruitPostName`）。

In [None]:
# Keep only postings whose business group, responsibility text and category
# are not the empty string, and project just the fields the later cells read.
# `_id` is not excluded from the projection; later cells use it for counting.
non_empty_fields = ('BGName', 'Responsibility', 'CategoryName')
kept_fields = ('Id', 'CategoryName', 'BGName', 'Responsibility', 'RecruitPostName')

tencent_jobs = client['jobspider_raw']['tencent']
result = tencent_jobs.aggregate([
    {
        '$match': {
            '$expr': {
                '$and': [{'$ne': ["", '$' + field]} for field in non_empty_fields],
            },
        },
    },
    {
        '$project': {field: 1 for field in kept_fields},
    },
])

In [None]:
# Drain the aggregation cursor into memory, then build the DataFrame that the
# category chart and the text-analysis cells work from.
category_records = list(result)
categories_df = pd.DataFrame(category_records)

# Preview the first rows (displayed as the cell output).
categories_df.head()

### 职位类型分析

腾讯事业群简写对照：

- CDG 企业发展事业群
- CSIG 云与智慧产业事业群
- IEG 互动娱乐事业群
- PCG 平台与内容事业群
- WXG 微信事业群
- TEG 技术工程事业群
- S1 职能系统－职能线
- S2 职能系统－财经线
- S3 职能系统－HR与管理线
- TME 腾讯音乐

In [None]:
# Bar chart of posting counts per business group (x), with each bar filled by
# job category; both discrete scales are ordered by count. The legend is
# hidden via show_legend=False.
(
    ggplot(categories_df)
    + geom_bar(
        aes(
            x=as_discrete('BGName', order_by='..count..'),
            fill=as_discrete('CategoryName', order_by='..count..'),
        ),
        size=0.5,
        show_legend=False,
    )
    + ggsize(1200, 700)
    + ggtitle("腾讯职位类型")
)

## 工作地点及部门分析

In [None]:
# Select postings whose LocationName is not the empty string and keep only
# the location, business group and country fields.
location_filter = {'$expr': {'$ne': ["", '$LocationName']}}
location_projection = {'LocationName': 1, 'BGName': 1, 'CountryName': 1}

result = client['jobspider_raw']['tencent'].aggregate([
    {'$match': location_filter},
    {'$project': location_projection},
])

In [None]:
# Load the location records, keep only postings located in China, and count
# rows per (city, business group) pair. After count(), every remaining column
# (including `_id`) holds the non-null row count of its group.
all_locations = pd.DataFrame(list(result))
in_china = all_locations["CountryName"].eq('中国')
locations_df = all_locations.loc[in_china].groupby(["LocationName", "BGName"]).count()

# Preview the first rows (displayed as the cell output).
locations_df.head()

In [None]:
# The index of locations_df is (LocationName, BGName), so level 0 is the city
# name, repeated once for every business group present in that city.
raw_city_list = [i[0] for i in locations_df.index]

# Place names that cannot be found when searched for in Chinese.
english_dic = {"香港（中国）": "Hongkong"}

# Cities that share their name with a county; the explicit "市" (city) suffix
# is used as the geocoding request instead.
city_dic = {"中山": "中山市", "乌鲁木齐": "乌鲁木齐市",
            "九江": "九江市", "南京": "南京市",
            "南昌": "南昌市", "南通": "南通市",
            "大连": "大连市", "岳阳": "岳阳市",
            "新乡": "新乡市", "江门": "江门市",
            "洛阳": "洛阳市", "淮安": "淮安市",
            "眉山": "眉山市", "石家庄": "石家庄市",}

# One lookup table for both substitutions. english_dic is merged last so it
# keeps the precedence it had in the original if/elif chain.
geocode_name_overrides = {**city_dic, **english_dic}
process_city_list = [geocode_name_overrides.get(city, city) for city in raw_city_list]

# Ask the geocoder for each distinct city only once. process_city_list holds
# one entry per (city, business group) row, so it contains many duplicates.
# dict.fromkeys de-duplicates while preserving first-seen order.
unique_city_requests = list(dict.fromkeys(process_city_list))
centroids = geocode_cities(unique_city_requests).get_centroids()

In [None]:
# `centroids` was geocoded with the substituted request names from
# `english_dic` / `city_dic` (e.g. "中山" -> "中山市"), and the map layer joins
# data to map on "city". Store the same substituted names here; with the raw
# index values, every overridden city fails the join and is missing from the
# map.
geocode_names = {**city_dic, **english_dic}
locations_df["city"] = [geocode_names.get(i[0], i[0]) for i in locations_df.index]

# Level 1 of the (LocationName, BGName) index is the business group.
locations_df["department"] = [i[1] for i in locations_df.index]

In [None]:
# Interactive map: one point per (city, business group) row, sized by the
# posting count stored in `_id` and coloured by business group. Point
# positions come from `centroids`, joined to the data on the "city" column.
point_tooltips = (
    layer_tooltips()
    .line('@city')
    .line('Job Count|@_id')
    .line('Department|@department')
)
job_points = geom_point(
    aes(size='_id', color="department"),
    alpha=0.9,
    data=locations_df,
    map=centroids,
    map_join='city',
    tooltips=point_tooltips,
)
(
    ggplot()
    + geom_livemap(zoom=5)
    + job_points
    + scale_color_brewer(type='qual', palette='Set1')
    + ggsize(1000, 600)
)

## 工作内容及岗位要求描述语言分析

### 按词性分析总体情况

In [None]:
# Text-analysis stack: jieba (with its part-of-speech tagger `pseg`) for
# Chinese word segmentation, wordcloud for rendering, tqdm for progress bars.
import jieba
import jieba.posseg as pseg
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import numpy as np
from PIL import Image
from tqdm.notebook import tqdm
import paddle
# Static mode is switched on before jieba's paddle mode is enabled.
# NOTE(review): presumably required by jieba's paddle backend with newer
# paddle releases — confirm.
paddle.enable_static()
jieba.enable_paddle() # Enable paddle mode; supported since jieba 0.40, not available in earlier versions

# matplotlib setup: inline figures are emitted in 'retina' format.
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('retina')
import matplotlib.font_manager as font_manager

# Add every font at the specified location
font_dir = ['fonts']
for font in font_manager.findSystemFonts(font_dir):
    font_manager.fontManager.addfont(font)

# Default font family for all matplotlib text (axes titles below contain
# Chinese). NOTE(review): assumes a font with this family name is among the
# files registered from ./fonts — confirm.
plt.rcParams['font.family'] = 'FZLanTingHeiS-R-GB'

from collections import OrderedDict

In [None]:
# Part-of-speech tag every posting's Responsibility text and collect all
# tokens into one flat list. description_seg_list is initialised here but not
# filled by this cell.
requirement_seg_list = []
description_seg_list = []
for responsibility_text in tqdm(categories_df['Responsibility'], total=len(categories_df)):
    # use_paddle=True selects jieba's paddle mode for tagging.
    requirement_seg_list.extend(pseg.cut(responsibility_text, use_paddle=True))

In [None]:
# Bucket the tagged tokens by part-of-speech tag. Only tags listed in the
# "pseg" column of word_pseg.csv get a bucket; tokens carrying any other tag
# are dropped.
word_dict = {"requirement": OrderedDict()}
word_pseg_df = pd.read_csv("word_pseg.csv")

requirement_buckets = word_dict["requirement"]
for tag in word_pseg_df["pseg"]:
    requirement_buckets[tag] = []

for token in requirement_seg_list:
    bucket = requirement_buckets.get(token.flag)
    if bucket is not None:
        bucket.append(token.word)

# One row per (source, tag) holding the number of tokens collected for it.
word_stat = [
    {"source": source, "pseg": tag, "count": len(words)}
    for source, buckets in word_dict.items()
    for tag, words in buckets.items()
]

# Attach the remaining columns of the tag table (e.g. "desc") to each row.
word_stat_df = pd.DataFrame(word_stat).merge(word_pseg_df, on='pseg', how='left')
word_stat_df.head()

In [None]:
# Bar chart of token counts per part-of-speech description, dodged by source.
# The tooltip shows the tag, its description and the count; the third line
# previously read '@', which names no variable, so it now references the
# `count` column plotted on the y axis.
bar_tooltips = (
    layer_tooltips()
    .line('@pseg')
    .line('@desc')
    .line('@count')
)
(
    ggplot(word_stat_df, aes(x='desc', y='count', fill='source'))
    + geom_bar(stat='identity', position='dodge', tooltips=bar_tooltips)
    + ggsize(1200, 700)
)

In [None]:
# Word-cloud renderer for the wide per-tag strips (2000 x 250 pixels).
wc = WordCloud(
    width=2000,
    height=250,
    # White canvas.
    background_color='white',
    # Font file used to draw the words.
    font_path='fonts/FZLTHJW.TTF',
    # Upper bounds on the number of words shown and on the font size.
    max_words=500,
    max_font_size=100,
    # Stop-word set shipped with the wordcloud package.
    stopwords=STOPWORDS,
    # Fixed random state. NOTE(review): the original comment called this the
    # number of colour schemes; it looks like an RNG seed for reproducible
    # output — confirm against the wordcloud documentation.
    random_state=50,
)

In [None]:
# One word cloud per part-of-speech tag, stacked vertically.
pseg_words = word_dict["requirement"]

# Derive the number of panels from the tag table instead of hard-coding 28,
# so the grid always matches word_pseg.csv. squeeze=False keeps `ax` a 2-D
# array even when there is only a single tag.
fig, ax = plt.subplots(len(pseg_words), 1, figsize=(20, 80), squeeze=False)
for id_pseg, (p, wl) in tqdm(enumerate(pseg_words.items()), total=len(pseg_words)):
    panel = ax[id_pseg, 0]
    panel.axis('off')
    panel.set_title(f"Responsibility {word_pseg_df[word_pseg_df['pseg']==p]['desc'].values[0]}")
    if not wl:
        # No tokens for this tag: leave the titled panel empty rather than
        # passing empty text to WordCloud.generate, which raises on it. This
        # mirrors the guard in the per-business-group grid below.
        continue
    requirement_wordcloud = wc.generate(' '.join(wl))
    panel.imshow(requirement_wordcloud)

### 分析不同事业群工作的责任

In [None]:
# Business groups ordered by number of postings, largest first.
postings_per_bg = categories_df.groupby(["BGName"])["_id"].count()
cat_val = list(postings_per_bg.sort_values(ascending=False).index)

# One (initially empty) token list per business group, in that order.
seg_cat_dic = {"requirement": OrderedDict((bg_name, []) for bg_name in cat_val)}

# Tag each posting's Responsibility text and append the tokens to the list of
# the posting's business group.
bg_and_text = zip(categories_df['BGName'], categories_df['Responsibility'])
for bg_name, responsibility_text in tqdm(bg_and_text, total=len(categories_df)):
    seg_cat_dic['requirement'][bg_name].extend(
        pseg.cut(responsibility_text, use_paddle=True)
    )

In [None]:
# Part-of-speech tags of interest; tokens with any other tag are ignored.
care_pseg = ["n", "nz", "v", "vn", "a"]

# word_dict[part][business group][tag] -> list of words, with business groups
# and tags kept in the order of cat_val and care_pseg.
word_dict = {
    "requirement": OrderedDict(
        (bg_name, OrderedDict((tag, []) for tag in care_pseg))
        for bg_name in cat_val
    )
}

for part, content in seg_cat_dic.items():
    for cat, tokens in content.items():
        # A part or business group without buckets is skipped silently,
        # just like a token whose tag is not in care_pseg.
        buckets = word_dict.get(part, {}).get(cat, {})
        for token in tokens:
            bucket = buckets.get(token.flag)
            if bucket is not None:
                bucket.append(token.word)

In [None]:
# Word-cloud renderer for the square panels of the business-group grid
# (600 x 600 pixels).
wc = WordCloud(
    width=600,
    height=600,
    # White canvas.
    background_color='white',
    # Font file used to draw the words.
    font_path='fonts/FZLTHJW.TTF',
    # Upper bounds on the number of words shown and on the font size.
    max_words=500,
    max_font_size=100,
    # Stop-word set shipped with the wordcloud package.
    stopwords=STOPWORDS,
    # Fixed random state. NOTE(review): the original comment called this the
    # number of colour schemes; it looks like an RNG seed for reproducible
    # output — confirm against the wordcloud documentation.
    random_state=50,
)

In [None]:
# For each part, draw a grid of word clouds: one row per business group and
# one column per part-of-speech tag of interest.
for id_part, (part_name, cat_dict) in enumerate(word_dict.items()):
    fig, ax = plt.subplots(len(cat_val), len(care_pseg), figsize=(20, 50))
    for row, (cat_name, pseg_dict) in tqdm(enumerate(cat_dict.items()), total=len(cat_val)):
        for col, (p, wl) in enumerate(pseg_dict.items()):
            panel = ax[row][col]
            panel.axis('off')
            # Human-readable description of the tag, taken from the tag table.
            tag_desc = word_pseg_df.loc[word_pseg_df['pseg'] == p, 'desc'].values[0]
            panel.set_title(f"{part_name} {cat_name} {tag_desc}")
            # A panel with no words keeps its title but gets no image.
            if wl:
                requirement_wordcloud = wc.generate(' '.join(wl))
                panel.imshow(requirement_wordcloud)