## 1. 导入相关包

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime, timedelta

%matplotlib inline

## 2. 读取文件，对文件进行相关信息的查看

In [2]:
jobs = pd.read_excel('DataSource_NZSeek/NZ_Admin_JOBS.xlsx', header=None, skiprows=1)

In [5]:
jobs.head()

Unnamed: 0,0,1,2,3,4,5
0,Administrator,https://www.seek.co.nz/job/50582301?type=promo...,,location: Bay of PlentyBay of Plentyarea: Taur...,"Featured,at,Private Advertiser",classification: Administration & Office Suppor...
1,Receptionist,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,location: Bay of PlentyBay of Plentyarea: Taur...,"Featured,at",classification: Administration & Office Suppor...
2,Prosecutions Support Officer,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,location: AucklandAuckland,"4d ago,at",classification: Administration & Office Suppor...
3,Early Childhood Centre Administrator,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,location: SouthlandSouthlandarea: Invercargill...,"1h ago,at",classification: Administration & Office Suppor...
4,Business Support Administrator,https://www.seek.co.nz/job/50622432?type=stand...,,location: CanterburyCanterburyarea: Christchur...,"4d ago,at,Private Advertiser",classification: Administration & Office Suppor...


In [3]:
jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2708 entries, 0 to 2707
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       2708 non-null   object
 1   1       2708 non-null   object
 2   2       2686 non-null   object
 3   3       2708 non-null   object
 4   4       2708 non-null   object
 5   5       2708 non-null   object
dtypes: object(6)
memory usage: 127.1+ KB


In [4]:
jobs.describe()

Unnamed: 0,0,1,2,3,4,5
count,2708,2708,2686,2708,2708,2708
unique,548,2708,475,147,54,93
top,Executive Assistant,https://www.seek.co.nz/job/50582301?type=promo...,Beyond Recruitment - Winner – Best Innovation ...,location: AucklandAucklandarea: Auckland Centr...,"27d ago,at",classification: Administration & Office Suppor...
freq,301,1,285,654,572,715


In [6]:
jobs.columns

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [7]:
jobs.dtypes

0    object
1    object
2    object
3    object
4    object
5    object
dtype: object

In [8]:
# 对列名进行重命名
jobs.columns = ['Name', 'Link', 'Department', 'LocationArea', 'ReleaseTime', 'Classification']

In [9]:
jobs

Unnamed: 0,Name,Link,Department,LocationArea,ReleaseTime,Classification
0,Administrator,https://www.seek.co.nz/job/50582301?type=promo...,,location: Bay of PlentyBay of Plentyarea: Taur...,"Featured,at,Private Advertiser",classification: Administration & Office Suppor...
1,Receptionist,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,location: Bay of PlentyBay of Plentyarea: Taur...,"Featured,at",classification: Administration & Office Suppor...
2,Prosecutions Support Officer,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,location: AucklandAuckland,"4d ago,at",classification: Administration & Office Suppor...
3,Early Childhood Centre Administrator,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,location: SouthlandSouthlandarea: Invercargill...,"1h ago,at",classification: Administration & Office Suppor...
4,Business Support Administrator,https://www.seek.co.nz/job/50622432?type=stand...,,location: CanterburyCanterburyarea: Christchur...,"4d ago,at,Private Advertiser",classification: Administration & Office Suppor...
...,...,...,...,...,...,...
2703,Key Account Manager,https://www.seek.co.nz/job/50490062?type=stand...,Hays Talent Solutions,location: AucklandAucklandarea: Auckland Centr...,"27d ago,at",classification: Administration & Office Suppor...
2704,Executive Assistant,https://www.seek.co.nz/job/50488000?type=stand...,one eighty recruitment,location: WellingtonWellingtonarea: Wellington...,"27d ago,at",Competitive hourly rate
2705,Temporary Office Roles,https://www.seek.co.nz/job/50524865?type=stand...,Asset Recruitment Ltd,location: WaikatoWaikatoarea: HamiltonHamilton...,"20d ago,at",Competitive hourly rates $$
2706,Temporary Office Roles,https://www.seek.co.nz/job/50477118?type=stand...,Asset Recruitment Ltd,location: WaikatoWaikatoarea: HamiltonHamilton...,"28d ago,at",Competitive hourly rates $$


## 3. 处理Name列

In [10]:
# 提取Name列
jobs_name = jobs['Name']
jobs_name

0                              Administrator
1                               Receptionist
2               Prosecutions Support Officer
3       Early Childhood Centre Administrator
4             Business Support Administrator
                        ...                 
2703                     Key Account Manager
2704                     Executive Assistant
2705                  Temporary Office Roles
2706                  Temporary Office Roles
2707                     Executive Assistant
Name: Name, Length: 2708, dtype: object

In [11]:
# 检查是否有空值
jobs_name.isnull().any()

False

In [13]:
# 统一处理为首字母大写
jobs_name = jobs_name.str.title()
jobs_name

0                              Administrator
1                               Receptionist
2               Prosecutions Support Officer
3       Early Childhood Centre Administrator
4             Business Support Administrator
                        ...                 
2703                     Key Account Manager
2704                     Executive Assistant
2705                  Temporary Office Roles
2706                  Temporary Office Roles
2707                     Executive Assistant
Name: Name, Length: 2708, dtype: object

In [14]:
jobs['Name'] = jobs_name
jobs

Unnamed: 0,Name,Link,Department,LocationArea,ReleaseTime,Classification
0,Administrator,https://www.seek.co.nz/job/50582301?type=promo...,,location: Bay of PlentyBay of Plentyarea: Taur...,"Featured,at,Private Advertiser",classification: Administration & Office Suppor...
1,Receptionist,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,location: Bay of PlentyBay of Plentyarea: Taur...,"Featured,at",classification: Administration & Office Suppor...
2,Prosecutions Support Officer,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,location: AucklandAuckland,"4d ago,at",classification: Administration & Office Suppor...
3,Early Childhood Centre Administrator,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,location: SouthlandSouthlandarea: Invercargill...,"1h ago,at",classification: Administration & Office Suppor...
4,Business Support Administrator,https://www.seek.co.nz/job/50622432?type=stand...,,location: CanterburyCanterburyarea: Christchur...,"4d ago,at,Private Advertiser",classification: Administration & Office Suppor...
...,...,...,...,...,...,...
2703,Key Account Manager,https://www.seek.co.nz/job/50490062?type=stand...,Hays Talent Solutions,location: AucklandAucklandarea: Auckland Centr...,"27d ago,at",classification: Administration & Office Suppor...
2704,Executive Assistant,https://www.seek.co.nz/job/50488000?type=stand...,one eighty recruitment,location: WellingtonWellingtonarea: Wellington...,"27d ago,at",Competitive hourly rate
2705,Temporary Office Roles,https://www.seek.co.nz/job/50524865?type=stand...,Asset Recruitment Ltd,location: WaikatoWaikatoarea: HamiltonHamilton...,"20d ago,at",Competitive hourly rates $$
2706,Temporary Office Roles,https://www.seek.co.nz/job/50477118?type=stand...,Asset Recruitment Ltd,location: WaikatoWaikatoarea: HamiltonHamilton...,"28d ago,at",Competitive hourly rates $$


## 4. 处理Link列

In [15]:
jobs_link = jobs['Link']
jobs_link

0       https://www.seek.co.nz/job/50582301?type=promo...
1       https://www.seek.co.nz/job/50620889?type=promo...
2       https://www.seek.co.nz/job/50622169?type=stand...
3       https://www.seek.co.nz/job/50639620?type=stand...
4       https://www.seek.co.nz/job/50622432?type=stand...
                              ...                        
2703    https://www.seek.co.nz/job/50490062?type=stand...
2704    https://www.seek.co.nz/job/50488000?type=stand...
2705    https://www.seek.co.nz/job/50524865?type=stand...
2706    https://www.seek.co.nz/job/50477118?type=stand...
2707    https://www.seek.co.nz/job/50496571?type=stand...
Name: Link, Length: 2708, dtype: object

观察发现，Link列中包含了job_id和job_type两个有效信息，因此我们选择使用正则进行提取

In [16]:
jobs_id = jobs_link.apply(lambda x: re.search(r'\d+', x).group())
jobs_id

0       50582301
1       50620889
2       50622169
3       50639620
4       50622432
          ...   
2703    50490062
2704    50488000
2705    50524865
2706    50477118
2707    50496571
Name: Link, Length: 2708, dtype: object

In [17]:
jobs_type = jobs_link.apply(lambda x: re.search(r'type=(\w+)', x).group(1))
jobs_type

0       promoted
1       promoted
2       standard
3       standard
4       standout
          ...   
2703    standout
2704    standout
2705    standout
2706    standout
2707    standout
Name: Link, Length: 2708, dtype: object

In [18]:
# 使用describe()函数查看type种类，发现有3种type
jobs_type.describe()

count         2708
unique           3
top       standout
freq          1877
Name: Link, dtype: object

In [19]:
jobs.insert(jobs.columns.get_loc('Link'), 'JobId', jobs_id)
jobs.insert(jobs.columns.get_loc('Link'), 'JobType', jobs_type)

In [20]:
jobs

Unnamed: 0,Name,JobId,JobType,Link,Department,LocationArea,ReleaseTime,Classification
0,Administrator,50582301,promoted,https://www.seek.co.nz/job/50582301?type=promo...,,location: Bay of PlentyBay of Plentyarea: Taur...,"Featured,at,Private Advertiser",classification: Administration & Office Suppor...
1,Receptionist,50620889,promoted,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,location: Bay of PlentyBay of Plentyarea: Taur...,"Featured,at",classification: Administration & Office Suppor...
2,Prosecutions Support Officer,50622169,standard,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,location: AucklandAuckland,"4d ago,at",classification: Administration & Office Suppor...
3,Early Childhood Centre Administrator,50639620,standard,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,location: SouthlandSouthlandarea: Invercargill...,"1h ago,at",classification: Administration & Office Suppor...
4,Business Support Administrator,50622432,standout,https://www.seek.co.nz/job/50622432?type=stand...,,location: CanterburyCanterburyarea: Christchur...,"4d ago,at,Private Advertiser",classification: Administration & Office Suppor...
...,...,...,...,...,...,...,...,...
2703,Key Account Manager,50490062,standout,https://www.seek.co.nz/job/50490062?type=stand...,Hays Talent Solutions,location: AucklandAucklandarea: Auckland Centr...,"27d ago,at",classification: Administration & Office Suppor...
2704,Executive Assistant,50488000,standout,https://www.seek.co.nz/job/50488000?type=stand...,one eighty recruitment,location: WellingtonWellingtonarea: Wellington...,"27d ago,at",Competitive hourly rate
2705,Temporary Office Roles,50524865,standout,https://www.seek.co.nz/job/50524865?type=stand...,Asset Recruitment Ltd,location: WaikatoWaikatoarea: HamiltonHamilton...,"20d ago,at",Competitive hourly rates $$
2706,Temporary Office Roles,50477118,standout,https://www.seek.co.nz/job/50477118?type=stand...,Asset Recruitment Ltd,location: WaikatoWaikatoarea: HamiltonHamilton...,"28d ago,at",Competitive hourly rates $$


## 5. 处理Location列

In [21]:
jobs_location_area = jobs['LocationArea']
jobs_location_area

0       location: Bay of PlentyBay of Plentyarea: Taur...
1       location: Bay of PlentyBay of Plentyarea: Taur...
2                              location: AucklandAuckland
3       location: SouthlandSouthlandarea: Invercargill...
4       location: CanterburyCanterburyarea: Christchur...
                              ...                        
2703    location: AucklandAucklandarea: Auckland Centr...
2704    location: WellingtonWellingtonarea: Wellington...
2705    location: WaikatoWaikatoarea: HamiltonHamilton...
2706    location: WaikatoWaikatoarea: HamiltonHamilton...
2707                       location: WellingtonWellington
Name: LocationArea, Length: 2708, dtype: object

In [24]:
# 判断是否所有信息都包含location
jobs_location_area.str.contains('location').all()

True

In [22]:
# 判断是否所有信息都包含area
jobs_location_area.str.contains('area').all()

False

In [26]:
# 将series转化为df，并且对原LocationArea列进行split，对空值填充
jobs_location_area = pd.DataFrame(jobs_location_area)

jobs_location_area = jobs_location_area.assign(
    Location=(jobs_location_area.LocationArea.str.split('area: ').str[0]),
    Area=jobs_location_area.LocationArea.str.split('area: ').str[1])

jobs_location_area['Area'] = jobs_location_area['Area'].fillna(value=np.nan)

In [27]:
jobs_location_area

Unnamed: 0,LocationArea,Location,Area
0,location: Bay of PlentyBay of Plentyarea: Taur...,location: Bay of PlentyBay of Plenty,TaurangaTauranga
1,location: Bay of PlentyBay of Plentyarea: Taur...,location: Bay of PlentyBay of Plenty,TaurangaTauranga
2,location: AucklandAuckland,location: AucklandAuckland,
3,location: SouthlandSouthlandarea: Invercargill...,location: SouthlandSouthland,InvercargillInvercargill
4,location: CanterburyCanterburyarea: Christchur...,location: CanterburyCanterbury,ChristchurchChristchurch
...,...,...,...
2703,location: AucklandAucklandarea: Auckland Centr...,location: AucklandAuckland,Auckland CentralAuckland Central
2704,location: WellingtonWellingtonarea: Wellington...,location: WellingtonWellington,"Wellington CentralWellington Central,Competiti..."
2705,location: WaikatoWaikatoarea: HamiltonHamilton...,location: WaikatoWaikato,"HamiltonHamilton,Competitive hourly rates $$"
2706,location: WaikatoWaikatoarea: HamiltonHamilton...,location: WaikatoWaikato,"HamiltonHamilton,Competitive hourly rates $$"


现在来处理字符串重复的问题，观察发现，字符串的重复没有什么特殊性，都是子串*2，因此我们对字符串从中间进行split，新生成两列df，判断是否一致，一致则保留一列的数据到原df上，不一致则不进行修改

In [29]:
jobs_location_area['Location'] = jobs_location_area.Location.str.replace('location: ', '', )

In [30]:
# 将字符串从中间位置进行拆分，并且对后半段要判断是否最后还有多余空格
jobs_location_area['Location_1'] = jobs_location_area['Location'].apply(lambda x: x[:len(x) // 2])
jobs_location_area['Location_2'] = jobs_location_area['Location'].apply(lambda x: x[len(x) // 2:])
jobs_location_area['Location_2'] = jobs_location_area['Location_2'].apply(lambda x: x[:-1] if x[-1] == ' ' else x)

In [31]:
# 使用np.where()函数进行判断，拆分的两个片段如果一致则将第一个片段的信息填充到Location列中，不一致则保留Location列中原信息
jobs_location_area['Location'] = np.where(
    jobs_location_area['Location_1'] == jobs_location_area['Location_2'],
    jobs_location_area['Location_1'],
    jobs_location_area['Location'])

In [32]:
jobs_location_area

Unnamed: 0,LocationArea,Location,Area,Location_1,Location_2
0,location: Bay of PlentyBay of Plentyarea: Taur...,Bay of Plenty,TaurangaTauranga,Bay of Plenty,Bay of Plenty
1,location: Bay of PlentyBay of Plentyarea: Taur...,Bay of Plenty,TaurangaTauranga,Bay of Plenty,Bay of Plenty
2,location: AucklandAuckland,Auckland,,Auckland,Auckland
3,location: SouthlandSouthlandarea: Invercargill...,Southland,InvercargillInvercargill,Southland,Southland
4,location: CanterburyCanterburyarea: Christchur...,Canterbury,ChristchurchChristchurch,Canterbury,Canterbury
...,...,...,...,...,...
2703,location: AucklandAucklandarea: Auckland Centr...,Auckland,Auckland CentralAuckland Central,Auckland,Auckland
2704,location: WellingtonWellingtonarea: Wellington...,Wellington,"Wellington CentralWellington Central,Competiti...",Wellington,Wellington
2705,location: WaikatoWaikatoarea: HamiltonHamilton...,Waikato,"HamiltonHamilton,Competitive hourly rates $$",Waikato,Waikato
2706,location: WaikatoWaikatoarea: HamiltonHamilton...,Waikato,"HamiltonHamilton,Competitive hourly rates $$",Waikato,Waikato


In [33]:
jobs_location_area['Area_1'] = jobs_location_area['Area'].apply(lambda x: x[:len(x) // 2] if pd.notnull(x) else x)
jobs_location_area['Area_2'] = jobs_location_area['Area'].apply(lambda x: x[len(x) // 2:] if pd.notnull(x) else x)

In [34]:
jobs_location_area['Area'] = np.where(
    jobs_location_area['Area_1'] == jobs_location_area['Area_2'],
    jobs_location_area['Area_1'],
    jobs_location_area['Area'])

In [35]:
jobs_location_area

Unnamed: 0,LocationArea,Location,Area,Location_1,Location_2,Area_1,Area_2
0,location: Bay of PlentyBay of Plentyarea: Taur...,Bay of Plenty,Tauranga,Bay of Plenty,Bay of Plenty,Tauranga,Tauranga
1,location: Bay of PlentyBay of Plentyarea: Taur...,Bay of Plenty,Tauranga,Bay of Plenty,Bay of Plenty,Tauranga,Tauranga
2,location: AucklandAuckland,Auckland,,Auckland,Auckland,,
3,location: SouthlandSouthlandarea: Invercargill...,Southland,Invercargill,Southland,Southland,Invercargill,Invercargill
4,location: CanterburyCanterburyarea: Christchur...,Canterbury,Christchurch,Canterbury,Canterbury,Christchurch,Christchurch
...,...,...,...,...,...,...,...
2703,location: AucklandAucklandarea: Auckland Centr...,Auckland,Auckland Central,Auckland,Auckland,Auckland Central,Auckland Central
2704,location: WellingtonWellingtonarea: Wellington...,Wellington,"Wellington CentralWellington Central,Competiti...",Wellington,Wellington,Wellington CentralWellington C,"entral,Competitive hourly rate"
2705,location: WaikatoWaikatoarea: HamiltonHamilton...,Waikato,"HamiltonHamilton,Competitive hourly rates $$",Waikato,Waikato,"HamiltonHamilton,Compe",titive hourly rates $$
2706,location: WaikatoWaikatoarea: HamiltonHamilton...,Waikato,"HamiltonHamilton,Competitive hourly rates $$",Waikato,Waikato,"HamiltonHamilton,Compe",titive hourly rates $$


In [36]:
jobs.insert(jobs.columns.get_loc('LocationArea'), 'Location', jobs_location_area['Location'])
jobs.insert(jobs.columns.get_loc('LocationArea'), 'Area', jobs_location_area['Area'])

In [38]:
jobs = jobs.drop(columns='LocationArea')
jobs

Unnamed: 0,Name,JobId,JobType,Link,Department,Location,Area,ReleaseTime,Classification
0,Administrator,50582301,promoted,https://www.seek.co.nz/job/50582301?type=promo...,,Bay of Plenty,Tauranga,"Featured,at,Private Advertiser",classification: Administration & Office Suppor...
1,Receptionist,50620889,promoted,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,Bay of Plenty,Tauranga,"Featured,at",classification: Administration & Office Suppor...
2,Prosecutions Support Officer,50622169,standard,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,Auckland,,"4d ago,at",classification: Administration & Office Suppor...
3,Early Childhood Centre Administrator,50639620,standard,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,Southland,Invercargill,"1h ago,at",classification: Administration & Office Suppor...
4,Business Support Administrator,50622432,standout,https://www.seek.co.nz/job/50622432?type=stand...,,Canterbury,Christchurch,"4d ago,at,Private Advertiser",classification: Administration & Office Suppor...
...,...,...,...,...,...,...,...,...,...
2703,Key Account Manager,50490062,standout,https://www.seek.co.nz/job/50490062?type=stand...,Hays Talent Solutions,Auckland,Auckland Central,"27d ago,at",classification: Administration & Office Suppor...
2704,Executive Assistant,50488000,standout,https://www.seek.co.nz/job/50488000?type=stand...,one eighty recruitment,Wellington,"Wellington CentralWellington Central,Competiti...","27d ago,at",Competitive hourly rate
2705,Temporary Office Roles,50524865,standout,https://www.seek.co.nz/job/50524865?type=stand...,Asset Recruitment Ltd,Waikato,"HamiltonHamilton,Competitive hourly rates $$","20d ago,at",Competitive hourly rates $$
2706,Temporary Office Roles,50477118,standout,https://www.seek.co.nz/job/50477118?type=stand...,Asset Recruitment Ltd,Waikato,"HamiltonHamilton,Competitive hourly rates $$","28d ago,at",Competitive hourly rates $$


## 6. 处理ReleaseTime列

In [40]:
release_time = pd.DataFrame(jobs['ReleaseTime'])
release_time

Unnamed: 0,ReleaseTime
0,"Featured,at,Private Advertiser"
1,"Featured,at"
2,"4d ago,at"
3,"1h ago,at"
4,"4d ago,at,Private Advertiser"
...,...
2703,"27d ago,at"
2704,"27d ago,at"
2705,"20d ago,at"
2706,"28d ago,at"


In [41]:
release_time['ReleaseTime'].str.contains('at').all()

True

In [42]:
# 有些信息中带有PrivateAdvertiser，因此将其筛选出来，生成新的一列
release_time['PrivateAdvertiser'] = release_time['ReleaseTime'].apply(lambda x: 1 if 'Private Advertiser' in x else 0)

In [43]:
release_time

Unnamed: 0,ReleaseTime,PrivateAdvertiser
0,"Featured,at,Private Advertiser",1
1,"Featured,at",0
2,"4d ago,at",0
3,"1h ago,at",0
4,"4d ago,at,Private Advertiser",1
...,...,...
2703,"27d ago,at",0
2704,"27d ago,at",0
2705,"20d ago,at",0
2706,"28d ago,at",0


In [44]:
# 查看数据中代表时间的缩写标志一共几个
release_time['ReleaseTime'].apply(
    lambda x: re.search(r'(\d+)\s*([a-z]+)', x).group(2) if re.search(r'(\d+)\s*([a-z]+)',
                                                                      x) else np.nan).value_counts()

d    2672
h      11
m       5
Name: ReleaseTime, dtype: int64

In [45]:
# 定义一个时间转换的函数
def datetime_cal(time_str):
    time_match = re.search(r'(\d+)\s*([a-z]+)', time_str)
    if not time_match:
        return np.nan

    time_value = int(time_match.group(1))
    time_unit = time_match.group(2)

    if time_unit == 'd':
        datetime = pd.to_datetime('today') - timedelta(days=time_value)
    elif time_unit == 'h':
        datetime = pd.to_datetime('today') - timedelta(hours=time_value)
    elif time_unit == 'm':
        datetime = pd.to_datetime('today') - timedelta(minutes=time_value)
    else:
        return np.nan

    return datetime.strftime('%Y-%m-%d %H:%M')

In [46]:
release_time['Datetime'] = release_time['ReleaseTime'].apply(datetime_cal)

In [47]:
release_time

Unnamed: 0,ReleaseTime,PrivateAdvertiser,Datetime
0,"Featured,at,Private Advertiser",1,
1,"Featured,at",0,
2,"4d ago,at",0,2022-12-23 13:11
3,"1h ago,at",0,2022-12-27 12:11
4,"4d ago,at,Private Advertiser",1,2022-12-23 13:11
...,...,...,...
2703,"27d ago,at",0,2022-11-30 13:11
2704,"27d ago,at",0,2022-11-30 13:11
2705,"20d ago,at",0,2022-12-07 13:11
2706,"28d ago,at",0,2022-11-29 13:11


In [48]:
# 观察发现如果ReleaseTime中不是标注为时间，则只有Featured，因此也将Featured进行筛选，生成新的一列
release_time['Featured'] = release_time['ReleaseTime'].apply(lambda x: 1 if 'Featured' in x else 0)

In [49]:
release_time

Unnamed: 0,ReleaseTime,PrivateAdvertiser,Datetime,Featured
0,"Featured,at,Private Advertiser",1,,1
1,"Featured,at",0,,1
2,"4d ago,at",0,2022-12-23 13:11,0
3,"1h ago,at",0,2022-12-27 12:11,0
4,"4d ago,at,Private Advertiser",1,2022-12-23 13:11,0
...,...,...,...,...
2703,"27d ago,at",0,2022-11-30 13:11,0
2704,"27d ago,at",0,2022-11-30 13:11,0
2705,"20d ago,at",0,2022-12-07 13:11,0
2706,"28d ago,at",0,2022-11-29 13:11,0


In [50]:
jobs.insert(jobs.columns.get_loc('ReleaseTime'), 'PrivateAdvertiser', release_time['PrivateAdvertiser'])
jobs.insert(jobs.columns.get_loc('ReleaseTime'), 'Featured', release_time['Featured'])
jobs.insert(jobs.columns.get_loc('ReleaseTime'), 'Datetime', release_time['Datetime'])

In [51]:
jobs = jobs.drop(columns='ReleaseTime')

In [52]:
jobs

Unnamed: 0,Name,JobId,JobType,Link,Department,Location,Area,PrivateAdvertiser,Featured,Datetime,Classification
0,Administrator,50582301,promoted,https://www.seek.co.nz/job/50582301?type=promo...,,Bay of Plenty,Tauranga,1,1,,classification: Administration & Office Suppor...
1,Receptionist,50620889,promoted,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,Bay of Plenty,Tauranga,0,1,,classification: Administration & Office Suppor...
2,Prosecutions Support Officer,50622169,standard,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,Auckland,,0,0,2022-12-23 13:11,classification: Administration & Office Suppor...
3,Early Childhood Centre Administrator,50639620,standard,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,Southland,Invercargill,0,0,2022-12-27 12:11,classification: Administration & Office Suppor...
4,Business Support Administrator,50622432,standout,https://www.seek.co.nz/job/50622432?type=stand...,,Canterbury,Christchurch,1,0,2022-12-23 13:11,classification: Administration & Office Suppor...
...,...,...,...,...,...,...,...,...,...,...,...
2703,Key Account Manager,50490062,standout,https://www.seek.co.nz/job/50490062?type=stand...,Hays Talent Solutions,Auckland,Auckland Central,0,0,2022-11-30 13:11,classification: Administration & Office Suppor...
2704,Executive Assistant,50488000,standout,https://www.seek.co.nz/job/50488000?type=stand...,one eighty recruitment,Wellington,"Wellington CentralWellington Central,Competiti...",0,0,2022-11-30 13:11,Competitive hourly rate
2705,Temporary Office Roles,50524865,standout,https://www.seek.co.nz/job/50524865?type=stand...,Asset Recruitment Ltd,Waikato,"HamiltonHamilton,Competitive hourly rates $$",0,0,2022-12-07 13:11,Competitive hourly rates $$
2706,Temporary Office Roles,50477118,standout,https://www.seek.co.nz/job/50477118?type=stand...,Asset Recruitment Ltd,Waikato,"HamiltonHamilton,Competitive hourly rates $$",0,0,2022-11-29 13:11,Competitive hourly rates $$


## 7. 处理Classification列

In [59]:
cls = pd.DataFrame(jobs.Classification)
cls

Unnamed: 0,Classification
0,classification: Administration & Office Suppor...
1,classification: Administration & Office Suppor...
2,classification: Administration & Office Suppor...
3,classification: Administration & Office Suppor...
4,classification: Administration & Office Suppor...
...,...
2703,classification: Administration & Office Suppor...
2704,Competitive hourly rate
2705,Competitive hourly rates $$
2706,Competitive hourly rates $$


In [60]:
cls['Classification'] = cls['Classification'].apply(lambda x: x.lower())

In [61]:
cls[["class", "subclass"]] = cls["Classification"].str.extract(r"classification: (.*)subclassification: (.*)")

# 将新列赋值为 NaN，如果原来的列没有匹配项
cls["class"].fillna(value=np.nan, inplace=True)
cls["subclass"].fillna(value=np.nan, inplace=True)

In [62]:
cls

Unnamed: 0,Classification,class,subclass
0,classification: administration & office suppor...,administration & office supportadministration ...,office managementoffice management
1,classification: administration & office suppor...,administration & office supportadministration ...,receptionistsreceptionists
2,classification: administration & office suppor...,administration & office supportadministration ...,otherother
3,classification: administration & office suppor...,administration & office supportadministration ...,administrative assistantsadministrative assist...
4,classification: administration & office suppor...,administration & office supportadministration ...,client & sales administrationclient & sales ad...
...,...,...,...
2703,classification: administration & office suppor...,administration & office supportadministration ...,client & sales administrationclient & sales ad...
2704,competitive hourly rate,,
2705,competitive hourly rates $$,,
2706,competitive hourly rates $$,,


In [63]:
cls['Classification'] = cls["Classification"].apply(
    lambda x: np.nan if re.search(r"classification: (.*)subclassification: (.*)", x) else x)

In [64]:
cls

Unnamed: 0,Classification,class,subclass
0,,administration & office supportadministration ...,office managementoffice management
1,,administration & office supportadministration ...,receptionistsreceptionists
2,,administration & office supportadministration ...,otherother
3,,administration & office supportadministration ...,administrative assistantsadministrative assist...
4,,administration & office supportadministration ...,client & sales administrationclient & sales ad...
...,...,...,...
2703,,administration & office supportadministration ...,client & sales administrationclient & sales ad...
2704,competitive hourly rate,,
2705,competitive hourly rates $$,,
2706,competitive hourly rates $$,,


In [65]:
# 用于检查class和subclass是否同时存在或者同时不存在，如果判断为真，之后处理class和subclass将会简单一些
((cls['class'].isnull() & cls['subclass'].isnull()) | (cls['class'].notnull() & cls['subclass'].notnull())).all()

True

In [66]:
# 定义一个函数，用于对class和subclass进行去重
def class_drop_duplicate(class_str):
    if class_str != np.nan:
        class_str = str(class_str)
        if class_str[-1] == " ":
            class_str = class_str.replace(class_str[-1], "")
        return class_str[:len(class_str) // 2] \
            if class_str[:len(class_str) // 2] == class_str[len(class_str) // 2:] \
            else class_str
    else:
        return class_str

In [67]:
cls['class'] = cls['class'].apply(class_drop_duplicate)
cls['subclass'] = cls['subclass'].apply(class_drop_duplicate)

In [68]:
cls

Unnamed: 0,Classification,class,subclass
0,,administration & office support,office management
1,,administration & office support,receptionists
2,,administration & office support,other
3,,administration & office support,administrative assistants
4,,administration & office support,client & sales administration
...,...,...,...
2703,,administration & office support,client & sales administration
2704,competitive hourly rate,,
2705,competitive hourly rates $$,,
2706,competitive hourly rates $$,,


In [69]:
jobs.insert(jobs.columns.get_loc('Classification'), 'Class', cls['class'])
jobs.insert(jobs.columns.get_loc('Classification'), 'Subclass', cls['subclass'])
# 因为Classification列中还有些信息，不存在共性，因此将其变成备注比较合适
jobs.insert(jobs.columns.get_loc('Classification'), 'Remark', cls['Classification'])

In [70]:
jobs = jobs.drop(columns = 'Classification')

In [71]:
jobs

Unnamed: 0,Name,JobId,JobType,Link,Department,Location,Area,PrivateAdvertiser,Featured,Datetime,Class,Subclass,Remark
0,Administrator,50582301,promoted,https://www.seek.co.nz/job/50582301?type=promo...,,Bay of Plenty,Tauranga,1,1,,administration & office support,office management,
1,Receptionist,50620889,promoted,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,Bay of Plenty,Tauranga,0,1,,administration & office support,receptionists,
2,Prosecutions Support Officer,50622169,standard,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,Auckland,,0,0,2022-12-23 13:11,administration & office support,other,
3,Early Childhood Centre Administrator,50639620,standard,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,Southland,Invercargill,0,0,2022-12-27 12:11,administration & office support,administrative assistants,
4,Business Support Administrator,50622432,standout,https://www.seek.co.nz/job/50622432?type=stand...,,Canterbury,Christchurch,1,0,2022-12-23 13:11,administration & office support,client & sales administration,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,Key Account Manager,50490062,standout,https://www.seek.co.nz/job/50490062?type=stand...,Hays Talent Solutions,Auckland,Auckland Central,0,0,2022-11-30 13:11,administration & office support,client & sales administration,
2704,Executive Assistant,50488000,standout,https://www.seek.co.nz/job/50488000?type=stand...,one eighty recruitment,Wellington,"Wellington CentralWellington Central,Competiti...",0,0,2022-11-30 13:11,,,competitive hourly rate
2705,Temporary Office Roles,50524865,standout,https://www.seek.co.nz/job/50524865?type=stand...,Asset Recruitment Ltd,Waikato,"HamiltonHamilton,Competitive hourly rates $$",0,0,2022-12-07 13:11,,,competitive hourly rates $$
2706,Temporary Office Roles,50477118,standout,https://www.seek.co.nz/job/50477118?type=stand...,Asset Recruitment Ltd,Waikato,"HamiltonHamilton,Competitive hourly rates $$",0,0,2022-11-29 13:11,,,competitive hourly rates $$


## 8. 数据与处理完毕，输出文件

In [72]:
jobs.to_excel('DataSource_NZSeek/NZ_Admin_JOBS_PREPROCESSED.xlsx')
jobs.to_csv('DataSource_NZSeek/NZ_Admin_JOBS_PREPROCESSED.csv')