# Job Title Analysis
1. Sanity checks of CIF_NO
2. Explore with ngrams
3. Identify Management Roles (`IsManagement`)
4. Scrape Indeed for the salary of common job titles

In [14]:
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Notebook display settings: wider text columns and taller tables before truncation.
pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 500

# Source workbooks: job titles keyed by CIF_NO, and the bank's transaction data.
jobs_df = pd.read_excel(io='../data/JobTitle.xlsx')
transaction_df = pd.read_excel(io='../data/TRANSACTION DATA OF BANK X_OCT DES 2019.xlsx')

## 1. Comparing the two sets of CIF_NO
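The jobs data contains every `CIF_NO` that appears in the transaction data: all 8,561 transaction customers have a job record, and the jobs data holds 3 additional `CIF_NO` values with no transactions.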

In [3]:
# Distinct customer identifiers (CIF_NO) found in each of the two tables.
transaction_unique_CIF = set(transaction_df['CIF_NO'])
jobs_unique_CIF = set(jobs_df['CIF_NO'])

In [4]:
# Identifiers present in both tables. When this is as large as the transaction
# set, every transaction customer also has a job record.
CIF_intersection = transaction_unique_CIF & jobs_unique_CIF

print(f'CIF_NO count in transaction_df:\t {len(transaction_unique_CIF)}')
print(f'CIF_NO count in jobs_df:\t {len(jobs_unique_CIF)}')
print(f'Intersection:\t {len(CIF_intersection)}')

CIF_NO count in transaction_df:	 8561
CIF_NO count in jobs_df:	 8564
Intersection:	 8561


## 2. Explore with ngrams
Explore the job titles with different n-gram ranges to find the most common terms.

In [15]:
# One row per distinct job title together with the count of non-null CIF_NO
# values recorded for it, ordered from the most to the least common title.
unique_jobs = (
    jobs_df
    .groupby('Job Title', as_index=False)
    .count()
    .rename(columns={'CIF_NO': 'CIF_COUNT'})
    .sort_values('CIF_COUNT', ascending=False)
    .reset_index(drop=True)
)

In [20]:
def cv_ngrams(n, n1, titles=None):
    """Count how often each n-gram occurs across a collection of job titles.

    Parameters
    ----------
    n, n1 : int
        Smallest and largest n-gram size; forwarded to ``CountVectorizer`` as
        ``ngram_range=(n, n1)``.
    titles : iterable of str, optional
        Texts to analyse. When omitted, ``unique_jobs['Job Title']`` is used,
        which is what the original two-argument form always did.

    Returns
    -------
    pandas.DataFrame
        Columns ``Ngrams`` and ``Count``, sorted by ``Count`` in descending
        order. N-grams with equal counts keep the vectorizer's feature order.
    """
    if titles is None:
        titles = unique_jobs['Job Title']

    cv = CountVectorizer(ngram_range=(n, n1))
    cv_fit = cv.fit_transform(titles)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installations.
    if hasattr(cv, 'get_feature_names_out'):
        word_list = cv.get_feature_names_out()
    else:
        word_list = cv.get_feature_names()

    # Sum the sparse document-term matrix column-wise instead of converting the
    # whole matrix to a dense array first.
    count_list = np.asarray(cv_fit.sum(axis=0)).ravel()

    return (
        pd.DataFrame({'Ngrams': word_list, 'Count': count_list})
        .sort_values('Count', ascending=False, kind='stable')
        .reset_index(drop=True)
    )
cv_ngrams(1,1)

Unnamed: 0,Ngrams,Count
0,head,215
1,officer,201
2,staff,148
3,department,125
4,business,108
5,management,102
6,leader,71
7,team,71
8,development,61
9,financing,49


In [21]:
# Bigrams only.
cv_ngrams(n=2, n1=2)

Unnamed: 0,Ngrams,Count
0,department head,121
1,team leader,71
2,section head,49
3,group head,36
4,business development,26
5,management staff,25
6,management department,20
7,development officer,15
8,management officer,15
9,management section,15


In [22]:
# Bigrams and trigrams together.
cv_ngrams(n=2, n1=3)

Unnamed: 0,Ngrams,Count
0,department head,121
1,team leader,71
2,section head,49
3,group head,36
4,business development,26
5,management staff,25
6,management department,20
7,management department head,20
8,development officer,15
9,management officer,15


## 3. Identify Management Roles (`IsManagement`)

The n-gram exploration above shows that a few terms reliably indicate a management role, namely `manager`, `team leader` and `head`. A job title containing any of them is flagged as management.

In [9]:
# A title counts as management (1) when it contains any of the three terms,
# matched case-insensitively as substrings; everything else is 0.
unique_jobs['IsManagement'] = np.where(
    unique_jobs['Job Title'].str.contains('manager|team leader|head', case=False, na=False),
    1,
    0,
)

In [10]:
# Preview the five most common titles with their management flag.
unique_jobs.head(n=5)

Unnamed: 0,Job Title,CIF_COUNT,IsManagement
0,Teller,709,0
1,Customer Service,696,0
2,Back Office,558,0
3,Branch Operations & Service Manager,512,1
4,Branch Manager,510,1


## 4. Scrape Indeed for the salary of common job titles
### Pareto Analysis
Using Pareto analysis, we find the smallest set of job titles that together accounts for 80% of the `CIF_NO` records.
We then search Indeed for the titles in this list to obtain additional data (e.g. salary).

In [13]:
# Cumulative share (in %) of all CIF_NO counts covered by the job titles when
# they are ranked from most to least common. The result is assigned by index,
# so each row receives the cumulative share up to and including its own rank
# regardless of the current row order of `unique_jobs`.
# Note: Series.sort_values takes no column name; the original call passed one
# and left the assignment commented out, so `CumSumPct` was never created.
unique_jobs['CumSumPct'] = (
    unique_jobs['CIF_COUNT']
    .sort_values(ascending=False, kind='stable')
    .cumsum()
    / unique_jobs['CIF_COUNT'].sum()
    * 100
)
unique_jobs.head()

NameError: name 'unique_jobs' is not defined

In [12]:
# Titles whose cumulative share is still below the 80% Pareto threshold.
unique_jobs.loc[unique_jobs['CumSumPct'].lt(80)]

Unnamed: 0,Job Title,CIF_COUNT,IsManagement,CumSumPct
0,Teller,709,0,8.279808
1,Customer Service,696,0,16.407801
2,Back Office,558,0,22.924209
3,Branch Operations & Service Manager,512,1,28.903422
4,Branch Manager,510,1,34.859278
5,Retail Banking Relationship Manager,436,1,39.950952
6,Pawning Staff,415,0,44.797384
7,Retail Banking Representative,313,0,48.452645
8,Micro Analyst,279,0,51.710849
9,Micro Banking Manager,245,1,54.571996


In [4]:
import time 
import requests
from bs4 import BeautifulSoup
from random import randint
from statistics import mean
import pandas as pd

# The two commented-out statements below presumably show how jobs_list.csv was
# produced from the Pareto selection and how the list could be built in memory:
#   unique_jobs[unique_jobs['CumSumPct'] < 80].to_csv('../data/jobs_list.csv')
#   jobs_list = list(unique_jobs[unique_jobs['CumSumPct'] < 80]['Job Title'])
# Here the saved file is read back instead.
jobs_list = pd.read_csv('../data/jobs_list.csv')['Job Title'].tolist()

In [5]:
MAX_PAGE = 20                # upper bound on result pages requested per job title
MIN_POSTS_TO_CONTINUE = 4    # a page with fewer posts ends the search for that title and is not stored
REQUEST_TIMEOUT = 30         # seconds before an unresponsive request is abandoned
SEARCH_URL = 'https://id.indeed.com/jobs'
results_list = []            # rows of [query, title, company, location, salary]


def _tag_text(tag, strip_newlines=True):
    """Return the text of a parsed tag, or '' when the tag is missing."""
    if tag is None:
        return ''
    return tag.text.replace('\n', '') if strip_newlines else tag.text


def _parse_post(div):
    """Extract [title, company, location, salary] from one result card.

    Returns None when the card has no job-title link, so the caller can skip it
    explicitly instead of relying on a swallowed exception.
    """
    title_tag = div.find(name='a', attrs={'data-tn-element': 'jobTitle'})
    if title_tag is None:
        return None
    company_tag = div.find(name='span', attrs={'class': 'company'})
    location_tag = div.find(name='span', attrs={'class': 'location accessible-contrast-color-location'})
    salary_tag = div.find(attrs={'class': 'salaryText'})
    return [
        _tag_text(title_tag),
        _tag_text(company_tag),
        _tag_text(location_tag, strip_newlines=False),  # location text is kept verbatim
        _tag_text(salary_tag),
    ]


def _fetch_posts(query, page_no):
    """Request one search-result page and return its result cards.

    Returns an empty list when the request fails or the server answers with an
    HTTP error status, after printing the reason.
    """
    try:
        page = requests.get(
            SEARCH_URL,
            # requests URL-encodes the parameters (spaces become '+').
            params={'q': query, 'start': page_no * 10, 'sort': 'date'},
            timeout=REQUEST_TIMEOUT,
        )
        page.raise_for_status()
    except requests.RequestException as err:
        print(f'Request failed for query "{query}", page {page_no+1}: {err}')
        return []

    # page.text is already decoded, so no encoding hint is passed to the parser.
    soup = BeautifulSoup(page.text, "lxml")
    # The original attrs dict listed the 'class' key twice ('row' and 'result');
    # a dict keeps only the last value, so 'result' is the filter that applied.
    return soup.find_all(name='div', attrs={'class': 'result'})


for job in jobs_list:
    # Drop '&' and collapse the whitespace it leaves behind, e.g.
    # 'Branch Operations & Service Manager' -> 'branch operations service manager'.
    query = ' '.join(job.replace('&', ' ').lower().split())

    for page_no in range(MAX_PAGE):
        wait_time = randint(1, 5)
        posts = _fetch_posts(query, page_no)
        total_posts = len(posts)
        print(f'Scraping Job: {job}, Page: {page_no+1}, Wait Time: {wait_time}, Posts: {total_posts}')

        has_more = total_posts >= MIN_POSTS_TO_CONTINUE
        if has_more:
            for div in posts:
                row = _parse_post(div)
                if row is not None:
                    results_list.append([job] + row)

        # Pause after every request, including the last one for a title, so the
        # first request of the next title is not sent immediately.
        time.sleep(wait_time)
        if not has_more:
            break




Scraping Job: Teller, Page: 1, Wait Time: 5, Posts: 10
Scraping Job: Teller, Page: 2, Wait Time: 5, Posts: 10
Scraping Job: Teller, Page: 3, Wait Time: 2, Posts: 10
Scraping Job: Teller, Page: 4, Wait Time: 1, Posts: 10
Scraping Job: Teller, Page: 5, Wait Time: 1, Posts: 10
Scraping Job: Teller, Page: 6, Wait Time: 4, Posts: 5
Scraping Job: Teller, Page: 7, Wait Time: 5, Posts: 10
Scraping Job: Teller, Page: 8, Wait Time: 1, Posts: 10
Scraping Job: Teller, Page: 9, Wait Time: 2, Posts: 10
Scraping Job: Teller, Page: 10, Wait Time: 5, Posts: 10
Scraping Job: Teller, Page: 11, Wait Time: 1, Posts: 10
Scraping Job: Teller, Page: 12, Wait Time: 3, Posts: 10
Scraping Job: Teller, Page: 13, Wait Time: 1, Posts: 10
Scraping Job: Teller, Page: 14, Wait Time: 3, Posts: 10
Scraping Job: Teller, Page: 15, Wait Time: 1, Posts: 10
Scraping Job: Teller, Page: 16, Wait Time: 1, Posts: 10
Scraping Job: Teller, Page: 17, Wait Time: 2, Posts: 10
Scraping Job: Teller, Page: 18, Wait Time: 5, Posts: 10
Sc

Scraping Job: Retail Banking Relationship Manager, Page: 20, Wait Time: 4, Posts: 10
Scraping Job: Pawning Staff, Page: 1, Wait Time: 3, Posts: 0
Scraping Job: Retail Banking Representative, Page: 1, Wait Time: 4, Posts: 2
Scraping Job: Micro Analyst, Page: 1, Wait Time: 1, Posts: 4
Scraping Job: Micro Analyst, Page: 2, Wait Time: 5, Posts: 4
Scraping Job: Micro Analyst, Page: 3, Wait Time: 5, Posts: 4
Scraping Job: Micro Analyst, Page: 4, Wait Time: 4, Posts: 4
Scraping Job: Micro Analyst, Page: 5, Wait Time: 3, Posts: 4
Scraping Job: Micro Analyst, Page: 6, Wait Time: 3, Posts: 4
Scraping Job: Micro Analyst, Page: 7, Wait Time: 2, Posts: 4
Scraping Job: Micro Analyst, Page: 8, Wait Time: 4, Posts: 4
Scraping Job: Micro Analyst, Page: 9, Wait Time: 3, Posts: 4
Scraping Job: Micro Analyst, Page: 10, Wait Time: 1, Posts: 4
Scraping Job: Micro Analyst, Page: 11, Wait Time: 5, Posts: 4
Scraping Job: Micro Analyst, Page: 12, Wait Time: 5, Posts: 4
Scraping Job: Micro Analyst, Page: 13, Wai

Scraping Job: Business Banking Relationship Manager, Page: 20, Wait Time: 2, Posts: 11
Scraping Job: Business Control Staff, Page: 1, Wait Time: 5, Posts: 10
Scraping Job: Business Control Staff, Page: 2, Wait Time: 3, Posts: 10
Scraping Job: Business Control Staff, Page: 3, Wait Time: 5, Posts: 10
Scraping Job: Business Control Staff, Page: 4, Wait Time: 4, Posts: 10
Scraping Job: Business Control Staff, Page: 5, Wait Time: 1, Posts: 10
Scraping Job: Business Control Staff, Page: 6, Wait Time: 3, Posts: 10
Scraping Job: Business Control Staff, Page: 7, Wait Time: 2, Posts: 10
Scraping Job: Business Control Staff, Page: 8, Wait Time: 2, Posts: 10
Scraping Job: Business Control Staff, Page: 9, Wait Time: 5, Posts: 10
Scraping Job: Business Control Staff, Page: 10, Wait Time: 4, Posts: 10
Scraping Job: Business Control Staff, Page: 11, Wait Time: 1, Posts: 10
Scraping Job: Business Control Staff, Page: 12, Wait Time: 5, Posts: 10
Scraping Job: Business Control Staff, Page: 13, Wait Time:

Scraping Job: Commercial Banking Relationship Manager, Page: 17, Wait Time: 4, Posts: 10
Scraping Job: Commercial Banking Relationship Manager, Page: 18, Wait Time: 3, Posts: 10
Scraping Job: Commercial Banking Relationship Manager, Page: 19, Wait Time: 2, Posts: 10
Scraping Job: Commercial Banking Relationship Manager, Page: 20, Wait Time: 1, Posts: 10
Scraping Job: Area Collection & Recovery Manager, Page: 1, Wait Time: 5, Posts: 2
Scraping Job: Business Banking Staff, Page: 1, Wait Time: 5, Posts: 10
Scraping Job: Business Banking Staff, Page: 2, Wait Time: 2, Posts: 10
Scraping Job: Business Banking Staff, Page: 3, Wait Time: 3, Posts: 10
Scraping Job: Business Banking Staff, Page: 4, Wait Time: 2, Posts: 10
Scraping Job: Business Banking Staff, Page: 5, Wait Time: 1, Posts: 10
Scraping Job: Business Banking Staff, Page: 6, Wait Time: 2, Posts: 10
Scraping Job: Business Banking Staff, Page: 7, Wait Time: 2, Posts: 10
Scraping Job: Business Banking Staff, Page: 8, Wait Time: 4, Post

### Saved raw data
Raw data is saved in case anyone wants to process the raw data in another way.

In [None]:
import pickle

# Persist the raw scraped rows so they can be reprocessed later without
# repeating the scrape.
with open('../data/scraped_jobs.pickle', mode='wb') as pickle_file:
    pickle.dump(results_list, pickle_file)

Not many of the jobs scraped have salary data.

In [25]:
# Tabulate the scraped rows and discard exact duplicates.
results_df = (
    pd.DataFrame(
        data=results_list,
        columns=['QUERY', 'TITLE', 'COMPANY', 'LOCATION', 'SALARY'],
    )
    .drop_duplicates()
)
results_df.head(n=10)

Unnamed: 0,QUERY,TITLE,COMPANY,LOCATION,SALARY
0,Teller,Teller,PT BPR Pundi Masyarakat,Batam,
1,Teller,Customer Service Officer (CSO)/Teller (Sidoarjo),BPR Surasari Hutama,Sidoarjo,Rp. 4.000.000 - Rp. 4.250.000 per bulan
2,Teller,Teller Dwidaya Tour - Taman Anggrek,PT. Mutualplus Global Resources,Jakarta,Rp. 4.500.000 - Rp. 4.800.000 per bulan
3,Teller,Admin/Customer Service Officer (CSO),iPhoneBali Store & Service Centre,Kuta,
4,Teller,FRONTLINER,"PT Bank Mega, Tbk",Jakarta,
5,Teller,Teller,PT Bank Danamon Indonesia Tbk,East Java,
6,Teller,Teller,PT BPR Universal,Jakarta,
7,Teller,CUSTOMER SERVICE OFFICER (CSO) - JAKARTA,PT Altrak 1978,Jakarta,Rp. 4.300.000 - Rp. 4.500.000 per bulan
8,Teller,Customer Service / Teller BRI - Program Internship,PT. Mutualplus Global Resources,Jakarta,Rp. 4.200.000 - Rp. 4.500.000 per bulan
9,Teller,Staff Admin Online CSO,agenbajumurah,Tangerang,Rp. 1.700.000 - Rp. 4.300.000 per bulan


### Data Processing
When a salary is given as a range, the mean of its lower and upper bounds is used.

In [26]:
def _parse_salary(raw):
    """Convert a scraped salary string to a single number.

    'Rp. 4.000.000 - Rp. 4.250.000 per bulan' -> 4125000.0 (mean of the range);
    'Rp. 4.500.000 per bulan' -> 4500000.0.

    Values that are already numeric are returned as floats, so the cell can be
    re-run safely. Text that does not reduce to plain numbers (for example a
    salary quoted for another period) yields NaN instead of raising.
    """
    if not isinstance(raw, str):
        return float(raw)
    cleaned = (
        raw.replace('per bulan', '')   # monthly-period suffix
           .replace('Rp.', '')         # currency prefix
           .replace('.', '')           # thousands separators
           .replace(' ', '')
    )
    try:
        return mean([float(bound) for bound in cleaned.split('-')])
    except ValueError:
        return float('nan')


# Keep only postings that state a salary; .copy() makes the result an
# independent frame so the column assignment below does not act on a slice.
results_df = results_df[results_df['SALARY'] != ''].copy()
results_df['SALARY'] = results_df['SALARY'].apply(_parse_salary)

unparsed_count = int(results_df['SALARY'].isna().sum())
if unparsed_count:
    print(f'Dropped {unparsed_count} rows whose salary could not be parsed')

# The original computed reset_index() but discarded the result; keep it.
results_df = results_df.dropna(subset=['SALARY']).reset_index(drop=True)
results_df.head(10)

Unnamed: 0,QUERY,TITLE,COMPANY,LOCATION,SALARY
0,Teller,Customer Service Officer (CSO)/Teller (Sidoarjo),BPR Surasari Hutama,Sidoarjo,4125000.0
1,Teller,Teller Dwidaya Tour - Taman Anggrek,PT. Mutualplus Global Resources,Jakarta,4650000.0
2,Teller,CUSTOMER SERVICE OFFICER (CSO) - JAKARTA,PT Altrak 1978,Jakarta,4400000.0
3,Teller,Customer Service / Teller BRI - Program Internship,PT. Mutualplus Global Resources,Jakarta,4350000.0
4,Teller,Staff Admin Online CSO,agenbajumurah,Tangerang,3000000.0
5,Teller,[LOKER KLATEN] CSO/ADMIN,Robota,Klaten,2250000.0
6,Teller,CUSTOMER SERVICE,PT. SOS Indonesia,Bogor,3850000.0
7,Teller,FRONTLINER BRI (Cibinong),PT. Mutualplus Global Resources,Bogor,4500000.0
8,Teller,Contact Center ANTI Attrition - Jakarta,Mitracomm Ekasarana PT,Jakarta,4500000.0
9,Teller,"Account Officer, Sales Officer, Credit Analys, Marketing Mik...",PT. BPR KREDIT MANDIRI INDONESIA,Majalengka,2250000.0
