# Exploratory Data Analysis

In [71]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn.linear_model import LogisticRegression

In [2]:
jobs = pd.read_csv("scrapedjobs.csv")
                   
jobs.head()

Unnamed: 0.1,Unnamed: 0,company,title,location,seniority,category,salary,salary_period
0,0,GOOGLE ASIA PACIFIC PTE. LTD.,"Data Science Lead, Large Customer Sales - Sing...",South,Executive,Information Technology,"$7,700to$15,400",Monthly
1,1,KIMBERLEY CONSULTING PTE. LTD.,Data Analyst,South,Executive,Information Technology,Salary undisclosed,
2,2,DBS BANK LTD.,"AVP, Data Scientist, Business Analytics, Consu...",Central,Executive,Information Technology,"$6,500to$11,700",Monthly
3,3,NIOMETRICS (PTE.) LTD.,High-Performance Data Engineer,Central,Executive,Information Technology,"$5,500to$11,000",Monthly
4,4,NIOMETRICS (PTE.) LTD.,Data Scientist,East,Senior Management,Information Technology,"$5,000to$10,000",Monthly


In [3]:
jobs.tail()

Unnamed: 0.1,Unnamed: 0,company,title,location,seniority,category,salary,salary_period
1395,1395,Company Undisclosed,Senior Software Developer,South,Executive,F&B,Salary undisclosed,
1396,1396,Company Undisclosed,Software Developer,Central,Junior Executive,Information Technology,Salary undisclosed,
1397,1397,Accountant-General's Department,Head (Data Architect),Central,Junior Executive,Information Technology,Salary undisclosed,
1398,1398,CAPGEMINI SINGAPORE PTE. LTD.,Data Engineer,Islandwide,Fresh/entry level,Information Technology,"$5,000to$7,500",Monthly
1399,1399,CAPGEMINI SINGAPORE PTE. LTD.,Data Manager,Islandwide,Fresh/entry level,Information Technology,"$5,000to$7,500",Monthly


In [4]:
# doing initial checks on data

jobs.isnull().sum()

Unnamed: 0         0
company            0
title              0
location           0
seniority          0
category           0
salary             0
salary_period    138
dtype: int64

In [5]:
# dropping 'unknown' column and rows where we do not have salary 

jobs.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
jobs.dropna(inplace=True)
jobs.head()

Unnamed: 0,company,title,location,seniority,category,salary,salary_period
0,GOOGLE ASIA PACIFIC PTE. LTD.,"Data Science Lead, Large Customer Sales - Sing...",South,Executive,Information Technology,"$7,700to$15,400",Monthly
2,DBS BANK LTD.,"AVP, Data Scientist, Business Analytics, Consu...",Central,Executive,Information Technology,"$6,500to$11,700",Monthly
3,NIOMETRICS (PTE.) LTD.,High-Performance Data Engineer,Central,Executive,Information Technology,"$5,500to$11,000",Monthly
4,NIOMETRICS (PTE.) LTD.,Data Scientist,East,Senior Management,Information Technology,"$5,000to$10,000",Monthly
5,NATIONAL UNIVERSITY OF SINGAPORE,Data Scientist,East,Senior Management,Information Technology,"$5,000to$7,500",Monthly


In [7]:
jobs.shape

(1262, 7)

In [8]:
#  so this will be the final dataframe I want to use for my analysis
#  On initial observation , all the data is text
#  need to parse the salary data (it is in range) to minimum and maximum salary
#  and then take the average to make my target salary


# new data frame with split value columns 
new = jobs["salary"].str.split("to", n = 1, expand = True) 
  
# making seperate first name column from new data frame 
jobs["lower salary"]= new[0] 
  
# making seperate last name column from new data frame 
jobs["higher salary"]= new[1] 
  
# Dropping old Name columns 
jobs.drop(columns =["salary"], inplace = True) 

jobs.head()

Unnamed: 0,company,title,location,seniority,category,salary_period,lower salary,higher salary
0,GOOGLE ASIA PACIFIC PTE. LTD.,"Data Science Lead, Large Customer Sales - Sing...",South,Executive,Information Technology,Monthly,"$7,700","$15,400"
2,DBS BANK LTD.,"AVP, Data Scientist, Business Analytics, Consu...",Central,Executive,Information Technology,Monthly,"$6,500","$11,700"
3,NIOMETRICS (PTE.) LTD.,High-Performance Data Engineer,Central,Executive,Information Technology,Monthly,"$5,500","$11,000"
4,NIOMETRICS (PTE.) LTD.,Data Scientist,East,Senior Management,Information Technology,Monthly,"$5,000","$10,000"
5,NATIONAL UNIVERSITY OF SINGAPORE,Data Scientist,East,Senior Management,Information Technology,Monthly,"$5,000","$7,500"


In [9]:
# counting the title frequencies

jobs["title"].value_counts()


Data Scientist                                                        24
Accounts Executive                                                    12
Accountant                                                             9
Research Fellow                                                        8
Software Engineer                                                      8
Data Analyst                                                           8
Software Consultant                                                    7
Senior Manager                                                         7
Administrative Assistant                                               7
Business Analyst                                                       7
Research Assistant                                                     7
Production Planner (5 days, Clementi,  $1800-2000)                     6
Manager                                                                6
Accounts  Officer (3-6 months contract, Science Par

In [10]:

jobs["company"].value_counts()

Company Undisclosed                                           151
NATIONAL UNIVERSITY OF SINGAPORE                               27
THE SUPREME HR ADVISORY PTE. LTD.                              24
GOVERNMENT TECHNOLOGY AGENCY                                   20
NANYANG TECHNOLOGICAL UNIVERSITY                               20
ERNST & YOUNG ADVISORY PTE. LTD.                               20
MACHSPEED HUMAN RESOURCES PTE. LTD.                            17
GOOGLE ASIA PACIFIC PTE. LTD.                                  16
OPTIMUM SOLUTIONS (SINGAPORE) PTE LTD                          15
A*STAR RESEARCH ENTITIES                                       13
LIBERTY WIRELESS PTE. LTD.                                     12
MANPOWER STAFFING SERVICES (SINGAPORE) PTE LTD                 11
CHARTERHOUSE PTE. LTD.                                         11
DBS BANK LTD.                                                  10
NIOMETRICS (PTE.) LTD.                                         10
Institute 

In [11]:
jobs["seniority"].unique()

array(['Executive', 'Senior Management', 'Professional', 'Non-executive',
       'Junior Executive', 'Senior Executive', 'Executive ...',
       'Fresh/entry level ...', 'Manager', 'Fresh/entry level',
       'Manager ...', 'Senior Management ...', 'Middle Management',
       'Professional ...', 'Middle Management ...', 'Non-executive ...'],
      dtype=object)

In [12]:
# counting the title frequencies

jobs["category"].value_counts()


Information Technology                  327
Engineering                             102
Accounting / Auditing / Taxation         69
Engineering ...                          65
Banking and Finance                      53
Others                                   51
Information Technology ...               50
Human Resources                          46
Sciences / Laboratory / R&D              39
Admin / Secretarial                      34
Accounting / Auditing / Taxation ...     31
Advertising / Media                      29
Marketing / Public Relations             27
Sales / Retail                           24
Consulting  ...                          24
Admin / Secretarial ...                  24
Consulting                               23
Logistics / Supply Chain                 21
Logistics / Supply Chain ...             16
Public / Civil Service                   16
Building and Construction                15
Banking and Finance ...                  13
Advertising / Media  ...        

In [13]:
jobs["category"] = (jobs["category"].replace('\.','', regex=True).replace(' ','',regex=True))
                        
jobs["category"].value_counts()

InformationTechnology            377
Engineering                      167
Accounting/Auditing/Taxation     100
BankingandFinance                 66
Others                            58
Admin/Secretarial                 58
HumanResources                    51
Consulting                        47
Advertising/Media                 42
Sciences/Laboratory/R&D           39
Logistics/SupplyChain             37
Marketing/PublicRelations         34
Sales/Retail                      24
CustomerService                   23
BuildingandConstruction           19
EducationandTraining              16
Public/CivilService               16
Design                            11
Manufacturing                     10
F&B                                9
Telecommunications                 7
Purchasing/Merchandising           6
RepairandMaintenance               6
Hospitality                        6
Healthcare/Pharmaceutical          6
GeneralManagement                  6
SocialServices                     5
E

In [14]:
jobs["seniority"] = (jobs["seniority"].replace('\.','', regex=True).replace(' ','',regex=True))
                        
jobs["seniority"].value_counts()




Executive           302
Professional        224
Manager             158
SeniorExecutive     139
Non-executive        99
JuniorExecutive      96
Fresh/entrylevel     96
MiddleManagement     91
SeniorManagement     57
Name: seniority, dtype: int64

In [25]:
jobs["location"].value_counts()

Central                 491
East                    189
South                   179
West                    162
Islandwide              119
North                    89
East, Central            11
West, Central             4
North, Central            4
South, Central            3
South, East, Central      3
East, West, Central       2
North, East               2
North, West               2
South, East               2
Name: location, dtype: int64

In [29]:

jobs['location_new'] = jobs['location'].str.split(',').str[0]

jobs.head()



Unnamed: 0,company,title,location,seniority,category,salary_period,lower salary,higher salary,Average_salary,above median,location_new
0,GOOGLE ASIA PACIFIC PTE. LTD.,"Data Science Lead, Large Customer Sales - Sing...",South,Executive,InformationTechnology,Monthly,7700.0,15400.0,11550.0,1,South
2,DBS BANK LTD.,"AVP, Data Scientist, Business Analytics, Consu...",Central,Executive,InformationTechnology,Monthly,6500.0,11700.0,9100.0,1,Central
3,NIOMETRICS (PTE.) LTD.,High-Performance Data Engineer,Central,Executive,InformationTechnology,Monthly,5500.0,11000.0,8250.0,1,Central
4,NIOMETRICS (PTE.) LTD.,Data Scientist,East,SeniorManagement,InformationTechnology,Monthly,5000.0,10000.0,7500.0,1,East
5,NATIONAL UNIVERSITY OF SINGAPORE,Data Scientist,East,SeniorManagement,InformationTechnology,Monthly,5000.0,7500.0,6250.0,1,East


In [27]:
jobs["location"].value_counts()

Central                 491
East                    189
South                   179
West                    162
Islandwide              119
North                    89
East, Central            11
West, Central             4
North, Central            4
South, Central            3
South, East, Central      3
East, West, Central       2
North, East               2
North, West               2
South, East               2
Name: location, dtype: int64

In [30]:

jobs.drop('location', axis=1, inplace=True)



In [31]:
jobs["location_new"].value_counts()

Central       491
East          202
South         187
West          166
Islandwide    119
North          97
Name: location_new, dtype: int64

In [15]:
             
jobs['higher salary'] = (jobs['higher salary'].replace('\$','', regex=True).replace(',','',regex=True).astype(float))

jobs["higher salary"].value_counts()


8000.0      96
6000.0      85
7000.0      77
10000.0     75
9000.0      54
4000.0      51
5000.0      49
3000.0      49
4500.0      42
6500.0      42
15000.0     40
12000.0     40
3500.0      35
2000.0      35
7500.0      30
2500.0      29
11000.0     29
8500.0      28
5500.0      27
6800.0      17
16000.0     17
2800.0      16
13000.0     14
3800.0      13
2200.0      12
2400.0      12
14000.0     11
20000.0     11
1800.0      11
9500.0       8
            ..
130000.0     1
6700.0       1
85000.0      1
65000.0      1
33000.0      1
4850.0       1
240000.0     1
4050.0       1
215000.0     1
60000.0      1
3100.0       1
5950.0       1
8300.0       1
3900.0       1
18300.0      1
164040.0     1
4251.0       1
83160.0      1
21500.0      1
1000.0       1
12750.0      1
7916.0       1
2900.0       1
246500.0     1
15800.0      1
79000.0      1
142320.0     1
4300.0       1
1880.0       1
12600.0      1
Name: higher salary, Length: 149, dtype: int64

In [17]:
                     
jobs['lower salary'] = (jobs['lower salary'].replace('\$','', regex=True).replace(',','', regex=True).astype(float))

jobs["lower salary"].value_counts()

5000.0      174
6000.0      115
3000.0       91
4000.0       88
2000.0       72
8000.0       54
3500.0       54
4500.0       53
2500.0       47
7000.0       43
1800.0       41
5500.0       35
1500.0       24
6500.0       23
10000.0      23
2800.0       20
9000.0       19
3400.0       18
1600.0       15
12000.0      14
2200.0       13
7500.0       12
2400.0       11
2300.0       10
3800.0        9
4100.0        8
3600.0        8
15000.0       8
3200.0        7
8500.0        6
           ... 
173000.0      1
5200.0        1
6900.0        1
4540.0        1
300000.0      1
12600.0       1
6300.0        1
4700.0        1
71160.0       1
132400.0      1
5100.0        1
100000.0      1
7800.0        1
20800.0       1
5830.0        1
10750.0       1
30000.0       1
3900.0        1
7300.0        1
1530.0        1
57000.0       1
143000.0      1
164500.0      1
160000.0      1
900.0         1
3333.0        1
3850.0        1
120000.0      1
13750.0       1
6440.0        1
Name: lower salary, Leng

In [32]:
jobs['Average_salary'] = (jobs['lower salary'] + jobs['higher salary'])/2


jobs.head()

Unnamed: 0,company,title,seniority,category,salary_period,lower salary,higher salary,Average_salary,above median,location_new
0,GOOGLE ASIA PACIFIC PTE. LTD.,"Data Science Lead, Large Customer Sales - Sing...",Executive,InformationTechnology,Monthly,7700.0,15400.0,11550.0,1,South
2,DBS BANK LTD.,"AVP, Data Scientist, Business Analytics, Consu...",Executive,InformationTechnology,Monthly,6500.0,11700.0,9100.0,1,Central
3,NIOMETRICS (PTE.) LTD.,High-Performance Data Engineer,Executive,InformationTechnology,Monthly,5500.0,11000.0,8250.0,1,Central
4,NIOMETRICS (PTE.) LTD.,Data Scientist,SeniorManagement,InformationTechnology,Monthly,5000.0,10000.0,7500.0,1,East
5,NATIONAL UNIVERSITY OF SINGAPORE,Data Scientist,SeniorManagement,InformationTechnology,Monthly,5000.0,7500.0,6250.0,1,East


In [19]:
median = jobs['Average_salary'].median()
median

5862.5

In [21]:
# defining higher salary as >median and encoding it as 1 and lower salary <= median, and encoding as 0


def above_median(x):
    if x > median:
        return 1
    return 0

jobs['above median'] = jobs['Average_salary'].apply(above_median)
jobs.head()


Unnamed: 0,company,title,location,seniority,category,salary_period,lower salary,higher salary,Average_salary,above median
0,GOOGLE ASIA PACIFIC PTE. LTD.,"Data Science Lead, Large Customer Sales - Sing...",South,Executive,InformationTechnology,Monthly,7700.0,15400.0,11550.0,1
2,DBS BANK LTD.,"AVP, Data Scientist, Business Analytics, Consu...",Central,Executive,InformationTechnology,Monthly,6500.0,11700.0,9100.0,1
3,NIOMETRICS (PTE.) LTD.,High-Performance Data Engineer,Central,Executive,InformationTechnology,Monthly,5500.0,11000.0,8250.0,1
4,NIOMETRICS (PTE.) LTD.,Data Scientist,East,SeniorManagement,InformationTechnology,Monthly,5000.0,10000.0,7500.0,1
5,NATIONAL UNIVERSITY OF SINGAPORE,Data Scientist,East,SeniorManagement,InformationTechnology,Monthly,5000.0,7500.0,6250.0,1


In [33]:
jobs.tail()

Unnamed: 0,company,title,seniority,category,salary_period,lower salary,higher salary,Average_salary,above median,location_new
1392,AVANADE ASIA PTE LTD,"Senior Analyst, Digital Solution Development",Professional,Engineering,Monthly,4000.0,8000.0,6000.0,1,South
1393,GENIE ESTATE MANAGEMENT,Accounts and Admin Assistant,Professional,Engineering,Monthly,1800.0,2200.0,2000.0,0,South
1394,THE SUPREME HR ADVISORY PTE. LTD.,Aerospace Technician [CAAS / Pneumatic / Tec...,Executive,F&B,Monthly,2000.0,2400.0,2200.0,0,South
1398,CAPGEMINI SINGAPORE PTE. LTD.,Data Engineer,Fresh/entrylevel,InformationTechnology,Monthly,5000.0,7500.0,6250.0,1,Islandwide
1399,CAPGEMINI SINGAPORE PTE. LTD.,Data Manager,Fresh/entrylevel,InformationTechnology,Monthly,5000.0,7500.0,6250.0,1,Islandwide


In [23]:
jobs['above median'].value_counts()


1    631
0    631
Name: above median, dtype: int64

#  modelling

In [34]:
#  I will choose location_new	seniority	category	, as features for classifying salary

# first I need to dummy encode the categorical variable 

jobs = pd.get_dummies(jobs,columns=['location_new','seniority','category'], drop_first=True)
jobs.head(1)

Unnamed: 0,company,title,salary_period,lower salary,higher salary,Average_salary,above median,location_new_East,location_new_Islandwide,location_new_North,...,category_ProfessionalServices,category_Public/CivilService,category_Purchasing/Merchandising,category_RealEstate/PropertyManagement,category_RepairandMaintenance,category_RiskManagement,category_Sales/Retail,category_Sciences/Laboratory/R&D,category_SocialServices,category_Telecommunications
0,GOOGLE ASIA PACIFIC PTE. LTD.,"Data Science Lead, Large Customer Sales - Sing...",Monthly,7700.0,15400.0,11550.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
#  defining X (features)and y(target)
print(jobs.columns.get_loc("above median"))
y = jobs.iloc[:,6].values

X = jobs.iloc[:,7:]
X


6


Unnamed: 0,location_new_East,location_new_Islandwide,location_new_North,location_new_South,location_new_West,seniority_Fresh/entrylevel,seniority_JuniorExecutive,seniority_Manager,seniority_MiddleManagement,seniority_Non-executive,...,category_ProfessionalServices,category_Public/CivilService,category_Purchasing/Merchandising,category_RealEstate/PropertyManagement,category_RepairandMaintenance,category_RiskManagement,category_Sales/Retail,category_Sciences/Laboratory/R&D,category_SocialServices,category_Telecommunications
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [57]:
#  fitting KNN CLssification model

from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

# make an instance of a KNeighborsClassifier object with 10 neighbor
knn = KNeighborsClassifier(n_neighbors=10)

# fit on the unstandardized data:
knn.fit(X, y)



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [58]:
# predict the response values for the observations in X ("test the model")
# store the predicted response values
y_pred_class = knn.predict(X)

# compute classification accuracy
from sklearn import metrics

print('accuracy = {}'.format(metrics.accuracy_score(y, y_pred_class)))

accuracy = 0.6101426307448494


In [56]:
from sklearn.model_selection import train_test_split

# STEP 1: split X and y into training and testing sets (using random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99, test_size=0.3)

# STEP 2: train the model on the training set (using K=1)
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

# STEP 3: test the model on the testing set, and check the accuracy
y_pred_class = knn.predict(X_test)
print('accuracy = {}'.format(metrics.accuracy_score(y_test, y_pred_class)))


accuracy = 0.44327176781002636


In [65]:

# trying gradient boosting

from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X, y)
cv_model = cross_val_score(clf, X, y, cv=10)
print ('Cross-validated scores:', cv_model)
print ('Average score:', cv_model.mean())


Cross-validated scores: [0.3984375  0.47619048 0.47619048 0.46825397 0.49206349 0.56349206
 0.56349206 0.47619048 0.44444444 0.47619048]
Average score: 0.4834945436507936


In [67]:
# trying ensemble modelling


model = RandomForestClassifier(n_estimators=100)

cv_model = cross_val_score(model, X, y, cv=6)
print ('Cross-validated scores:', cv_model)
print ('Average score:', cv_model.mean())
model.fit(X, y)





Cross-validated scores: [0.48584906 0.43809524 0.44761905 0.49047619 0.46666667 0.46190476]
Average score: 0.46510182689427976


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [69]:
#  trying support vector machine model
from sklearn.svm import SVC
model_svmrbf = SVC(kernel='rbf')
scores_svm = cross_val_score(model_svmrbf, X, y, cv=6)
print ('Cross-validated scores:', scores_svm)
print ('Average score:', scores_svm.mean())
svm_model = model_svmrbf.fit(X, y)

Cross-validated scores: [0.46226415 0.45714286 0.46666667 0.52380952 0.45714286 0.48571429]
Average score: 0.47545672356993113


In [70]:
# trying gradient boosting

from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X, y)
cv_model = cross_val_score(clf, X, y, cv=6)
print ('Cross-validated scores:', cv_model)
print ('Average score:', cv_model.mean())


Cross-validated scores: [0.44339623 0.47619048 0.51428571 0.57619048 0.4952381  0.48571429]
Average score: 0.498502545672357


In [None]:
#  inference: all my scores are lower than the baseline, which shows I need to perform a better EDA and 
# scrape more relevant data