# Job Market for Data Professionals - Data Cleaning

In [1]:
# Import libraries 
import pandas as pd
import re
import numpy as np

In [2]:
jobs = pd.read_csv('jobs_all_base.csv')
jobs.head()

Unnamed: 0,Title,Location,Company,Salary,Since,Description,Company Url,Industry,Company size
0,Data Scientist,Den Haag,HAYS,,Net geplaatst,,/cmp/Hays,Human resources en personeel,5.001 tot 10.000
1,Associate Scientist USP,Leiden,HAYS,,Net geplaatst,,/cmp/Hays,Human resources en personeel,5.001 tot 10.000
2,Cloud Engineer AWS,Amsterdam,HAYS,,Net geplaatst,,/cmp/Hays,Human resources en personeel,5.001 tot 10.000
3,Web Analist,Amsterdam Centrum,DPG Media,,Net geplaatst,,/cmp/Dpg-Media,,
4,"Senior Full Stack Software Developer (Python, ...",Utrecht Zuidwest,Zero foodwaste,,Net geplaatst,,,,


In [3]:
jobs.shape

(1500, 9)

In [4]:
jobs.dtypes

Title            object
Location         object
Company          object
Salary           object
Since            object
Description     float64
Company Url      object
Industry         object
Company size     object
dtype: object

### Data Cleaning

In [5]:
# Top 15 uncleaned titles
jobs.Title.unique()

array(['Data Scientist', 'Associate Scientist USP', 'Cloud Engineer AWS',
       ..., 'Data Engineer rol met AWS, Azure en/of Google Cloud',
       'Openstaande Support vacature voor de toppers!',
       'Microsoft Data Platform Consultant (BI)'], dtype=object)

In [6]:
og_titles = []

regular_expressions = [r'data[- ]anal[yi]st', r'data[- ]scien', r'data[- ]engineer', r'business[- ]anal', r'business[- ]intelli', r'machine learning[- ]engineer', r'artificial intelligence[- ]engi',r'translat', r'analytics[- ]consul', r'data[- consult]', r'data']

key_words = ["Data Engineer","Data Scientist","Data Analist","Data Analyst","Business Intelligence", "Business Analist", "Business Analyst", "Artificial Intelligence", "Machine Learning", "Data-"]
    
for title in jobs['Title']:
    for key in regular_expressions:
        match = re.search(key, title.lower())
        if match:
            og_titles.append(title)
            
            

    
    

In [7]:
len(og_titles)

922

In [8]:
sorted(og_titles)

['!Uniek!Meehelpen een nieuw Microsoft datawarehouse opzetten?',
 '(Afstudeer)stages Business Intelligence binnen Asset Managem...',
 '(Junior) Data Engineer (m/v)',
 '(Junior) Data Engineer (m/v)',
 '(Junior) Data Engineer (m/v)',
 '(Lead) engineer installatietechniek (E en/of W) datacenters',
 '(Lead) engineer installatietechniek (E en/of W) datacenters',
 '(Senior) Data Engineer',
 '(Senior) Data Engineer',
 '(Senior) Data Engineer',
 '(Senior) Data Scientist Healthcare',
 '(Senior) Data Scientist Healthcare',
 '(Senior) Data Scientist Healthcare',
 'AI & Data Science Internship at ABN AMRO',
 'AI & Data Science Internship at ABN AMRO',
 'AI & Data Science Internship at ABN AMRO',
 'Aantrekkelijke rol voor een Data Analist met BI-tooling',
 'Aantrekkelijke rol voor een Data Analist met BI-tooling',
 'Aantrekkelijke rol voor een Data Analist met BI-tooling',
 'Afstudeerstage Data Scientist beeldherkenning',
 'Afstudeerstage Data Scientist beeldherkenning',
 'Afstudeerstage Data Scien

In [9]:
og_titles = set(og_titles)

In [10]:
# Examine titles that dont' fall the the categories above

jobs[~jobs['Title'].isin(og_titles)]['Title'].value_counts()

Software Engineer                         8
Business Controller                       7
Informatiemanager                         6
DevOps Engineer                           5
Support Engineer                          4
                                         ..
Concerncontroller (32-36) - via K+V       1
Microsoft SSIS (ETL) ontwikkelaar         1
Junior Inkoper                            1
BIM modelleur Werktuigbouw, Rotterdam     1
Policy Advisor Diversity and Inclusion    1
Name: Title, Length: 985, dtype: int64

We will drop the above titles from our analysis, as they don't fall to the traditional categories of data professionals that we want to examine

In [11]:
jobs = jobs[jobs['Title'].isin(og_titles)]
print(jobs.shape)
jobs.head()

(413, 9)


Unnamed: 0,Title,Location,Company,Salary,Since,Description,Company Url,Industry,Company size
0,Data Scientist,Den Haag,HAYS,,Net geplaatst,,/cmp/Hays,Human resources en personeel,5.001 tot 10.000
10,Director Business Analysis and Planning,Schiphol,Liberty Global,,Vandaag geplaatst,,/cmp/Liberty-Global,,Meer dan 10.000
14,Chapter Lead Data Engineering,Diemen,Tempo-Team,€4.000 - €5.400 per maand,Vandaag geplaatst,,/cmp/Tempo--team,Human resources en personeel,1.001 tot 5.000
15,Database Marketeer,Schiedam,Lefit Recruitment & Interim,,Vandaag geplaatst,,,,
17,Business Analyst with an Asset Management spec...,Utrecht,De Staffing Groep,,Vandaag geplaatst,,/cmp/De-Staffing-Groep,,


In [12]:
# Group data into categories

categories =  ["Data Engineer","Data Scientist", "Data Analyst", "Business Intelligence Dev/Con", "Business Analyst", "Artificial Intelligence", "Machine Learning"]
    
grouped_titles = [] 

for title in jobs['Title']:
    da = re.search(r'\bdata*[\w]+\b.*\banal[yi]st', title.lower())

    ds = re.search(r'data[- ]scien', title.lower())
    
    
    de = re.search(r'\bdata*[\w]+\b.*\bengineer*[\w]+\b.*?$', title.lower())
    ba = re.search(r'business[- \w]+anal|anal[yi]st', title.lower())
    bi = re.search(r'business[- ]intelli|\bbi\b.', title.lower())
    ml = re.search(r'machine learning[- ]engineer', title.lower())
    ai = re.search(r'artiial intelligence[- ]engi', title.lower())
    tr = re.search(r'translat', title.lower())
    cs = re.search(r'consultant', title.lower())
    if da:
        title = "Data Analyst"
    elif ds:
        title = "Data Scientist"
    elif de:
        title = "Data Engineer"
    elif ba:
        title = "Business Analyst"
    elif bi:
        title = "Business Intelligence Consultant/Engineer"
    elif ml:
        title = "Data Engineer"
    elif ai:
        title = "Data Engineer"
    elif tr:
        title = "Translator"
    elif cs:
        title = "Analytics Consultant"
    else:
        title = "Other"
    
    grouped_titles.append(title)
    
        



In [13]:
jobs['Title'].value_counts().head(15)

Data Engineer                                         15
Data Scientist                                        15
Data Analist                                          12
Business Intelligence Specialist                      10
Business Intelligence Consultant                       7
Data analist                                           7
Senior Data Engineer                                   6
Senior Data Scientist                                  4
Data Analyst                                           4
Tech Lead Data Engineering                             3
Business Analist                                       3
Datawarehouse Engineer                                 3
Business Analyst                                       3
Data engineer                                          3
Microsoft Business Intelligence (cloud) Specialist     2
Name: Title, dtype: int64

In [14]:
jobs['Title Grouped'] = grouped_titles

In [15]:
sorted(list(jobs['Title'][jobs['Title Grouped']=='Other']))

['!Uniek!Meehelpen een nieuw Microsoft datawarehouse opzetten?',
 '(Lead) engineer installatietechniek (E en/of W) datacenters',
 'BA Finance/Risk Data',
 'Big Data / Software Traineeship in Amsterdam',
 'Big Data Specialist bij het Spaarne Gasthuis',
 'BigData Engineer met SQL en Python',
 'Business Partner Digital Data',
 'Data Analytics Manager at Customer Support',
 'Data Architect',
 'Data Architect | Cloud | AI | Financiële dienstverlener',
 'Data Architect | Cloud | AI | Financiële dienstverlener | Ro...',
 'Data Controller',
 'Data Governance Officer Tribe Data Management',
 'Data Management Professional (m/f)',
 'Data Partnerships & Outreach Manager, Climate Watch',
 'Data Partnerships & Outreach Manager, Climate Watch',
 'Data Platform Lead',
 'Data Platform Product Owner (Azure)',
 'Data Stewardship Coordinator',
 'Data Subject Matter Expert - Corporate Real Estate',
 'Data Visualization expert',
 'Data and Computer Science Internship',
 'Data architect',
 'Data custodian',


### Industry check-up

In [16]:
jobs.Industry.unique()

array(['Human resources en personeel', nan, 'Transport en vrachtvervoer',
       'Consumentengoederen en dienstverlening',
       'Consultancy en zakelijke dienstverlening',
       'Banken en financiële dienstverlening', 'Detailhandel', 'Overheid',
       'Gezondheidszorg', 'Internet en software', 'Landbouw en mijnbouw',
       'Computers en elektronica', 'Telecommunicatie',
       'OverheidConsultancy en zakelijke dienstverlening',
       'Restaurants, reizen en vrije tijd', 'Farmaceutica',
       'Onroerend goed', 'Energie en nutsbedrijven',
       'Lucht- en ruimtevaartindustrie en defensie',
       'Voedingsmiddelen en dranken',
       'Human resources en personeelConsultancy en zakelijke dienstverlening',
       'Maakindustrie', 'Verzekering'], dtype=object)

In [17]:
jobs[jobs['Industry'].isnull()].groupby('Company').count().sort_values(by='Title', ascending=False)

Unnamed: 0_level_0,Title,Location,Salary,Since,Description,Company Url,Industry,Company size,Title Grouped
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CareerValue,38,38,14,38,0,38,0,0,38
MagnaVersum,9,9,0,9,0,0,0,0,9
Darwin Recruitment,8,8,6,8,0,8,0,8,8
Yacht,6,6,0,6,0,6,0,0,6
My Jewellery,6,6,0,6,0,0,0,0,6
...,...,...,...,...,...,...,...,...,...
KPI Solutions B.V.,1,1,0,1,0,0,0,0,1
Ipreo,1,1,0,1,0,0,0,0,1
International Travel Group,1,1,0,1,0,0,0,0,1
Impact Information Management,1,1,0,1,0,0,0,0,1


### Cleaning salary

In [18]:
min_salary = []
max_salary = []

for sal in jobs['Salary']:
    if type(sal) == str:
        sal = sal.split('-')
        sal_min = re.sub("[^0-9]", "", sal[0])
        if len(sal) == 2:
            sal_max = re.sub("[^0-9]", "", sal[1])
        else:
            sal_max = sal_min
                                               
    else:
        sal_min = 0
        sal_max = 0
    
    min_salary.append(sal_min)
    max_salary.append(sal_max)
    
jobs['Min Salary'] = min_salary
jobs['Max Salary'] = max_salary

jobs.replace(0, np.nan, inplace=True)

jobs['Min Salary'] = jobs['Min Salary'].astype(float)
jobs['Max Salary'] = jobs['Max Salary'].astype(float)



        

        

In [19]:
jobs.head(60)

Unnamed: 0,Title,Location,Company,Salary,Since,Description,Company Url,Industry,Company size,Title Grouped,Min Salary,Max Salary
0,Data Scientist,Den Haag,HAYS,,Net geplaatst,,/cmp/Hays,Human resources en personeel,5.001 tot 10.000,Data Scientist,,
10,Director Business Analysis and Planning,Schiphol,Liberty Global,,Vandaag geplaatst,,/cmp/Liberty-Global,,Meer dan 10.000,Business Analyst,,
14,Chapter Lead Data Engineering,Diemen,Tempo-Team,€4.000 - €5.400 per maand,Vandaag geplaatst,,/cmp/Tempo--team,Human resources en personeel,1.001 tot 5.000,Data Engineer,4000.0,5400.0
15,Database Marketeer,Schiedam,Lefit Recruitment & Interim,,Vandaag geplaatst,,,,,Other,,
17,Business Analyst with an Asset Management spec...,Utrecht,De Staffing Groep,,Vandaag geplaatst,,/cmp/De-Staffing-Groep,,,Business Analyst,,
20,Junior Data Analist Remediation,Diemen,International Card Services,€38.528 - €58.842 per jaar,Vandaag geplaatst,,/cmp/International-Card-Services,,201 tot 500,Data Analyst,38528.0,58842.0
21,Senior Data Engineer BI domein Commercie - Utr...,Utrecht Binnenstad,NS,€5.968 per maand,Vandaag geplaatst,,/cmp/Ns,Transport en vrachtvervoer,Meer dan 10.000,Data Engineer,5968.0,5968.0
23,Data Analist,Dordrecht,Regio Drechtsteden,€4.450 per maand,Vandaag geplaatst,,,,,Data Analyst,4450.0,4450.0
24,Data Engineer,Amsterdam,True Legends,€3.250 - €4.250 per maand,Vandaag geplaatst,,/cmp/True-Legends,Human resources en personeel,51 tot 200,Data Engineer,3250.0,4250.0
25,Data Scientist,Amsterdam,Darwin Recruitment,,Vandaag geplaatst,,/cmp/Darwin-Recruitment,,51 tot 200,Data Scientist,,


In [20]:
jobs.dtypes


Title             object
Location          object
Company           object
Salary            object
Since             object
Description      float64
Company Url       object
Industry          object
Company size      object
Title Grouped     object
Min Salary       float64
Max Salary       float64
dtype: object

In [21]:
def sal_type(x):
    try:
        if re.search('maand', x):
            saltype = 'Monthly'
        elif re.search('jaar', x):
            saltype = 'Annual'
        elif re.search('uur', x):
            saltype = 'Hourly'
        elif re.search('week', x):
            saltype='Weekly'
        return saltype
    except:
        saltype = 'Not Specified'
 
        return saltype

In [22]:
jobs['Salary Type'] = jobs['Salary'].apply(lambda x: sal_type(x)) 

In [23]:
jobs['Salary Type'].value_counts()

Not Specified    297
Monthly           94
Annual            18
Hourly             3
Weekly             1
Name: Salary Type, dtype: int64

In [24]:
def norm_salary(x):
    salary_min = 0
    salary_max = 0
    if x['Salary Type'] == 'Annual':
        salary_min = x['Min Salary']/12
        salary_max = x['Max Salary']/12
    elif x['Salary Type'] == 'Hourly':
        salary_min = x['Min Salary']*160
        salary_max = x['Min Salary']*160
    elif x['Salary Type'] == 'Monthly':
        salary_min = x['Min Salary']
        salary_max = x['Max Salary']
    elif x['Salary Type'] == 'Weekly':
        salary_min = x['Min Salary']*4
        salary_max = x['Min Salary']*4
    return salary_min, salary_max
        

In [25]:
jobs['Normalized Sal Min'] = jobs.apply(lambda x: norm_salary(x)[0], axis=1)
jobs['Normalized Sal Max'] = jobs.apply(lambda x: norm_salary(x)[1], axis=1)
jobs['Normalized Sal Avg'] = (jobs['Normalized Sal Min']+jobs['Normalized Sal Max'])/2

#### Cleaning Days since posted

In [26]:
jobs.Since.value_counts()

Meer dan 30 dagen geleden    99
1 dag geleden                36
2 dagen geleden              36
9 dagen geleden              30
12 dagen geleden             26
5 dagen geleden              26
7 dagen geleden              25
8 dagen geleden              25
Vandaag geplaatst            23
6 dagen geleden              19
20 dagen geleden             16
13 dagen geleden             11
15 dagen geleden              9
19 dagen geleden              8
14 dagen geleden              7
16 dagen geleden              5
21 dagen geleden              3
11 dagen geleden              2
27 dagen geleden              1
4 dagen geleden               1
17 dagen geleden              1
29 dagen geleden              1
18 dagen geleden              1
22 dagen geleden              1
Net geplaatst                 1
Name: Since, dtype: int64

In [27]:
# 


def posted_days(x):
    if x in ['Vandaag geplaatst', 'Net geplaatst']:
        x = 0
    else:
        x = re.sub('[^0-9]', "", x)
        if x == '':
            x = np.nan
    return x 

def posted(x):
    if x <= 7:
        y='This week'
    elif x <= 20:
        y= '2 weeks ago'
    elif x < 27:
        y= '3 weeks ago'
    else:
        y= 'More than 4 weeks ago'
    return y

jobs['Posted_days'] = jobs['Since'].apply(lambda x: posted_days(x))
jobs['Posted_days'] = jobs['Posted_days'].astype(float)
jobs['Posted'] = jobs['Posted_days'].apply(lambda x: posted(x))


In [28]:
jobs.head(60)

Unnamed: 0,Title,Location,Company,Salary,Since,Description,Company Url,Industry,Company size,Title Grouped,Min Salary,Max Salary,Salary Type,Normalized Sal Min,Normalized Sal Max,Normalized Sal Avg,Posted_days,Posted
0,Data Scientist,Den Haag,HAYS,,Net geplaatst,,/cmp/Hays,Human resources en personeel,5.001 tot 10.000,Data Scientist,,,Not Specified,0.0,0.0,0.0,0.0,This week
10,Director Business Analysis and Planning,Schiphol,Liberty Global,,Vandaag geplaatst,,/cmp/Liberty-Global,,Meer dan 10.000,Business Analyst,,,Not Specified,0.0,0.0,0.0,0.0,This week
14,Chapter Lead Data Engineering,Diemen,Tempo-Team,€4.000 - €5.400 per maand,Vandaag geplaatst,,/cmp/Tempo--team,Human resources en personeel,1.001 tot 5.000,Data Engineer,4000.0,5400.0,Monthly,4000.0,5400.0,4700.0,0.0,This week
15,Database Marketeer,Schiedam,Lefit Recruitment & Interim,,Vandaag geplaatst,,,,,Other,,,Not Specified,0.0,0.0,0.0,0.0,This week
17,Business Analyst with an Asset Management spec...,Utrecht,De Staffing Groep,,Vandaag geplaatst,,/cmp/De-Staffing-Groep,,,Business Analyst,,,Not Specified,0.0,0.0,0.0,0.0,This week
20,Junior Data Analist Remediation,Diemen,International Card Services,€38.528 - €58.842 per jaar,Vandaag geplaatst,,/cmp/International-Card-Services,,201 tot 500,Data Analyst,38528.0,58842.0,Annual,3210.666667,4903.5,4057.083333,0.0,This week
21,Senior Data Engineer BI domein Commercie - Utr...,Utrecht Binnenstad,NS,€5.968 per maand,Vandaag geplaatst,,/cmp/Ns,Transport en vrachtvervoer,Meer dan 10.000,Data Engineer,5968.0,5968.0,Monthly,5968.0,5968.0,5968.0,0.0,This week
23,Data Analist,Dordrecht,Regio Drechtsteden,€4.450 per maand,Vandaag geplaatst,,,,,Data Analyst,4450.0,4450.0,Monthly,4450.0,4450.0,4450.0,0.0,This week
24,Data Engineer,Amsterdam,True Legends,€3.250 - €4.250 per maand,Vandaag geplaatst,,/cmp/True-Legends,Human resources en personeel,51 tot 200,Data Engineer,3250.0,4250.0,Monthly,3250.0,4250.0,3750.0,0.0,This week
25,Data Scientist,Amsterdam,Darwin Recruitment,,Vandaag geplaatst,,/cmp/Darwin-Recruitment,,51 tot 200,Data Scientist,,,Not Specified,0.0,0.0,0.0,0.0,This week


## Cleaning Locations

In [29]:
sorted_list = sorted(jobs['Location'].unique())
print(sorted_list)

['Alkmaar', 'Almere', 'Amersfoort', 'Amstelveen', 'Amsterdam', 'Amsterdam Centrum', 'Amsterdam Nieuw-West', 'Amsterdam West', 'Amsterdam Zuid', 'Amsterdam-Zuidoost', 'Apeldoorn', 'Arnhem', 'Baarn', 'Born', 'Breda', 'Breukelen', 'Bronkhorst', 'Capelle aan den IJssel', 'Culemborg', 'Delft', 'Den Bosch', 'Den Haag', 'Den Haag Centrum', 'Deventer', 'Diemen', 'Doesburg', 'Dordrecht', 'Driebergen', 'Driebergen-Rijsenburg', 'Eindhoven', 'Enschede', 'Fijnaart', 'Gemeente Hardinxveld-Giessendam', 'Gooi', 'Groningen', 'Harderwijk', 'Heerlen', 'Hengelo OV', 'Hilversum', 'Hoofddorp', 'Houten', 'Huizen', 'IJsselstein', 'Kampen', 'Laren', 'Leeuwarden', 'Leiden', 'Leusden', 'Maastricht', 'Made', 'Middelburg', 'Naarden', 'Nederland', 'Nieuw-Vennep', 'Nijmegen', 'Oirschot', 'Oss', 'Raalte', 'Randstad', 'Renswoude', 'Rijssen', 'Rijswijk', 'Rotterdam', 'Rotterdam Centrum', 'Rotterdam Feijenoord', 'Rotterdam Maasvlakte', 'Rotterdam Overschie', 'Schiedam', 'Schiphol', 'Slochteren', 'Tiel', 'Tilburg', 'Twen

In [30]:
jobs['Location'].value_counts().sum()

413

In [31]:
big_cities = ['amsterdam', 'utrecht', 'den haag', 'rotterdam']


In [32]:
def city_clean(x):
    x = x.replace('-', ' ')
    for city in big_cities:
        if re.search(city, x.lower()):
            x = x.split(' ')[0]
            if x == 'Den':
                x = 'Den Haag'
    return x

jobs['Location Grouped'] = jobs['Location'].apply(lambda x: city_clean(x))

In [33]:
jobs.Location.value_counts()

Amsterdam                87
Den Bosch                28
Utrecht                  25
Rotterdam                20
Den Haag                 20
                         ..
Driebergen-Rijsenburg     1
Werk van thuis            1
IJsselstein               1
Waalwijk                  1
Naarden                   1
Name: Location, Length: 91, dtype: int64

Final check of the data

In [34]:
jobs.head()

Unnamed: 0,Title,Location,Company,Salary,Since,Description,Company Url,Industry,Company size,Title Grouped,Min Salary,Max Salary,Salary Type,Normalized Sal Min,Normalized Sal Max,Normalized Sal Avg,Posted_days,Posted,Location Grouped
0,Data Scientist,Den Haag,HAYS,,Net geplaatst,,/cmp/Hays,Human resources en personeel,5.001 tot 10.000,Data Scientist,,,Not Specified,0.0,0.0,0.0,0.0,This week,Den Haag
10,Director Business Analysis and Planning,Schiphol,Liberty Global,,Vandaag geplaatst,,/cmp/Liberty-Global,,Meer dan 10.000,Business Analyst,,,Not Specified,0.0,0.0,0.0,0.0,This week,Schiphol
14,Chapter Lead Data Engineering,Diemen,Tempo-Team,€4.000 - €5.400 per maand,Vandaag geplaatst,,/cmp/Tempo--team,Human resources en personeel,1.001 tot 5.000,Data Engineer,4000.0,5400.0,Monthly,4000.0,5400.0,4700.0,0.0,This week,Diemen
15,Database Marketeer,Schiedam,Lefit Recruitment & Interim,,Vandaag geplaatst,,,,,Other,,,Not Specified,0.0,0.0,0.0,0.0,This week,Schiedam
17,Business Analyst with an Asset Management spec...,Utrecht,De Staffing Groep,,Vandaag geplaatst,,/cmp/De-Staffing-Groep,,,Business Analyst,,,Not Specified,0.0,0.0,0.0,0.0,This week,Utrecht


In [35]:
jobs.drop(columns={'Description', 'Company Url'}, inplace=True)

#### Cleaning Industries


In [36]:
jobs[jobs['Industry'].isnull()].groupby('Company').count().sort_values(by='Title', ascending=False)

Unnamed: 0_level_0,Title,Location,Salary,Since,Industry,Company size,Title Grouped,Min Salary,Max Salary,Salary Type,Normalized Sal Min,Normalized Sal Max,Normalized Sal Avg,Posted_days,Posted,Location Grouped
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
CareerValue,38,38,14,38,0,0,38,14,14,38,38,38,38,38,38,38
MagnaVersum,9,9,0,9,0,0,9,0,0,9,9,9,9,9,9,9
Darwin Recruitment,8,8,6,8,0,8,8,6,6,8,8,8,8,8,8,8
Yacht,6,6,0,6,0,0,6,0,0,6,6,6,6,6,6,6
My Jewellery,6,6,0,6,0,0,6,0,0,6,6,6,6,6,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KPI Solutions B.V.,1,1,0,1,0,0,1,0,0,1,1,1,1,1,1,1
Ipreo,1,1,0,1,0,0,1,0,0,1,1,1,1,1,1,1
International Travel Group,1,1,0,1,0,0,1,0,0,1,1,1,1,1,1,1
Impact Information Management,1,1,0,1,0,0,1,0,0,1,1,1,1,1,1,1


In [37]:
jobs.to_csv('jobs_cleaned_final.csv', index=False)

In [38]:
# import required libraries to connet to MySQL Database
import pymysql 
from sqlalchemy import create_engine  


In [39]:
engine = create_engine('mysql+pymysql://root:xxxxx@localhost:3306/data_jobs_nl')


In [40]:
jobs.to_sql('jobs_last_30', engine, if_exists='replace', index=False)
