In [1]:
import pandas as pd
import numpy as np
import scipy as sp

import os
import re
from pprint import pprint
from collections import Counter


## Data Set

In [5]:
# Job Skill
df_job_skills = pd.read_csv("job_skills.csv")
df_job_skills.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296381 entries, 0 to 1296380
Data columns (total 2 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   job_link    1296381 non-null  object
 1   job_skills  1294346 non-null  object
dtypes: object(2)
memory usage: 19.8+ MB


In [None]:
# Job Summary
df_job_summary = pd.read_csv("job_summary.csv")
df_job_summary.info()

In [6]:
# Job posting
df_job_posting = pd.read_csv("linkedin_job_postings.csv")

df_job_posting['got_summary'] = df_job_posting['got_summary'].map(lambda x: True if x=='t' else False)
df_job_posting['got_ner'] = df_job_posting['got_ner'].map(lambda x: True if x=='t' else False)
df_job_posting['is_being_worked'] = df_job_posting['is_being_worked'].map(lambda x: True if x=='t' else False)

df_job_posting.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1348454 entries, 0 to 1348453
Data columns (total 14 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   job_link             1348454 non-null  object
 1   last_processed_time  1348454 non-null  object
 2   got_summary          1348454 non-null  bool  
 3   got_ner              1348454 non-null  bool  
 4   is_being_worked      1348454 non-null  bool  
 5   job_title            1348454 non-null  object
 6   company              1348443 non-null  object
 7   job_location         1348435 non-null  object
 8   first_seen           1348454 non-null  object
 9   search_city          1348454 non-null  object
 10  search_country       1348454 non-null  object
 11  search_position      1348454 non-null  object
 12  job_level            1348454 non-null  object
 13  job_type             1348454 non-null  object
dtypes: bool(3), object(11)
memory usage: 117.0+ MB


## Data Cleaning

In [26]:
# df_summary['job_link_cleaned'] = df_summary['job_link'].map(lambda x: x.split('-')[-1])
df_job_skills['job_link_cleaned'] = df_job_skills['job_link'].map(lambda x: x.split('-')[-1])
df_job_posting['job_link_cleaned'] = df_job_posting['job_link'].map(lambda x: x.split('-')[-1])


In [27]:
df_job_skills['job_link_cleaned'].value_counts()

3797940160    4
3799818268    4
3802713233    3
3801752291    3
3804693163    3
             ..
3787531011    1
3775036792    1
3774243908    1
3789232559    1
3805285228    1
Name: job_link_cleaned, Length: 1294972, dtype: int64

In [28]:
df_job_posting['job_link_cleaned'].value_counts()

jobs          664
3797940160      5
3804421891      4
3799818270      4
3799818268      4
             ... 
3802284356      1
3640175459      1
3795536727      1
3770914813      1
3734494804      1
Name: job_link_cleaned, Length: 1346169, dtype: int64

In [29]:
df_job_skills = df_job_skills.drop_duplicates(['job_link_cleaned'], keep='first')
df_job_posting = df_job_posting.drop_duplicates(['job_link_cleaned'], keep='first')

In [30]:
print(df_job_skills.shape)
print(df_job_posting.shape)

(1294972, 3)
(1346169, 15)


## Merge

In [31]:
# df = df_summary.merge(df_job_skills, on="job_link_cleaned", how="inner")
df = df_job_posting.merge(df_job_skills, on="job_link_cleaned", how="inner")

df = df.drop(columns = ['job_link_x', 'job_link_y'])

df = df[df['job_skills'].notnull()]

df = df.reset_index(drop=True)

print(df.shape)
df.head()

(1292937, 15)


Unnamed: 0,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_link_cleaned,job_skills
0,2024-01-21 07:12:29.00256+00,True,True,False,Account Executive - Dispensing (NorCal/Norther...,BD,"San Diego, CA",2024-01-15,Coronado,United States,Color Maker,Mid senior,Onsite,3802078767,"Medical equipment sales, Key competitors, Term..."
1,2024-01-21 07:39:58.88137+00,True,True,False,Registered Nurse - RN Care Manager,Trinity Health MI,"Norton Shores, MI",2024-01-14,Grand Haven,United States,Director Nursing Service,Mid senior,Onsite,3803386312,"Nursing, Bachelor of Science in Nursing, Maste..."
2,2024-01-21 07:40:00.251126+00,True,True,False,RESTAURANT SUPERVISOR - THE FORKLIFT,Wasatch Adaptive Sports,"Sandy, UT",2024-01-14,Tooele,United States,Stand-In,Mid senior,Onsite,3771464419,"Restaurant Operations Management, Inventory Ma..."
3,2024-01-21 07:40:00.308133+00,True,True,False,Independent Real Estate Agent,Howard Hanna | Rand Realty,"Englewood Cliffs, NJ",2024-01-16,Pinehurst,United States,Real-Estate Clerk,Mid senior,Onsite,3797661348,"Real Estate, Customer Service, Sales, Negotiat..."
4,2024-01-21 08:08:19.663033+00,True,True,False,Registered Nurse (RN),Trinity Health MI,"Muskegon, MI",2024-01-14,Muskegon,United States,Nurse Practitioner,Mid senior,Onsite,3790954711,"Nursing, BSN, Medical License, Virtual RN, Nur..."


In [32]:
# Reordering the columns with "job_link_cleaned" as the first column
df = df.reindex(columns=['job_link_cleaned', 'last_processed_time', 'got_summary', 'got_ner', 'is_being_worked', 'job_title', 'company', 'job_location', 'first_seen', 'search_city', 'search_country', 'search_position', 'job_level', 'job_type', 'job_skills'])

print(df.shape)
df.head()

(1292937, 15)


Unnamed: 0,job_link_cleaned,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills
0,3802078767,2024-01-21 07:12:29.00256+00,True,True,False,Account Executive - Dispensing (NorCal/Norther...,BD,"San Diego, CA",2024-01-15,Coronado,United States,Color Maker,Mid senior,Onsite,"Medical equipment sales, Key competitors, Term..."
1,3803386312,2024-01-21 07:39:58.88137+00,True,True,False,Registered Nurse - RN Care Manager,Trinity Health MI,"Norton Shores, MI",2024-01-14,Grand Haven,United States,Director Nursing Service,Mid senior,Onsite,"Nursing, Bachelor of Science in Nursing, Maste..."
2,3771464419,2024-01-21 07:40:00.251126+00,True,True,False,RESTAURANT SUPERVISOR - THE FORKLIFT,Wasatch Adaptive Sports,"Sandy, UT",2024-01-14,Tooele,United States,Stand-In,Mid senior,Onsite,"Restaurant Operations Management, Inventory Ma..."
3,3797661348,2024-01-21 07:40:00.308133+00,True,True,False,Independent Real Estate Agent,Howard Hanna | Rand Realty,"Englewood Cliffs, NJ",2024-01-16,Pinehurst,United States,Real-Estate Clerk,Mid senior,Onsite,"Real Estate, Customer Service, Sales, Negotiat..."
4,3790954711,2024-01-21 08:08:19.663033+00,True,True,False,Registered Nurse (RN),Trinity Health MI,"Muskegon, MI",2024-01-14,Muskegon,United States,Nurse Practitioner,Mid senior,Onsite,"Nursing, BSN, Medical License, Virtual RN, Nur..."


# Basic Data Cleaning

Perform basic data cleaning to remove unicode characters and Markdown that might cause issues when exporting

In [34]:
# Function to clean strings
def clean_text(text):

    if pd.isna(text):
        return ''
        
    # Remove extra spaces and tabs
    cleaned_text = re.sub(r'\s+', ' ', text)

    # Remove Unicode characters and Markdown
    cleaned_text = re.sub(r'[^\x00-\x7F\\]+', ' ', cleaned_text)

    return cleaned_text.strip()

# List of columns to clean (string type)
columns_to_clean = ['job_title', 'company', 'job_location', 'search_city', 'search_country',
                    'search_position', 'job_level', 'job_type', 'job_skills']

# Clean the specified columns
for col in columns_to_clean:
    df[col] = df[col].apply(clean_text)

# Now 'df' contains the cleaned columns
# Reset index
df.reset_index(drop=True, inplace=True)

In [35]:
# Export the DataFrame to a CSV file
df.to_csv('linkedin_job_posts_skills.csv', index=True, index_label='sn')  # also export the index

# This CSV file will be imported into a Spark dataframe to build the job recommender (seperate notebook)

In [36]:
datata = pd.read_csv('linkedin_job_posts_skills.csv')

In [37]:
datata

Unnamed: 0,sn,job_link_cleaned,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills
0,0,3802078767,2024-01-21 07:12:29.00256+00,True,True,False,Account Executive - Dispensing (NorCal/Norther...,BD,"San Diego, CA",2024-01-15,Coronado,United States,Color Maker,Mid senior,Onsite,"Medical equipment sales, Key competitors, Term..."
1,1,3803386312,2024-01-21 07:39:58.88137+00,True,True,False,Registered Nurse - RN Care Manager,Trinity Health MI,"Norton Shores, MI",2024-01-14,Grand Haven,United States,Director Nursing Service,Mid senior,Onsite,"Nursing, Bachelor of Science in Nursing, Maste..."
2,2,3771464419,2024-01-21 07:40:00.251126+00,True,True,False,RESTAURANT SUPERVISOR - THE FORKLIFT,Wasatch Adaptive Sports,"Sandy, UT",2024-01-14,Tooele,United States,Stand-In,Mid senior,Onsite,"Restaurant Operations Management, Inventory Ma..."
3,3,3797661348,2024-01-21 07:40:00.308133+00,True,True,False,Independent Real Estate Agent,Howard Hanna | Rand Realty,"Englewood Cliffs, NJ",2024-01-16,Pinehurst,United States,Real-Estate Clerk,Mid senior,Onsite,"Real Estate, Customer Service, Sales, Negotiat..."
4,4,3790954711,2024-01-21 08:08:19.663033+00,True,True,False,Registered Nurse (RN),Trinity Health MI,"Muskegon, MI",2024-01-14,Muskegon,United States,Nurse Practitioner,Mid senior,Onsite,"Nursing, BSN, Medical License, Virtual RN, Nur..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1292932,1292932,3798038316,2024-01-20 15:21:07.786118+00,True,True,False,Registered Nurse (RN) #CP-RN-7998660 - 2411627...,TravelNurseSource,"Providence, RI",2024-01-14,Fall River,United States,Nurse Supervisor,Mid senior,Onsite,"Registered Nurse, BLS certification, Nursing c..."
1292933,1292933,3805151489,2024-01-20 15:21:10.885264+00,True,True,False,Construction Superintendent,Jobot,"New Iberia, LA",2024-01-15,Lafayette,United States,Assistant Construction Superintendent,Mid senior,Onsite,"Construction management, Project planning, Est..."
1292934,1292934,3739779610,2024-01-21 07:40:00.304641+00,True,True,False,"Executive Chef, Operations Support",NEXDINE Hospitality,"Riverhead, NY",2024-01-14,Eastport,United States,Chef,Mid senior,Onsite,"Culinary, Chef Director, Menu writing, Cycle o..."
1292935,1292935,3802207476,2024-01-21 00:38:39.816821+00,True,True,False,"RN- Registered Nurse, Analyst - - 23934913EXPP...",TravelNurseSource,"Aurora, CO",2024-01-16,Colorado,United States,Occupational Analyst,Mid senior,Onsite,"Registered Nurse, Analyst, RN Registered Nurse..."
