# Denormalization Process for Job Posting Collection

In [7]:
import pandas as pd

# CSV file backing each dataset, keyed by the short name used throughout the notebook
file_paths = dict(
    benefits='cleaned_aggregated_benefits.csv',
    job_skills='cleaned_aggregated_job_skills.csv',
    job_industries='cleaned_job_industries.csv',
    job_postings='cleaned_job_postings.csv',
    salaries='cleaned_salaries.csv',
)

# Load every CSV into its own DataFrame, stored under the same short name
dataframes = {}
for dataset_name, csv_path in file_paths.items():
    dataframes[dataset_name] = pd.read_csv(csv_path)

# Preview the top rows of each DataFrame to get a feel for its structure
for dataset_name, frame in dataframes.items():
    print(f"First few rows of {dataset_name} dataset:")
    display(frame.head())
    print("\n")

First few rows of benefits dataset:


Unnamed: 0,job_id,type
0,3958427,['Medical insurance']
1,85008768,"['Medical insurance', 'Vision insurance', 'Den..."
2,133114754,"['Medical insurance', '401(k)', 'Vision insura..."
3,529257371,"['Medical insurance', 'Vision insurance', 'Den..."
4,967848246,['401(k)']




First few rows of job_skills dataset:


Unnamed: 0,job_id,skill_abr
0,3958427,"['DSGN', 'ART', 'IT']"
1,85008768,"['SALE', 'BD']"
2,102339515,"['BD', 'SALE']"
3,108965123,['ADM']
4,133114754,"['SALE', 'BD']"




First few rows of job_industries dataset:


Unnamed: 0,job_id,industry_id
0,3378133231,68
1,3497509795,96
2,3690843087,47
3,3691775263,112
4,3691779379,80




First few rows of job_postings dataset:


Unnamed: 0,job_id,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,...,application_type,expiry,formatted_experience_level,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,scraped
0,3757940104,553718,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,,5250.0,,MONTHLY,Full-time,"Little River, SC",...,OffsiteApply,1701680000000.0,Entry level,1699090000000.0,careers-demant.icims.com,0,FULL_TIME,USD,BASE_SALARY,1699138101
1,3757940025,2192142,Shipping & Receiving Associate 2nd shift (Beav...,Metalcraft of Mayville\nMetalcraft of Mayville...,,,,,Full-time,"Beaver Dam, WI",...,OffsiteApply,1701680000000.0,,1699080000000.0,www.click2apply.net,0,FULL_TIME,,,1699085420
2,3757938019,474443,"Manager, Engineering",\nThe TSUBAKI name is synonymous with excellen...,,,,,Full-time,"Bessemer, AL",...,OffsiteApply,1701680000000.0,,1699080000000.0,www.click2apply.net,0,FULL_TIME,,,1699085644
3,3757938018,18213359,Cook,descriptionTitle\n\n Looking for a great oppor...,,22.27,,HOURLY,Full-time,"Aliso Viejo, CA",...,OffsiteApply,1701680000000.0,Entry level,1699080000000.0,jobs.apploi.com,0,FULL_TIME,USD,BASE_SALARY,1699087461
4,3757937095,437225,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",275834.0,,205956.0,YEARLY,Full-time,United States,...,OffsiteApply,1701680000000.0,Mid-Senior level,1699090000000.0,careers.iherb.com,0,FULL_TIME,USD,BASE_SALARY,1699085346




First few rows of salaries dataset:


Unnamed: 0,salary_id,job_id,max_salary,med_salary,min_salary,pay_period,currency,compensation_type
0,1,3378133231,30.0,,22.0,HOURLY,USD,BASE_SALARY
1,2,3690843087,65000.0,,55000.0,YEARLY,USD,BASE_SALARY
2,3,3691794313,22.0,,19.0,HOURLY,USD,BASE_SALARY
3,4,3691795389,70000.0,,68000.0,YEARLY,USD,BASE_SALARY
4,5,3691797089,22.0,,18.0,HOURLY,USD,BASE_SALARY






The datasets have been loaded successfully. Here's a brief overview of their contents:

Benefits Dataset: Links each job_id to its benefits. Key columns are job_id and type, where type holds the list of benefits offered (for example, ['Medical insurance', '401(k)']).
Job Skills Dataset: Includes job_id and details about required job skills.
Job Industries Dataset: Lists job_id along with the associated industries.
Job Postings Dataset: This seems to be the central dataset, with job_id as a primary key and details like title, description, company, etc.
Salaries Dataset: Contains job_id and salary information.
    
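The job_id column is the common key across all five datasets. To denormalize, we'll join the other four tables onto job_postings using job_id. Because this is an inner join, only postings that appear in every one of the tables are kept in the result.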

In [8]:
# Denormalize: start from job_postings and inner-join every other table onto it via 'job_id'
lookup_tables = [table for key, table in dataframes.items() if key != 'job_postings']

denormalized_df = dataframes['job_postings']  # base table for the joins
for lookup in lookup_tables:
    # Overlapping column names receive pandas' default _x (left) / _y (right) suffixes
    denormalized_df = denormalized_df.merge(lookup, how='inner', on='job_id')

denormalized_df.head()

Unnamed: 0,job_id,company_id,title,description,max_salary_x,med_salary_x,min_salary_x,pay_period_x,formatted_work_type,location,...,type,skill_abr,industry_id,salary_id,max_salary_y,med_salary_y,min_salary_y,pay_period_y,currency_y,compensation_type_y
0,3757940104,553718,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,,5250.0,,MONTHLY,Full-time,"Little River, SC",...,"['Medical insurance', 'Disability insurance']",['OTHR'],17,13493,,5250.0,,MONTHLY,USD,BASE_SALARY
1,3757935381,19181907,Insights Analyst - Auto Industry,Who We Are\n\nEscalent is an award-winning dat...,64000.0,,58000.0,YEARLY,Full-time,United States,...,"['401(k)', 'Vision insurance']","['RSCH', 'ANLS', 'IT']",97,12965,64000.0,,58000.0,YEARLY,USD,BASE_SALARY
2,3757934347,10515052,Body Technician,Company: Gerber Collision & Glass\n\nWELCOME T...,30.0,,20.0,HOURLY,Full-time,"Waukesha, WI",...,"['Vision insurance', '401(k)', 'Disability ins...","['ENG', 'IT']",3198,12138,30.0,,20.0,HOURLY,USD,BASE_SALARY
3,3757934318,6577380,CDL Class B Driver,Hi there! We are looking for punctual drivers ...,27.0,,25.0,HOURLY,Full-time,"Oakland, CA",...,"['Dental insurance', 'Disability insurance', '...","['MGMT', 'MNFC']",92,13064,27.0,,25.0,HOURLY,USD,BASE_SALARY
4,3757934267,28352,CDL A Delivery Driver,Company Description\n\nPerformance Foodservice...,105000.0,,85000.0,YEARLY,Full-time,"Swedesboro, NJ",...,['401(k)'],"['MGMT', 'MNFC']",133,12112,105000.0,,85000.0,YEARLY,USD,BASE_SALARY


In [9]:
# Row and column counts of the denormalized dataset, echoed as a (rows, columns) pair
num_rows = len(denormalized_df.index)
num_columns = len(denormalized_df.columns)
(num_rows, num_columns)

(7558, 36)

The resulting DataFrame contains a combination of columns from all the original datasets, merged on the job_id column.

In [10]:
# Collect every column label of the denormalized dataset into a plain Python list
column_names = list(denormalized_df.columns)
column_names

['job_id',
 'company_id',
 'title',
 'description',
 'max_salary_x',
 'med_salary_x',
 'min_salary_x',
 'pay_period_x',
 'formatted_work_type',
 'location',
 'applies',
 'original_listed_time',
 'remote_allowed',
 'views',
 'job_posting_url',
 'application_url',
 'application_type',
 'expiry',
 'formatted_experience_level',
 'listed_time',
 'posting_domain',
 'sponsored',
 'work_type',
 'currency_x',
 'compensation_type_x',
 'scraped',
 'type',
 'skill_abr',
 'industry_id',
 'salary_id',
 'max_salary_y',
 'med_salary_y',
 'min_salary_y',
 'pay_period_y',
 'currency_y',
 'compensation_type_y']

In [11]:
# Removing the 'pay_period_y', 'currency_y', and 'compensation_type_y' fields.
# The _y suffix marks the right-hand (salaries) copy of a column name that the
# left-hand (job_postings) side of the merge also carries with an _x suffix.
fields_to_remove = ['pay_period_y', 'currency_y', 'compensation_type_y']

# Rebind to a new frame instead of using inplace=True, and tolerate columns that
# are already gone: re-running this cell is then a no-op rather than a KeyError.
denormalized_df = denormalized_df.drop(columns=fields_to_remove, errors='ignore')

# Displaying the updated DataFrame
denormalized_df.head()

Unnamed: 0,job_id,company_id,title,description,max_salary_x,med_salary_x,min_salary_x,pay_period_x,formatted_work_type,location,...,currency_x,compensation_type_x,scraped,type,skill_abr,industry_id,salary_id,max_salary_y,med_salary_y,min_salary_y
0,3757940104,553718,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,,5250.0,,MONTHLY,Full-time,"Little River, SC",...,USD,BASE_SALARY,1699138101,"['Medical insurance', 'Disability insurance']",['OTHR'],17,13493,,5250.0,
1,3757935381,19181907,Insights Analyst - Auto Industry,Who We Are\n\nEscalent is an award-winning dat...,64000.0,,58000.0,YEARLY,Full-time,United States,...,USD,BASE_SALARY,1699134111,"['401(k)', 'Vision insurance']","['RSCH', 'ANLS', 'IT']",97,12965,64000.0,,58000.0
2,3757934347,10515052,Body Technician,Company: Gerber Collision & Glass\n\nWELCOME T...,30.0,,20.0,HOURLY,Full-time,"Waukesha, WI",...,USD,BASE_SALARY,1699088430,"['Vision insurance', '401(k)', 'Disability ins...","['ENG', 'IT']",3198,12138,30.0,,20.0
3,3757934318,6577380,CDL Class B Driver,Hi there! We are looking for punctual drivers ...,27.0,,25.0,HOURLY,Full-time,"Oakland, CA",...,USD,BASE_SALARY,1699134861,"['Dental insurance', 'Disability insurance', '...","['MGMT', 'MNFC']",92,13064,27.0,,25.0
4,3757934267,28352,CDL A Delivery Driver,Company Description\n\nPerformance Foodservice...,105000.0,,85000.0,YEARLY,Full-time,"Swedesboro, NJ",...,USD,BASE_SALARY,1699088206,['401(k)'],"['MGMT', 'MNFC']",133,12112,105000.0,,85000.0


In [12]:
# Snapshot of the column labels that remain after the drop step
current_column_names = [*denormalized_df.columns]
current_column_names

['job_id',
 'company_id',
 'title',
 'description',
 'max_salary_x',
 'med_salary_x',
 'min_salary_x',
 'pay_period_x',
 'formatted_work_type',
 'location',
 'applies',
 'original_listed_time',
 'remote_allowed',
 'views',
 'job_posting_url',
 'application_url',
 'application_type',
 'expiry',
 'formatted_experience_level',
 'listed_time',
 'posting_domain',
 'sponsored',
 'work_type',
 'currency_x',
 'compensation_type_x',
 'scraped',
 'type',
 'skill_abr',
 'industry_id',
 'salary_id',
 'max_salary_y',
 'med_salary_y',
 'min_salary_y']

In [13]:
# Give the 'type' and 'skill_abr' columns self-explanatory names: 'benefits' and 'skills'
column_renames = {'type': 'benefits', 'skill_abr': 'skills'}
denormalized_df = denormalized_df.rename(columns=column_renames)

# Peek at the first rows to confirm the new headers
denormalized_df.head()

Unnamed: 0,job_id,company_id,title,description,max_salary_x,med_salary_x,min_salary_x,pay_period_x,formatted_work_type,location,...,currency_x,compensation_type_x,scraped,benefits,skills,industry_id,salary_id,max_salary_y,med_salary_y,min_salary_y
0,3757940104,553718,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,,5250.0,,MONTHLY,Full-time,"Little River, SC",...,USD,BASE_SALARY,1699138101,"['Medical insurance', 'Disability insurance']",['OTHR'],17,13493,,5250.0,
1,3757935381,19181907,Insights Analyst - Auto Industry,Who We Are\n\nEscalent is an award-winning dat...,64000.0,,58000.0,YEARLY,Full-time,United States,...,USD,BASE_SALARY,1699134111,"['401(k)', 'Vision insurance']","['RSCH', 'ANLS', 'IT']",97,12965,64000.0,,58000.0
2,3757934347,10515052,Body Technician,Company: Gerber Collision & Glass\n\nWELCOME T...,30.0,,20.0,HOURLY,Full-time,"Waukesha, WI",...,USD,BASE_SALARY,1699088430,"['Vision insurance', '401(k)', 'Disability ins...","['ENG', 'IT']",3198,12138,30.0,,20.0
3,3757934318,6577380,CDL Class B Driver,Hi there! We are looking for punctual drivers ...,27.0,,25.0,HOURLY,Full-time,"Oakland, CA",...,USD,BASE_SALARY,1699134861,"['Dental insurance', 'Disability insurance', '...","['MGMT', 'MNFC']",92,13064,27.0,,25.0
4,3757934267,28352,CDL A Delivery Driver,Company Description\n\nPerformance Foodservice...,105000.0,,85000.0,YEARLY,Full-time,"Swedesboro, NJ",...,USD,BASE_SALARY,1699088206,['401(k)'],"['MGMT', 'MNFC']",133,12112,105000.0,,85000.0


In [14]:
# Column labels as they stand once the rename has been applied
current_column_names_after_renaming = [label for label in denormalized_df.columns]
current_column_names_after_renaming

['job_id',
 'company_id',
 'title',
 'description',
 'max_salary_x',
 'med_salary_x',
 'min_salary_x',
 'pay_period_x',
 'formatted_work_type',
 'location',
 'applies',
 'original_listed_time',
 'remote_allowed',
 'views',
 'job_posting_url',
 'application_url',
 'application_type',
 'expiry',
 'formatted_experience_level',
 'listed_time',
 'posting_domain',
 'sponsored',
 'work_type',
 'currency_x',
 'compensation_type_x',
 'scraped',
 'benefits',
 'skills',
 'industry_id',
 'salary_id',
 'max_salary_y',
 'med_salary_y',
 'min_salary_y']

In [18]:
# Persist the denormalized frame as CSV (row index omitted), then echo the file name
denormalized_csv_path = 'denormalized_job_data.csv'
denormalized_df.to_csv(path_or_buf=denormalized_csv_path, index=False)

denormalized_csv_path

'denormalized_job_data.csv'

In [20]:
import re

# Emoji / pictograph code-point ranges, compiled once instead of on every call.
# Emoji ranges: https://www.unicode.org/Public/emoji/13.0/emoji-sequences.txt
#
# The previous catch-all range U+24C2..U+1F251 also matched CJK ideographs,
# Hangul, full-width forms, private-use characters, the BOM and U+FFFD, so any
# such text was reported as "emoji". It is replaced by the specific blocks below.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002B05-\U00002B07"  # large arrows
    "\U00002B1B-\U00002B1C"  # large squares
    "\U00002B50\U00002B55"   # star, heavy circle
    "\U000024C2"             # circled M
    "\U00003030\U0000303D"   # wavy dash, part alternation mark
    "\U00003297\U00003299"   # circled ideographs (congratulation / secret)
    "\U0001F000-\U0001F02F"  # Mahjong tiles
    "\U0001F0A0-\U0001F0FF"  # playing cards
    "\U0001F170-\U0001F19A"  # enclosed alphanumerics used as emoji
    "\U0001F200-\U0001F251"  # Enclosed Ideographic Supplement
    "]+",
    flags=re.UNICODE,
)


# Function to check if a string contains an emoji
def contains_emoji(s):
    """Return True if the string `s` contains at least one emoji character.

    Parameters
    ----------
    s : str
        Text to scan. Non-string input raises TypeError from the regex engine,
        so callers should cast first (e.g. ``Series.astype(str)``).

    Returns
    -------
    bool
        True when any character of `s` falls in one of the emoji ranges of
        ``_EMOJI_PATTERN``; False otherwise, including for the empty string.
    """
    return bool(_EMOJI_PATTERN.search(s))

# For every column, cast its values to text and record whether any cell holds an emoji
emoji_presence = {}
for column in denormalized_df.columns:
    column_as_text = denormalized_df[column].astype(str)
    emoji_presence[column] = any(column_as_text.apply(contains_emoji))

emoji_presence


{'job_id': False,
 'company_id': False,
 'title': True,
 'description': True,
 'max_salary_x': False,
 'med_salary_x': False,
 'min_salary_x': False,
 'pay_period_x': False,
 'formatted_work_type': False,
 'location': False,
 'applies': False,
 'original_listed_time': False,
 'remote_allowed': False,
 'views': False,
 'job_posting_url': False,
 'application_url': False,
 'application_type': False,
 'expiry': False,
 'formatted_experience_level': False,
 'listed_time': False,
 'posting_domain': False,
 'sponsored': False,
 'work_type': False,
 'currency_x': False,
 'compensation_type_x': False,
 'scraped': False,
 'benefits': False,
 'skills': False,
 'industry_id': False,
 'salary_id': False,
 'max_salary_y': False,
 'med_salary_y': False,
 'min_salary_y': False}

In [23]:
# Saving the updated dataset (with emojis removed) to a new CSV file.
#
# No visible earlier cell strips emojis, so on a fresh Restart & Run All this
# file would be identical to the first CSV export. The removal is therefore
# done here, on a copy, so `denormalized_df` itself is left untouched.
# NOTE(review): if emoji removal already happens in a cell not shown here,
# this step is a harmless no-op.
emoji_strip_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U0001F700-\U0001F7FF"  # alchemical symbols, Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FAFF"  # Chess Symbols, Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002B05-\U00002B07"  # large arrows
    "\U00002B1B-\U00002B1C"  # large squares
    "\U00002B50\U00002B55"   # star, heavy circle
    "\U000024C2"             # circled M
    "\U0001F000-\U0001F02F"  # Mahjong tiles
    "\U0001F0A0-\U0001F0FF"  # playing cards
    "\U0001F170-\U0001F19A"  # enclosed alphanumerics used as emoji
    "\U0001F200-\U0001F251"  # Enclosed Ideographic Supplement
    "]+",
    flags=re.UNICODE,
)


def _strip_emoji(value):
    """Return `value` with emoji characters removed; non-strings (e.g. NaN) pass through unchanged."""
    return emoji_strip_pattern.sub('', value) if isinstance(value, str) else value


emoji_free_df = denormalized_df.copy()
for column in emoji_free_df.columns:
    # Numeric columns cannot hold emoji; skipping them also keeps their dtypes intact
    if pd.api.types.is_numeric_dtype(emoji_free_df[column]):
        continue
    emoji_free_df[column] = emoji_free_df[column].map(_strip_emoji)

updated_denormalized_csv_path = 'updated_denormalized_job_data.csv'
emoji_free_df.to_csv(updated_denormalized_csv_path, index=False)

updated_denormalized_csv_path

'updated_denormalized_job_data.csv'

In [25]:
# Final (rows, columns) size of the denormalized dataset
(len(denormalized_df.index), len(denormalized_df.columns))

(7558, 33)

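The denormalized dataset contains 7,558 rows and 33 columns (down from 36 after the salary-side columns pay_period_y, currency_y, and compensation_type_y were dropped).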