## **PROJECT - NOTEBOOK #4: Merge LinkedIn and USAJOBS Data**

---

### **Setting Environment**

In [1]:
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from datetime import datetime
import re
import json
from urllib.request import urlopen

In [2]:
print(os.getcwd())

# Hop to the project root only when we are not already there. This keeps the
# cell idempotent: re-running it no longer attempts a second relative move
# from inside project_etl (which previously relied on the exception path).
if os.path.basename(os.getcwd()) != "project_etl":
    try:
        os.chdir("../../project_etl")
    except FileNotFoundError:
        print("""
        FileNotFoundError - The directory may not exist or you might not be in the specified path.
        If this has already worked, do not run this block again, as the current directory is already set to project_etl.
        """)

print(os.getcwd())

d:\U\FIFTH SEMESTER\ETL\project_etl\notebooks
d:\U\FIFTH SEMESTER\ETL\project_etl


### **Importing Data**

In [3]:
#engine = create_engine('postgresql://root:root@localhost:5432/linkedin')

In [4]:
# Disabled alternative: read the same table straight from PostgreSQL.
#df_linkedin = pd.read_sql_table('merge', schema='dimensional_model', con=engine)
# Path is relative to the working directory set in the environment cell.
linkedin_csv_path = 'data_merged/merge.csv'
df_linkedin = pd.read_csv(linkedin_csv_path)

In [5]:
# Collect every USAJOBS export sitting in the working directory.
# os.listdir() returns entries in arbitrary order, so sort the matches to make
# the selected file reproducible across machines and filesystems.
usajobs_candidates = sorted(
    f for f in os.listdir('.')
    if f.startswith('usajobs_data_') and f.endswith('.csv')
)

# Fail with an explicit message instead of a bare IndexError on `[0]`.
if not usajobs_candidates:
    raise FileNotFoundError(
        f"No 'usajobs_data_*.csv' file found in {os.getcwd()}"
    )

# First match in sorted order is used.
# NOTE(review): if several exports exist and the newest one is wanted, pick
# the last element instead - confirm the intended file.
usajobs_file = usajobs_candidates[0]
df_usajobs = pd.read_csv(usajobs_file)

In [6]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("LinkedIn Data Info:")
df_linkedin.info()
print("\nUSAJOBS Data Info:")
df_usajobs.info()

LinkedIn Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      123849 non-null  int64  
 1   company_name                123849 non-null  object 
 2   company_id                  123849 non-null  int64  
 3   views                       122160 non-null  float64
 4   formatted_work_type         123849 non-null  object 
 5   applies                     123849 non-null  int64  
 6   remote_allowed              123849 non-null  int64  
 7   application_type            123849 non-null  object 
 8   formatted_experience_level  123849 non-null  object 
 9   normalized_salary           36073 non-null   float64
 10  len_description             123842 non-null  float64
 11  state_only                  123849 non-null  object 
 12  original_listed_month       123849 non-null  object 

### **Data Preprocessing**

In [7]:
# Snapshot of the raw state codes before they are normalised.
print("\nUnique values of 'state_only' in df_linkedin before processing:")
linkedin_states_before = df_linkedin['state_only'].unique()
print(linkedin_states_before)


Unique values of 'state_only' in df_linkedin before processing:
['nj' 'co' 'oh' 'ny' 'ia' 'nc' 'other' 'ca' 'ne' 'fl' 'mi' 'mo' 'tn' 'ak'
 'ri' 'al' 'ga' 'tx' 'pa' 'ma' 'az' 'va' 'wa' 'wi' 'hi' 'sd' 'la' 'ut'
 'in' 'mn' 'md' 'ky' 'or' 'nm' 'il' 'mt' 'ok' 'dc' 'ms' 'sc' 'ks' 'nv'
 'ar' 'ct' 'id' 'nh' 'wy' 'me' 'nd' 'de' 'wv' 'vt']


In [8]:
# At this point the USAJOBS column is still called 'State' (it is renamed to
# 'state_only' in a later cell), so the label names the column actually shown.
print("\nUnique values of 'State' in df_usajobs before processing:")
print(df_usajobs['State'].unique())


Unique values of 'state_only' in df_usajobs before processing:
['Virgin Islands' 'California' 'Unknown' 'Arizona' 'Florida' 'Tennessee'
 'Washington' 'South Dakota' 'Minnesota' 'Colorado' 'Montana' 'New York'
 'North Carolina' 'Michigan' 'Pennsylvania' 'Texas' 'Guam' 'New Jersey'
 'Louisiana' 'District of Columbia' 'Virginia' 'Alaska' 'New Hampshire'
 'Missouri' 'Illinois' 'Maryland' 'West Virginia' 'Maine' 'Oregon'
 'North Dakota' 'Hawaii' 'Nevada' 'Wyoming' 'Georgia' 'New Mexico'
 'Massachusetts' 'Nebraska' 'South Carolina' 'Oklahoma' 'Arkansas'
 'Kansas' 'Connecticut' 'Rhode Island' 'Alabama' 'Delaware' 'Ohio'
 'Puerto Rico' 'Mississippi' 'Kentucky' 'Wisconsin' 'Utah' 'Indiana'
 'Vermont' 'Idaho' 'Northern Mariana Islands' 'Iowa' 'American Samoa']


In [9]:
# Remote JSON list of records carrying 'name' and 'abbreviation' keys
# (presumably US states and territories - confirm against the gist).
url = 'https://gist.githubusercontent.com/mshafrir/2646763/raw/states_titlecase.json'

# Use a context manager so the HTTP response is always closed, and a timeout
# so a stalled connection fails instead of hanging the notebook.
with urlopen(url, timeout=30) as response:
    state_list = json.load(response)

# abbreviation -> abbreviation: used purely for membership tests on codes.
abbr_map = {item['abbreviation']: item['abbreviation'] for item in state_list}
# lower-cased full name -> abbreviation.
name_map = {item['name'].lower(): item['abbreviation'] for item in state_list}

In [10]:
def extract_state(loc: str) -> str:
    """Return the state abbreviation found in a location string.

    The string is split on commas and the fragments are inspected from right
    to left. A fragment matches when its upper-cased form is a key of
    ``abbr_map`` (the upper-cased fragment is returned) or when its
    lower-cased form is a key of ``name_map`` (the mapped abbreviation is
    returned).

    Args:
        loc: Location text such as a bare code, a full name, or a
            comma-separated combination of both.

    Returns:
        The matched abbreviation, or ``'OTHER'`` when ``loc`` is not a string
        (None, NaN, pd.NA, numbers, list-likes, ...) or no fragment matches.
    """
    # The type check alone is the gate. Every missing-value marker is a
    # non-string, and calling pd.isna() first would raise ValueError for
    # list-like inputs because the resulting array has no single truth value.
    if not isinstance(loc, str):
        return 'OTHER'

    for frag in reversed(loc.split(',')):
        frag = frag.strip()

        code = frag.upper()
        if code in abbr_map:
            return code

        name = frag.lower()
        if name in name_map:
            return name_map[name]

    return 'OTHER'

In [11]:
# LinkedIn already exposes a 'state_only' column: normalise its values.
linkedin_states = df_linkedin['state_only']
df_linkedin['state_only'] = linkedin_states.apply(extract_state)

# USAJOBS calls the column 'State': align the name first, then normalise.
df_usajobs = df_usajobs.rename(columns={'State': 'state_only'})
usajobs_states = df_usajobs['state_only']
df_usajobs['state_only'] = usajobs_states.apply(extract_state)

In [24]:
# List the USAJOBS column names once the state column has been handled.
print("\nColumns in df_usajobs after state processing:")
usajobs_columns = df_usajobs.columns.tolist()
print(usajobs_columns)


Columns in df_usajobs after state processing:
['PositionID', 'PositionTitle', 'PositionURI', 'Location', 'City', 'state_only', 'Country', 'Latitude', 'Longitude', 'Organization', 'Department', 'MinSalary', 'MaxSalary', 'SalaryInterval', 'industry_category', 'JobGrade', 'Schedule', 'OfferingType', 'StartDate', 'EndDate', 'CloseDate', 'remote_allowed', 'SecurityClearance', 'PromotionPotential', 'TravelCode', 'HiringPath', 'TotalOpenings', 'normalized_salary', 'original_listed_month', 'original_listed_year']


In [13]:
# USAJOBS column name -> name used by the shared schema.
# rename() leaves the frame untouched for keys that are not present.
usajobs_column_aliases = {
    'PositionID': 'job_id',
    'NormalisedSalary': 'normalized_salary',
    'TeleworkEligible': 'remote_allowed',
    'JobCategory': 'industry_category',
    'PublicationDate': 'original_listed_time',
}
df_usajobs = df_usajobs.rename(columns=usajobs_column_aliases)

In [14]:
# 'original_listed_time' only exists when the export carried a
# 'PublicationDate' field (renamed in the previous cell). Exports that already
# ship 'original_listed_month' / 'original_listed_year' have no such column,
# and indexing it unconditionally would raise KeyError.
if 'original_listed_time' in df_usajobs.columns:
    # Parse once and reuse; unparseable values become NaT.
    listed_at = pd.to_datetime(df_usajobs['original_listed_time'], errors='coerce')
    df_usajobs['original_listed_month'] = listed_at.dt.month_name().fillna('OTHER')
    df_usajobs['original_listed_year'] = listed_at.dt.year.fillna(0).astype(int)
    df_usajobs = df_usajobs.drop(columns=['original_listed_time'])  # Clean up temporary column
else:
    # Nothing to derive: make sure the pre-computed columns are really there
    # so a schema problem surfaces here rather than several cells later.
    missing_listed_columns = (
        {'original_listed_month', 'original_listed_year'} - set(df_usajobs.columns)
    )
    if missing_listed_columns:
        raise KeyError(
            "df_usajobs has no 'original_listed_time' column and is also missing "
            f"{sorted(missing_listed_columns)}"
        )

In [15]:
# Same treatment for both sources: force numeric, then replace anything that
# could not be parsed (or was missing) with 0.
for frame in (df_linkedin, df_usajobs):
    salary_numeric = pd.to_numeric(frame['normalized_salary'], errors='coerce')
    frame['normalized_salary'] = salary_numeric.fillna(0)

In [16]:
# Missing industry labels fall back to the 'OTHER' bucket in both sources.
for frame in (df_linkedin, df_usajobs):
    frame['industry_category'] = frame['industry_category'].fillna('OTHER')

### **Map JobCategory to industry_category**

In [17]:
# Ordered (regex, label) rules. Order matters: the consumer walks the mapping
# in insertion order and stops at the first regex that matches.
_category_rules = [
    (r'\b(manufacturing|production|fabrication)\b', 'Manufacturing'),
    (r'\b(tech|it|information|computer|software|internet|data)\b', 'Technology & IT'),
    (r'\b(health|medical|pharma|bio|dental|clinic|veterinary)\b', 'Healthcare & Life Sciences'),
    (r'\b(finance|bank|insurance|investment|accounting)\b', 'Finance & Insurance'),
    (r'\b(retail|e-commerce|fashion|apparel|luxury)\b', 'Retail & Consumer Goods'),
    (r'\b(education|e-learning|school|training|academic)\b', 'Education'),
    (r'\b(government|public|law|justice|military)\b', 'Government & Public Sector'),
    (r'\b(media|entertainment|arts|sports|hospitality|travel)\b', 'Media, Entertainment & Hospitality'),
    (r'\b(energy|oil|gas|mining|utilities|power|solar|wind)\b', 'Energy, Mining & Utilities'),
    (r'\b(construction|real estate|architecture|engineering)\b', 'Construction & Real Estate'),
    (r'\b(transportation|logistics|supply chain|automotive|aerospace)\b', 'Transportation & Logistics'),
    (r'\b(food|beverage|restaurants|catering)\b', 'Food & Beverage Services'),
    (r'\b(non-?profit|charity|community)\b', 'Non-Profit & Social Organizations'),
    (r'\b(agriculture|farming|forestry|horticulture)\b', 'Agriculture & Forestry'),
    # Catch-all: any text containing "other" (no word boundaries on this one).
    (r'other', 'OTHER'),
]

# dict() preserves the rule order above.
category_patterns = dict(_category_rules)

In [18]:
def map_industry_category(category):
    """Map a free-text category onto one of the labels in ``category_patterns``.

    The text is lower-cased and tested against each regex of
    ``category_patterns`` in insertion order; the label of the first regex
    that matches is returned.

    Args:
        category: Raw category value, normally a string.

    Returns:
        The mapped label, or ``'OTHER'`` when ``category`` is not a string
        (None, NaN, pd.NA, numeric codes, list-likes, ...), is already
        ``'OTHER'``, or matches no pattern.
    """
    # Non-strings cannot be lower-cased or regex-searched. Treat them as
    # unclassifiable instead of raising AttributeError (numbers) or
    # ValueError (pd.isna on list-likes). Missing markers are non-strings too.
    if not isinstance(category, str) or category == 'OTHER':
        return 'OTHER'

    lowered = category.lower()
    for pattern, mapped_category in category_patterns.items():
        if re.search(pattern, lowered):
            return mapped_category
    return 'OTHER'

In [19]:
# Replace each raw USAJOBS category with its mapped industry label.
usajobs_categories = df_usajobs['industry_category']
df_usajobs['industry_category'] = usajobs_categories.apply(map_industry_category)

### **Align Columns for Concatenation**

In [20]:
# Target schema of the merged dataset; the order here is the output column order.
final_columns = [
    # identifiers / company
    'job_id',
    'company_id',
    'company_name',
    'company_size',
    'employee_count',
    'follower_count',
    # engagement
    'views',
    'applies',
    # posting attributes
    'formatted_work_type',
    'remote_allowed',
    'application_type',
    'formatted_experience_level',
    'normalized_salary',
    'len_description',
    'state_only',
    'original_listed_month',
    'original_listed_year',
    # benefits / classification
    'has_benefits',
    'benefits_count',
    'industry_category',
    'skills_list',
]

In [21]:
# Columns that receive 0 when absent; every other absent column gets 'Unknown'.
numeric_placeholder_columns = {
    'company_id', 'company_size', 'employee_count', 'follower_count',
    'views', 'applies', 'len_description', 'has_benefits', 'benefits_count',
}

for col in final_columns:
    # Existing LinkedIn columns are left exactly as they are.
    if col in df_linkedin.columns:
        continue
    df_linkedin[col] = 0 if col in numeric_placeholder_columns else 'Unknown'

In [22]:
# Empty frame whose columns are exactly `final_columns`, in that order.
df_usajobs_aligned = pd.DataFrame(columns=final_columns)

In [23]:
# Columns present in the USAJOBS frame are copied across unchanged.
# 'job_id' stays first, as in the straight-line version of this cell.
copied_columns = [
    'job_id',
    'remote_allowed',
    'normalized_salary',
    'state_only',
    'original_listed_month',
    'original_listed_year',
    'industry_category',
]
for column in copied_columns:
    df_usajobs_aligned[column] = df_usajobs[column]

# Fields with no USAJOBS counterpart receive a constant placeholder.
placeholder_values = {
    'company_id': 0,  # Not available in df_usajobs
    'company_name': 'Unknown',  # Not directly available, could map from Organization if needed
    'company_size': 0,
    'employee_count': 0,
    'follower_count': 0,
    'views': 0,
    'applies': 0,
    'formatted_work_type': 'Unknown',  # Not available, set default
    'application_type': 'Unknown',  # Not available, set default
    'formatted_experience_level': 'Unknown',  # Not available, set default
    'len_description': 0,  # Not available
    'has_benefits': 0,
    'benefits_count': 0,
    'skills_list': '',  # Not available
}
for column, placeholder in placeholder_values.items():
    df_usajobs_aligned[column] = placeholder

KeyError: 'job_id'

In [21]:
# Give 'job_id' the same (string) dtype in both frames before they are stacked.
for frame in (df_usajobs_aligned, df_linkedin):
    frame['job_id'] = frame['job_id'].astype(str)

### **Concatenate the Datasets**

In [None]:
# LinkedIn rows (restricted to the shared schema) first, USAJOBS rows after;
# the result gets a fresh 0..n-1 index.
frames_to_stack = [df_linkedin[final_columns], df_usajobs_aligned]
df_merged = pd.concat(frames_to_stack, ignore_index=True)

In [None]:
# Flag every repeat of a job_id after its first appearance, then count the flags.
duplicate_mask = df_merged['job_id'].duplicated()
duplicates = duplicate_mask.sum()
print(f"Number of duplicated job_ids: {duplicates}")

In [25]:
# Keep only the first row seen for each job_id; later rows with the same id are dropped.
df_merged = df_merged.drop_duplicates(subset=['job_id'], keep='first')

### **Verification**

In [None]:
# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(df_merged.info()) would append a stray "None" line.
print("=== Merged Data Info ===")
df_merged.info()
print("\n=== Missing Values in Merged Data ===")
print(df_merged.isna().sum())
print("\n=== First 5 Rows of Merged Data ===")
print(df_merged.head())

### **Save Merged Data**

In [1]:
# Timestamped file name so successive runs never overwrite each other.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
merged_file = f"data/merged/linkedin_usajobs_merged_{timestamp}.csv"

# to_csv() does not create missing parent folders; create the target
# directory up front so a fresh checkout does not fail with OSError.
os.makedirs(os.path.dirname(merged_file), exist_ok=True)

df_merged.to_csv(merged_file, index=False)
print(f"Merged data saved to {merged_file}")

NameError: name 'datetime' is not defined

In [None]:
#engine = create_engine('postgresql://root:root@localhost:5432/linkedin')

#df_merged.to_sql(
#    'linkedin_usajobs_merged',
#    con=engine,
#    schema='dimensional_model', #TODO maybe create a new schema for this 
#    if_exists='replace',
#    index=False
#)
#print("Merged data saved to dimensional_model.linkedin_usajobs_merged")