## **PROJECT - NOTEBOOK #4: Merge LinkedIn and USAJOBS Data**

---

### **Setting Environment**

In [1]:
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from datetime import datetime
import re

In [2]:
# Move the working directory up to the project root so the relative data paths
# used in later cells resolve. The directory is printed before and after.
print(os.getcwd())

# The target path is relative, so running this cell a second time would resolve
# it against the project root instead of the notebooks folder. Skip the chdir
# when the current directory is already named project_etl.
if os.path.basename(os.getcwd()) != "project_etl":
    try:
        os.chdir("../../project_etl")
    except FileNotFoundError:
        print("""
        FileNotFoundError - The directory may not exist or you might not be in the specified path.
        If this has already worked, do not run this block again, as the current directory is already set to project_etl.
        """)

print(os.getcwd())

d:\U\FIFTH SEMESTER\ETL\project_etl\notebooks
d:\U\FIFTH SEMESTER\ETL\project_etl


### **Importing Data**

In [3]:
#engine = create_engine('postgresql://root:root@localhost:5432/linkedin')

In [4]:
#df_linkedin = pd.read_sql_table('merge', schema='dimensional_model', con=engine)
# LinkedIn side of the merge, read from the CSV under data_merged/
# (path is relative to the current working directory).
linkedin_csv_path = 'data_merged/merge.csv'
df_linkedin = pd.read_csv(linkedin_csv_path)

In [5]:
# Find the USAJOBS export(s) in the working directory. os.listdir returns names
# in arbitrary order, so sort them to make the choice reproducible between runs.
usajobs_candidates = sorted(
    f for f in os.listdir('.')
    if f.startswith('usajobs_data_') and f.endswith('.csv')
)
if not usajobs_candidates:
    # Fail with an explicit message instead of an IndexError on an empty list.
    raise FileNotFoundError(
        f"No 'usajobs_data_*.csv' file found in {os.getcwd()}"
    )

# First name in sorted order is used.
# NOTE(review): if the suffix is a timestamp this is the oldest export — confirm
# that is the intended file when several are present.
usajobs_file = usajobs_candidates[0]
df_usajobs = pd.read_csv(usajobs_file)

In [6]:
# Schema overview of both inputs before any preprocessing.
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each report.
print("LinkedIn Data Info:")
df_linkedin.info()
print("\nUSAJOBS Data Info:")
df_usajobs.info()

LinkedIn Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      123849 non-null  int64  
 1   company_name                123849 non-null  object 
 2   company_id                  123849 non-null  int64  
 3   views                       122160 non-null  float64
 4   formatted_work_type         123849 non-null  object 
 5   applies                     123849 non-null  int64  
 6   remote_allowed              123849 non-null  int64  
 7   application_type            123849 non-null  object 
 8   formatted_experience_level  123849 non-null  object 
 9   normalized_salary           36073 non-null   float64
 10  len_description             123842 non-null  float64
 11  state_only                  123849 non-null  object 
 12  original_listed_month       123849 non-null  object 

### **Data Preprocessing**

In [7]:
# USAJOBS column name -> name used on the LinkedIn side.
usajobs_to_linkedin_names = {
    'State': 'state_only',
    'NormalisedSalary': 'normalized_salary',
    'TeleworkEligible': 'remote_allowed',
    'JobCategory': 'industry_category',
    # Temporary name; the column is only used to derive month/year and then dropped.
    'PublicationDate': 'original_listed_time',
    'PositionID': 'job_id',
}
df_usajobs = df_usajobs.rename(columns=usajobs_to_linkedin_names)

In [8]:
# Parse the publication timestamp; values that cannot be parsed become NaT.
publication_dt = pd.to_datetime(df_usajobs['original_listed_time'], errors='coerce')

# Derive month name ('Unknown' when missing) and year (0 when missing), then
# discard the temporary timestamp column.
df_usajobs = (
    df_usajobs
    .assign(
        original_listed_month=publication_dt.dt.month_name().fillna('Unknown'),
        original_listed_year=publication_dt.dt.year.fillna(0).astype(int),
    )
    .drop(columns=['original_listed_time'])
)

In [9]:
# Upper-case keys accepted for `state_only`: the 50 two-letter state codes plus
# the two catch-all buckets. Each key maps to its lower-case form.
_state_codes = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE',
    'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS',
    'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS',
    'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
    'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV',
    'WI', 'WY', 'UNKNOWN', 'OTHER',
]
state_map = {code: code.lower() for code in _state_codes}

In [10]:
# Normalise `state_only` in both sources: upper-case the raw value, look it up
# in state_map, and label anything that is not a known key as 'unknown'.
for frame in (df_linkedin, df_usajobs):
    raw_states = frame['state_only'].str.upper()
    frame['state_only'] = raw_states.map(state_map).fillna('unknown')

In [11]:
# Force salaries to numeric in both sources; non-numeric and missing values end up as 0.
for frame in (df_linkedin, df_usajobs):
    numeric_salary = pd.to_numeric(frame['normalized_salary'], errors='coerce')
    frame['normalized_salary'] = numeric_salary.fillna(0)

In [12]:
# Missing industry categories are labelled 'Unknown' in both sources.
for frame in (df_linkedin, df_usajobs):
    frame['industry_category'] = frame['industry_category'].fillna('Unknown')

### **Map JobCategory to industry_category**

In [13]:
# Ordered (keywords, label) groups. Each group becomes one whole-word regex
# alternation; dict order is kept, so earlier groups are tried first.
_keyword_groups = [
    (('manufacturing', 'production', 'fabrication'), 'Manufacturing'),
    (('tech', 'it', 'information', 'computer', 'software', 'internet', 'data'), 'Technology & IT'),
    (('health', 'medical', 'pharma', 'bio', 'dental', 'clinic', 'veterinary'), 'Healthcare & Life Sciences'),
    (('finance', 'bank', 'insurance', 'investment', 'accounting'), 'Finance & Insurance'),
    (('retail', 'e-commerce', 'fashion', 'apparel', 'luxury'), 'Retail & Consumer Goods'),
    (('education', 'e-learning', 'school', 'training', 'academic'), 'Education'),
    (('government', 'public', 'law', 'justice', 'military'), 'Government & Public Sector'),
    (('media', 'entertainment', 'arts', 'sports', 'hospitality', 'travel'), 'Media, Entertainment & Hospitality'),
    (('energy', 'oil', 'gas', 'mining', 'utilities', 'power', 'solar', 'wind'), 'Energy, Mining & Utilities'),
    (('construction', 'real estate', 'architecture', 'engineering'), 'Construction & Real Estate'),
    (('transportation', 'logistics', 'supply chain', 'automotive', 'aerospace'), 'Transportation & Logistics'),
    (('food', 'beverage', 'restaurants', 'catering'), 'Food & Beverage Services'),
    (('non-?profit', 'charity', 'community'), 'Non-Profit & Social Organizations'),
    (('agriculture', 'farming', 'forestry', 'horticulture'), 'Agriculture & Forestry'),
]

category_patterns = {
    r'\b(' + '|'.join(keywords) + r')\b': label
    for keywords, label in _keyword_groups
}
# Last entry is a plain substring match (no word boundaries).
category_patterns[r'other'] = 'Other'

In [14]:
def map_industry_category(category, patterns=None):
    """Map a raw job-category value onto one of the shared industry labels.

    Parameters
    ----------
    category : Any
        Raw category value. Missing values and the literal 'Unknown' map to
        'Unknown'. Other values are converted to text and lower-cased before
        matching, so non-string values (e.g. numeric codes) do not raise.
    patterns : dict[str, str], optional
        Ordered mapping of regex pattern -> industry label. Defaults to the
        notebook-level ``category_patterns``.

    Returns
    -------
    str
        Label of the first pattern that matches (in dict order), or 'Other'
        when no pattern matches.
    """
    if patterns is None:
        patterns = category_patterns

    if pd.isna(category) or category == 'Unknown':
        return 'Unknown'

    # str() guards against non-string cells, which have no .lower().
    text = str(category).lower()
    for pattern, mapped_category in patterns.items():
        if re.search(pattern, text):
            return mapped_category
    return 'Other'

In [15]:
# Replace each raw USAJOBS category with its mapped industry label.
raw_usajobs_categories = df_usajobs['industry_category']
df_usajobs['industry_category'] = raw_usajobs_categories.apply(map_industry_category)

### **Align Columns for Concatenation**

In [16]:
# Target schema, in output order, that both sources are aligned to before concatenation.
final_columns = [
    # identifiers and company attributes
    'job_id',
    'company_id',
    'company_name',
    'company_size',
    'employee_count',
    'follower_count',
    # engagement
    'views',
    'applies',
    # posting attributes
    'formatted_work_type',
    'remote_allowed',
    'application_type',
    'formatted_experience_level',
    'normalized_salary',
    'len_description',
    'state_only',
    'original_listed_month',
    'original_listed_year',
    # benefits, industry, skills
    'has_benefits',
    'benefits_count',
    'industry_category',
    'skills_list',
]

In [17]:
# Columns that default to 0 when LinkedIn lacks them; every other missing
# column defaults to 'Unknown'.
zero_default_columns = {
    'company_id', 'company_size', 'employee_count', 'follower_count', 'views',
    'applies', 'len_description', 'has_benefits', 'benefits_count',
}

absent_columns = [name for name in final_columns if name not in df_linkedin.columns]
for name in absent_columns:
    df_linkedin[name] = 0 if name in zero_default_columns else 'Unknown'

In [18]:
# Start from an empty frame that already has the target columns in their final order.
aligned_schema = list(final_columns)
df_usajobs_aligned = pd.DataFrame(columns=aligned_schema)

In [20]:
# Columns present in the USAJOBS frame are copied across unchanged. They are
# copied before the constants below, with job_id first as in the original cell,
# so the frame takes its rows from df_usajobs before scalars are broadcast.
carried_over = (
    'job_id',
    'remote_allowed',
    'normalized_salary',
    'state_only',
    'original_listed_month',
    'original_listed_year',
    'industry_category',
)
for name in carried_over:
    df_usajobs_aligned[name] = df_usajobs[name]

# Fields with no USAJOBS counterpart get a constant placeholder.
placeholder_values = {
    'company_id': 0,
    'company_name': 'Unknown',  # could be mapped from Organization if needed
    'company_size': 0,
    'employee_count': 0,
    'follower_count': 0,
    'views': 0,
    'applies': 0,
    'formatted_work_type': 'Unknown',
    'application_type': 'Unknown',
    'formatted_experience_level': 'Unknown',
    'len_description': 0,
    'has_benefits': 0,
    'benefits_count': 0,
    'skills_list': '',
}
for name, filler in placeholder_values.items():
    df_usajobs_aligned[name] = filler

In [21]:
# Give job_id the same type (str) in both frames before they are stacked.
for frame in (df_usajobs_aligned, df_linkedin):
    frame['job_id'] = frame['job_id'].astype(str)

### **Concatenate the Datasets**

In [22]:
# Stack LinkedIn rows (restricted to the target columns) on top of the aligned
# USAJOBS rows and renumber the index.
frames_to_stack = [df_linkedin[final_columns], df_usajobs_aligned]
df_merged = pd.concat(frames_to_stack, ignore_index=True)

In [23]:
# Count rows whose job_id already appeared in an earlier row.
duplicate_mask = df_merged.duplicated(subset='job_id')
duplicates = duplicate_mask.sum()
print(f"Number of duplicated job_ids: {duplicates}")

Number of duplicated job_ids: 4


In [24]:
# Keep only the first row for each job_id; later repeats are discarded.
repeated_job_id = df_merged.duplicated(subset=['job_id'], keep='first')
df_merged = df_merged[~repeated_job_id]

### **Fill Missing Values**

### **Verification**

In [27]:
# Sanity checks on the merged frame: schema, remaining nulls, and a sample of rows.
print("=== Merged Data Info ===")
# info() prints its own report and returns None; calling it directly avoids a
# stray "None" line in the output.
df_merged.info()
print("\n=== Missing Values in Merged Data ===")
print(df_merged.isna().sum())
print("\n=== First 5 Rows of Merged Data ===")
print(df_merged.head())

=== Merged Data Info ===
<class 'pandas.core.frame.DataFrame'>
Index: 124731 entries, 0 to 124734
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      124731 non-null  object 
 1   company_id                  124731 non-null  int64  
 2   company_name                124731 non-null  object 
 3   company_size                124731 non-null  int64  
 4   employee_count              124731 non-null  int64  
 5   follower_count              124731 non-null  int64  
 6   views                       124731 non-null  float64
 7   applies                     124731 non-null  int64  
 8   formatted_work_type         124731 non-null  object 
 9   remote_allowed              124731 non-null  bool   
 10  application_type            124731 non-null  object 
 11  formatted_experience_level  124731 non-null  object 
 12  normalized_salary           124731 non-null  float64

### **Save Merged Data**

In [28]:
# Write the merged frame to a timestamped CSV in the working directory
# (name format: linkedin_usajobs_merged_YYYYMMDD_HHMMSS.csv).
timestamp = format(datetime.now(), "%Y%m%d_%H%M%S")
merged_file = "linkedin_usajobs_merged_" + timestamp + ".csv"
df_merged.to_csv(merged_file, index=False)
print(f"Merged data saved to {merged_file}")

Merged data saved to linkedin_usajobs_merged_20250520_120607.csv


In [None]:
# Read the connection URL from the environment so credentials do not have to be
# committed with the notebook. The fallback is the local development URL this
# cell used originally; set LINKEDIN_DB_URL to override it.
db_url = os.environ.get(
    'LINKEDIN_DB_URL',
    'postgresql://root:root@localhost:5432/linkedin',
)
engine = create_engine(db_url)

# if_exists='replace' drops and recreates the table on every run.
df_merged.to_sql(
    'linkedin_usajobs_merged',
    con=engine,
    schema='dimensional_model', #TODO maybe create a new schema for this 
    if_exists='replace',
    index=False
)
print("Merged data saved to dimensional_model.linkedin_usajobs_merged")