In [2]:
import os
import re

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [3]:
# Take the connection string from the LINKEDIN_DB_URL environment variable so
# real credentials do not have to be stored in the notebook. When the variable
# is unset, the original local-development URL is used unchanged.
engine = create_engine(
    os.environ.get(
        'LINKEDIN_DB_URL',
        'postgresql://root:root@localhost:5432/linkedin',
    )
)

In [4]:
# Show floats with exactly two decimals whenever pandas renders them.
pd.options.display.float_format = '{:.2f}'.format

In [5]:
# Pull the whole raw.postings table into a DataFrame.
df = pd.read_sql_table(table_name='postings', con=engine, schema='raw')

In [6]:
# Columns removed from the postings frame; `description` is summarised by its
# character length (len_description) before it is dropped.
to_drop = [
    'description', 'job_posting_url', 'application_url', 'posting_domain',
    'compensation_type', 'fips', 'work_type', 'sponsored', 'listed_time',
    'expiry', 'closed_time', 'skills_desc', 'title',
]
df = (
    df
    .assign(len_description=df['description'].str.len())
    .drop(columns=to_drop)
)


In [7]:
import json
from urllib.request import urlopen
# Remote JSON list whose items expose a 'name' and an 'abbreviation' key.
url = 'https://gist.githubusercontent.com/mshafrir/2646763/raw/states_titlecase.json'
# Use the response as a context manager so the connection is always closed,
# and bound the wait so a stalled download cannot hang the notebook.
with urlopen(url, timeout=30) as response:
    state_list = json.load(response)
# abbreviation -> abbreviation (membership test for codes already in short form)
abbr_map = {item['abbreviation']: item['abbreviation'] for item in state_list}
# lower-cased full name -> abbreviation
name_map = {item['name'].lower(): item['abbreviation'] for item in state_list}

def extract_state(loc: object, abbr_lookup=None, name_lookup=None) -> str:
    """Return the state abbreviation found in a comma-separated location string.

    The comma-separated fragments are scanned from last to first; the first
    fragment that is a known abbreviation (case-insensitive) or a known full
    name (case-insensitive) decides the result.

    Args:
        loc: Location text. Anything that is not a ``str`` (``None``, ``NaN``,
            numbers, lists, ...) is treated as missing.
        abbr_lookup: Optional container of upper-case abbreviations. Defaults
            to the module-level ``abbr_map``.
        name_lookup: Optional mapping of lower-case names to abbreviations.
            Defaults to the module-level ``name_map``.

    Returns:
        The matched abbreviation, ``'UNKNOWN'`` when ``loc`` is not a string,
        or ``'OTHER'`` when no fragment matches.
    """
    # A single type check covers None/NaN and also array-like values, for
    # which `pd.isna(...)` followed by `or` would raise an ambiguous-truth error.
    if not isinstance(loc, str):
        return 'UNKNOWN'

    if abbr_lookup is None:
        abbr_lookup = abbr_map
    if name_lookup is None:
        name_lookup = name_map

    # The last fragments are checked first, so 'City, ST, Country' resolves
    # on 'ST' before 'City' is ever considered.
    for frag in reversed(loc.split(',')):
        frag = frag.strip()
        code = frag.upper()
        if code in abbr_lookup:
            return code
        name = frag.lower()
        if name in name_lookup:
            return name_lookup[name]
    return 'OTHER'

# Derive state_only from the free-text location, then discard the source column.
df = (
    df
    .assign(state_only=df['location'].apply(extract_state))
    .drop(columns=['location'])
)


In [8]:
# Replace missing values with explicit defaults and pin the integer dtypes:
# counts/flags default to 0, labels to 'Unknown', and company_id to -1.
df = df.assign(
    applies=df['applies'].fillna(0).astype(int),
    company_name=df['company_name'].fillna('Unknown'),
    formatted_experience_level=df['formatted_experience_level'].fillna('Unknown'),
    remote_allowed=df['remote_allowed'].fillna(0).astype('Int8'),
    company_id=df['company_id'].fillna(-1).astype(int),
)

In [9]:
# original_listed_time is parsed as a millisecond epoch; only the month name
# and the year are kept.
dt = pd.to_datetime(df['original_listed_time'], unit='ms')
df = (
    df
    .assign(
        original_listed_month=dt.dt.month_name(),
        original_listed_year=dt.dt.year,
    )
    .drop(columns=['original_listed_time', 'zip_code'])
)

In [10]:
# Multipliers applied to salary amounts per currency code.
# NOTE(review): presumably USD per unit of the listed currency — confirm the
# source and date of these rates.
exchange_rates = {
    'USD': 1.0, 'EUR': 1.10, 'CAD': 0.75,
    'BBD': 0.50, 'AUD': 0.65, 'GBP': 1.25,
}
salary_cols = ['min_salary', 'med_salary', 'max_salary']

# Coerce to numeric BEFORE any arithmetic: multiplying first fails (or gives
# nonsense) when a salary column arrives as text. Unparseable values become NaN.
df[salary_cols] = df[salary_cols].apply(pd.to_numeric, errors='coerce')

# The per-row rate is identical for all three columns, so compute it once.
# Currencies missing from exchange_rates fall back to a multiplier of 1.0.
rate = df['currency'].map(exchange_rates).fillna(1.0)
df[salary_cols] = df[salary_cols].mul(rate, axis=0)
df = df.drop(columns=['currency'])

# Number of pay periods per year used to annualise each amount
# (pay periods not listed here yield NaN).
factor = {'HOURLY': 2080, 'MONTHLY': 12, 'WEEKLY': 52, 'BIWEEKLY': 26, 'YEARLY': 1}
periods_per_year = df['pay_period'].map(factor)

df['min_salary_annual'] = df['min_salary'] * periods_per_year
df['max_salary_annual'] = df['max_salary'] * periods_per_year
df['med_salary_annual'] = df['med_salary'] * periods_per_year

# Row-wise mean of whichever annualised figures are present (NaN is skipped).
df['normalized_salary'] = df[
    ['min_salary_annual', 'med_salary_annual', 'max_salary_annual']
].mean(axis=1)

# Clip values to the Tukey fences (1.5 * IQR), with the lower fence floored at 0.
q1, q3 = df['normalized_salary'].quantile([0.25, 0.75])
low = max(q1 - 1.5 * (q3 - q1), 0)
high = q3 + 1.5 * (q3 - q1)
df['normalized_salary'] = df['normalized_salary'].clip(low, high)

# Only normalized_salary survives; every intermediate salary column is removed.
to_drop_salary = [
    'min_salary', 'med_salary', 'max_salary',
    'min_salary_annual', 'med_salary_annual', 'max_salary_annual',
    'pay_period'
]
df = df.drop(columns=to_drop_salary)


In [11]:
# Persist the cleaned postings, replacing any previous cleaned.postings table.
df.to_sql(name='postings', con=engine, schema='cleaned', if_exists='replace', index=False)

849

In [12]:
# Load raw.benefits in full.
df_benefits = pd.read_sql_table(table_name='benefits', con=engine, schema='raw')

In [13]:
# Keep only benefit rows whose job_id exists in the postings frame.
df_benefits = df_benefits.loc[df_benefits['job_id'].isin(df['job_id'])]

In [14]:
# Normalise the benefit label (trim + lower-case), then drop rows missing a
# key field and collapse repeated (job_id, type) pairs.
df_benefits = (
    df_benefits
    .assign(type=df_benefits['type'].str.strip().str.lower())
    .dropna(subset=['job_id', 'type'])
    .drop_duplicates(subset=['job_id', 'type'])
)

In [15]:
# One row per job: the number of benefit rows for it, plus a 0/1 flag that is
# 1 whenever that count is positive.
df_benefits = (
    df_benefits
    .groupby('job_id', as_index=False)
    .agg(benefits_count=('type', 'size'))
    .assign(has_benefits=lambda counts: (counts['benefits_count'] > 0).astype(int))
)

In [16]:
# Persist the per-job benefit summary, replacing cleaned.benefits if present.
df_benefits.to_sql(name='benefits', con=engine, schema='cleaned', if_exists='replace', index=False)

735

In [17]:
# Load raw.job_industries in full.
df_job_industries = pd.read_sql_table(table_name='job_industries', con=engine, schema='raw')

In [18]:
# Keep only rows whose job_id exists in the postings frame.
df_job_industries = df_job_industries.loc[df_job_industries['job_id'].isin(df['job_id'])]

In [19]:
# Remove fully identical rows, then renumber the index from zero.
df_job_industries = df_job_industries.drop_duplicates()
df_job_industries = df_job_industries.reset_index(drop=True)

In [20]:
# Only the industry_id column of raw.industries is needed here; it acts as the
# list of valid ids that job_industries rows must reference.
df_industries = pd.read_sql_table(table_name='industries', con=engine, schema='raw')[['industry_id']]
df_job_industries = df_job_industries.loc[
    df_job_industries['industry_id'].isin(df_industries['industry_id'])
]

In [21]:
# Persist the filtered job/industry links, replacing cleaned.job_industries.
df_job_industries.to_sql(name='job_industries', con=engine, schema='cleaned', if_exists='replace', index=False)

963

In [22]:
# Load raw.job_skills in full.
df_job_skills = pd.read_sql_table(table_name='job_skills', con=engine, schema='raw')

In [23]:
# Keep only rows whose job_id exists in the postings frame.
df_job_skills = df_job_skills.loc[df_job_skills['job_id'].isin(df['job_id'])]

In [24]:
# Normalise the skill code (trim + upper-case), drop rows missing a key field,
# collapse repeated (job_id, skill_abr) pairs and renumber the index.
df_job_skills = (
    df_job_skills
    .assign(skill_abr=df_job_skills['skill_abr'].str.strip().str.upper())
    .dropna(subset=['job_id', 'skill_abr'])
    .drop_duplicates(subset=['job_id', 'skill_abr'])
    .reset_index(drop=True)
)

In [25]:
# Persist the cleaned job/skill links, replacing cleaned.job_skills.
df_job_skills.to_sql(name='job_skills', con=engine, schema='cleaned', if_exists='replace', index=False)

778

In [26]:
# Load raw.companies in full.
df_companies = pd.read_sql_table(table_name='companies', con=engine, schema='raw')

In [27]:
# Narrow the companies frame to the wanted columns, silently skipping any that
# the table does not have; the order of columns_to_keep is preserved.
columns_to_keep = ['company_id', 'name', 'company_size']
columns_from_table = df_companies.columns.tolist()
actual_columns_to_keep = list(filter(columns_from_table.__contains__, columns_to_keep))
df_companies = df_companies[actual_columns_to_keep]

In [28]:
# Tidy whichever of the retained company columns are actually present.
available = set(df_companies.columns)

if 'name' in available:
    # Missing names become 'Unknown'; whitespace is trimmed and text title-cased.
    cleaned_names = df_companies['name'].fillna('Unknown').str.strip().str.title()
    df_companies = df_companies.assign(name=cleaned_names)

if 'company_size' in available:
    # Missing sizes become 0 and the column is stored as nullable Int8.
    size_codes = df_companies['company_size'].fillna(0).astype('Int8')
    df_companies = df_companies.assign(company_size=size_codes)

if 'company_id' in available:
    # Rows without an id are discarded before the integer cast; afterwards
    # only the first row per company_id is kept and the index is renumbered.
    df_companies = df_companies.dropna(subset=['company_id'])
    df_companies = (
        df_companies
        .assign(company_id=df_companies['company_id'].astype(int))
        .drop_duplicates(subset=['company_id'])
        .reset_index(drop=True)
    )

In [29]:
# Persist the cleaned companies, replacing cleaned.companies if present.
df_companies.to_sql(name='companies', con=engine, schema='cleaned', if_exists='replace', index=False)

473

In [30]:
# Load raw.employee_counts in full.
df_emp_counts = pd.read_sql_table(table_name='employee_counts', con=engine, schema='raw')

In [31]:
# Keep exactly one row per company. Rows are ordered by time_recorded first so
# the retained row is the one with the greatest time_recorded value, instead
# of whichever row the database happened to return first. Rows with a missing
# time_recorded sort to the front so they are only kept when nothing else exists.
# NOTE(review): assumes time_recorded is a sortable timestamp — confirm.
df_emp_counts = (
    df_emp_counts
    .sort_values('time_recorded', kind='stable', na_position='first')
    .drop_duplicates(subset='company_id', keep='last')
    .drop(columns='time_recorded')
)

In [32]:
# Persist one row per company, replacing cleaned.employee_counts if present.
df_emp_counts.to_sql(name='employee_counts', con=engine, schema='cleaned', if_exists='replace', index=False)

473

In [33]:
# Reload raw.industries with all of its columns.
df_industries = pd.read_sql_table(table_name='industries', con=engine, schema='raw')

In [34]:
# Missing names become 'Unknown'; whitespace is trimmed and text title-cased.
df_industries = df_industries.assign(
    industry_name=df_industries['industry_name'].fillna('Unknown').str.strip().str.title()
)

In [35]:
# Keep the first row for each industry_id and renumber the index from zero.
df_industries = df_industries.drop_duplicates(subset=['industry_id'])
df_industries = df_industries.reset_index(drop=True)

In [36]:
# ────────────── Cell 4: Pattern-based categorization ──────────────
import re

# Ordered (regex, category) rules: the FIRST pattern that matches wins, so more
# specific buckets must stay above more general ones. Every keyword is matched
# as a whole word. Because of that, truncated stems ('tech', 'pharma', 'bio',
# 'bank', 'finance') can never match their full words, so the full words
# ('technology', 'pharmaceutical', 'biotechnology', 'banking', 'financial', ...)
# are listed explicitly next to them.
patterns = [
    (r'\b(manufacturing|production|fabrication)\b',               'Manufacturing'),
    (r'\b(tech|technology|technologies|it|information|computer|software|internet|data)\b',
                                                                  'Technology & IT'),
    (r'\b(health|medical|pharma|pharmaceutical|pharmaceuticals|bio|biotech|biotechnology'
     r'|dental|clinic|veterinary)\b',                             'Healthcare & Life Sciences'),
    (r'\b(finance|financial|bank|banks|banking|insurance|investment|accounting)\b',
                                                                  'Finance & Insurance'),
    (r'\b(retail|e-commerce|fashion|apparel|luxury)\b',            'Retail & Consumer Goods'),
    (r'\b(education|e-learning|school|training|academic)\b',       'Education'),
    (r'\b(government|public|law|justice|military)\b',              'Government & Public Sector'),
    (r'\b(media|entertainment|arts|sports|hospitality|travel)\b',  'Media, Entertainment & Hospitality'),
    (r'\b(energy|oil|gas|mining|utilities|power|solar|wind)\b',     'Energy, Mining & Utilities'),
    (r'\b(construction|real estate|architecture|engineering)\b',   'Construction & Real Estate'),
    (r'\b(transportation|logistics|supply chain|automotive|aerospace)\b','Transportation & Logistics'),
    (r'\b(food|beverage|restaurants|catering)\b',                  'Food & Beverage Services'),
    (r'\b(non-?profit|charity|community)\b',                       'Non-Profit & Social Organizations'),
    (r'\b(agriculture|farming|forestry|horticulture)\b',            'Agriculture & Forestry'),
    (r'other',                                                    'Other'),
]

def categorize(name: str) -> str:
    """Map an industry name to a coarse category using ``patterns``.

    Args:
        name: Industry name; matching is case-insensitive. Values that are not
            strings (e.g. ``None`` or ``NaN``) are treated as unmatched.

    Returns:
        The category of the first matching pattern, or ``'Other'`` when no
        pattern matches.
    """
    # Guard against missing values so a stray null cannot abort the whole apply.
    if not isinstance(name, str):
        return 'Other'
    nl = name.lower()
    for pat, cat in patterns:
        if re.search(pat, nl):
            return cat
    return 'Other'

# Attach the coarse category derived from each industry name.
df_industries = df_industries.assign(
    industry_category=df_industries['industry_name'].apply(categorize)
)


In [37]:
# The raw name is dropped now that the category exists; the result replaces
# cleaned.industries.
df_industries = df_industries.drop(columns='industry_name')

df_industries.to_sql(name='industries', con=engine, schema='cleaned', if_exists='replace', index=False)

422

In [38]:
# Build the skill lookup: codes are trimmed and upper-cased, names trimmed and
# title-cased, and only the first row per code is kept.
df_skills = pd.read_sql_table(table_name='skills', con=engine, schema='raw')
df_skills = df_skills.assign(
    skill_abr=df_skills['skill_abr'].str.strip().str.upper(),
    skill_name=df_skills['skill_name'].str.strip().str.title(),
)
df_skills = df_skills.drop_duplicates(subset=['skill_abr'])
df_skills = df_skills.reset_index(drop=True)

# skill code -> display name
skill_map = df_skills.set_index('skill_abr')['skill_name'].to_dict()

df_skills.to_sql(name='skills_lookup', con=engine, schema='cleaned', if_exists='replace', index=False)

35

In [39]:
df_js = pd.read_sql_table('job_skills', schema='cleaned', con=engine)

# Translate each skill code to its display name first (unknown codes are kept
# as-is), then join the names per job with commas. Mapping before joining
# avoids serialising the codes to a comma-separated string and splitting them
# again, which mis-splits any code that itself contains a comma.
df_job_skills_list = (
    df_js
    .assign(skills_list=df_js['skill_abr'].map(lambda code: skill_map.get(code, code)))
    .groupby('job_id')['skills_list']
    .agg(','.join)
    .reset_index()
)

df_job_skills_list.to_sql(
    'job_skills_list',
    con=engine,
    schema='cleaned',
    if_exists='replace',
    index=False
)


96

In [40]:
df_ji = pd.read_sql_table(table_name='job_industries', con=engine, schema='cleaned')
df_ind = pd.read_sql_table(table_name='industries', con=engine, schema='cleaned')

# Attach the category to every job/industry link, then keep only the first
# link encountered for each job so the result has one row per job_id.
df_job_industry_cat = df_ji.merge(
    df_ind[['industry_id', 'industry_category']],
    how='left',
    on='industry_id',
)
df_job_industry_cat = df_job_industry_cat.drop_duplicates(subset='job_id', keep='first')
df_job_industry_cat = df_job_industry_cat.reset_index(drop=True)

df_job_industry_cat.to_sql(name='job_industries_category', con=engine, schema='cleaned', if_exists='replace', index=False)


413

In [41]:
# Reload every cleaned table needed for the final merge, in the same order as
# the target variable names on the left.
(
    df,
    df_benefits,
    df_ind_cat,
    df_skills_list,
    df_companies,
    df_emp_counts,
) = (
    pd.read_sql_table(table, schema='cleaned', con=engine)
    for table in (
        'postings',
        'benefits',
        'job_industries_category',
        'job_skills_list',
        'companies',
        'employee_counts',
    )
)


In [42]:
# Enrich each posting with its per-job and per-company attributes.
# Every join is a left join, so no posting is lost. validate='m:1' makes pandas
# raise if a lookup table has duplicate keys, which would otherwise silently
# duplicate posting rows.
df_merged = (
    df
    .merge(
        df_benefits[['job_id', 'has_benefits', 'benefits_count']],
        on='job_id', how='left', validate='m:1'
    )
    .merge(
        df_ind_cat[['job_id', 'industry_category']],
        on='job_id', how='left', validate='m:1'
    )
    .merge(
        df_skills_list[['job_id', 'skills_list']],
        on='job_id', how='left', validate='m:1'
    )
    .merge(
        df_companies[['company_id', 'company_size']],
        on='company_id', how='left', validate='m:1'
    )
    .merge(
        df_emp_counts[['company_id', 'employee_count', 'follower_count']],
        on='company_id', how='left', validate='m:1'
    )
    # Postings without a match get explicit defaults instead of NaN.
    .fillna({
        'has_benefits': 0,
        'benefits_count': 0,
        'industry_category': 'Unknown',
        'skills_list': '',
        'company_size': 0,
        'employee_count': 0,
        'follower_count': 0,
    })
    # A left join turns integer columns into floats when NaN appears; cast all
    # filled counters back to int (the benefit columns were previously left as
    # floats, unlike the company columns).
    .astype({
        'has_benefits': int,
        'benefits_count': int,
        'company_size': int,
        'employee_count': int,
        'follower_count': int,
    })
)


In [44]:
# Text columns that are normalised to lower-case strings; missing values
# become the empty string first.
text_cols = [
    'company_name',
    'formatted_work_type',
    'application_type',
    'formatted_experience_level',
    'state_only',
    'original_listed_month',
    'industry_category',
    'skills_list',
]

df_merged[text_cols] = (
    df_merged[text_cols]
    .fillna('')
    .astype(str)
    .apply(lambda column: column.str.lower())
)

In [45]:
# Persist the merged frame as dimensional_model.merge, replacing any old copy.
df_merged.to_sql(name='merge', con=engine, schema='dimensional_model', if_exists='replace', index=False)

849