In [204]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import re

In [205]:
engine = create_engine('postgresql://root:root@localhost:5432/linkedin')

In [206]:
pd.set_option('display.float_format', '{:.2f}'.format)

In [207]:
df = pd.read_sql_table('postings', schema='raw', con=engine)

In [208]:
df = df.drop_duplicates(subset=['job_id'])
df['job_id']     = df['job_id'].astype(str)
df['company_id'] = df['company_id'].astype(str)
df['zip_code'] = df['zip_code'].astype(str).replace('<NA>', '')

In [209]:
df['listed_time'] = pd.to_datetime(df['listed_time'], unit='ms', errors='coerce')
df['listed_date'] = df['listed_time'].dt.date
df['listed_hour'] = df['listed_time'].dt.hour
df['listed_wday'] = df['listed_time'].dt.day_name()

In [210]:
df['listed_date'] = pd.to_datetime(df['listed_date'], errors='coerce')

In [211]:
def to_annual(row):
    val = row['med_salary'] if not pd.isna(row['med_salary']) else row['max_salary']
    pp  = row['pay_period']
    if pd.isna(val):
        return np.nan
    if pp == 'HOURLY':
        return val * 40 * 52
    if pp == 'MONTHLY':
        return val * 12
    if pp == 'WEEKLY':
        return val * 52
    if pp == 'BIWEEKLY':
        return val * 26
    return val

df['salary_annual'] = df.apply(to_annual, axis=1)

In [212]:
df['remote_allowed'] = df['remote_allowed'].fillna(0).astype(bool)
df['sponsored']      = df['sponsored'].astype(bool)
df['formatted_experience_level'] = (
    df['formatted_experience_level']
      .fillna('Not specified')
)

In [213]:
state_abbrev = {
    'Alabama':'AL','Alaska':'AK','Arizona':'AZ','Arkansas':'AR','California':'CA',
    'Colorado':'CO','Connecticut':'CT','Delaware':'DE','Florida':'FL','Georgia':'GA',
    'Hawaii':'HI','Idaho':'ID','Illinois':'IL','Indiana':'IN','Iowa':'IA','Kansas':'KS',
    'Kentucky':'KY','Louisiana':'LA','Maine':'ME','Maryland':'MD','Massachusetts':'MA',
    'Michigan':'MI','Minnesota':'MN','Mississippi':'MS','Missouri':'MO','Montana':'MT',
    'Nebraska':'NE','Nevada':'NV','New Hampshire':'NH','New Jersey':'NJ','New Mexico':'NM',
    'New York':'NY','North Carolina':'NC','North Dakota':'ND','Ohio':'OH','Oklahoma':'OK',
    'Oregon':'OR','Pennsylvania':'PA','Rhode Island':'RI','South Carolina':'SC','South Dakota':'SD',
    'Tennessee':'TN','Texas':'TX','Utah':'UT','Vermont':'VT','Virginia':'VA','Washington':'WA',
    'West Virginia':'WV','Wisconsin':'WI','Wyoming':'WY','District of Columbia':'DC'
}
us_states = set(state_abbrev.values())


In [214]:
def parse_location_refined(loc: str) -> pd.Series:
    parts = [p.strip() for p in loc.split(',')]
    suffix_pattern = r'\s*(Metropolitan Area|Area)$'
    if len(parts) == 1:
        raw = parts[0]
        city = re.sub(suffix_pattern, '', raw)
        if raw in state_abbrev:
            return pd.Series({'city':'', 'state':state_abbrev[raw], 'country':'United States'})
        if raw in us_states:
            return pd.Series({'city':'', 'state':raw, 'country':'United States'})
        if raw == 'United States':
            return pd.Series({'city':'', 'state':'', 'country':'United States'})
        return pd.Series({'city':city, 'state':'', 'country':''})
    
    if len(parts) == 2:
        city_raw, sec = parts
        city = re.sub(suffix_pattern, '', city_raw)
        sec_clean = re.sub(suffix_pattern, '', sec)
        if sec_clean in us_states or sec_clean in state_abbrev:
            state = sec_clean if sec_clean in us_states else state_abbrev[sec_clean]
            return pd.Series({'city':city, 'state':state, 'country':'United States'})
        else:
            return pd.Series({'city':city, 'state':'', 'country':sec_clean})
    city = re.sub(suffix_pattern, '', parts[0])
    sec_clean = re.sub(suffix_pattern, '', parts[-2])
    state = sec_clean if sec_clean in us_states else state_abbrev.get(sec_clean, '')
    country = parts[-1]
    return pd.Series({'city':city, 'state':state, 'country':country})

parsed = df['location'].apply(parse_location_refined)
df = df.drop(columns=['city','state','country'], errors='ignore')
df = pd.concat([df, parsed], axis=1)

In [215]:
print("Top estados:")
print(df['state'].value_counts().head(20))
print("\nTop países:")
print(df['country'].value_counts().head(20))
print("\nTop ciudades:")
print(df['city'].value_counts().head(20))

Top estados:
      17266
CA    11700
TX    10666
NY     6187
FL     5961
NC     4954
IL     4494
PA     4158
VA     3678
OH     3574
MA     3531
GA     3430
NJ     3289
MI     2865
WA     2723
AZ     2524
CO     2337
MD     1977
MO     1947
TN     1887
Name: state, dtype: int64

Top países:
United States    118605
                   5240
ON                    1
Netherlands           1
QC                    1
The Gambia            1
Name: country, dtype: int64

Top ciudades:
                 8125
New York         3404
Chicago          1836
Houston          1776
Atlanta          1607
Dallas           1394
Los Angeles      1377
Austin           1325
Boston           1202
Washington       1118
Charlotte        1086
Phoenix          1062
Denver            906
San Francisco     887
San Diego         867
New York City     837
Seattle           819
San Antonio       769
Columbus          733
Philadelphia      732
Name: city, dtype: int64


In [216]:
df.loc[df['salary_annual'] == 0, 'salary_annual'] = pd.NA

In [217]:
cols_to_drop = [
    'max_salary', 'min_salary', 'med_salary', 'normalized_salary',
    'pay_period', 'currency', 'compensation_type',
    'original_listed_time', 'listed_time', 'expiry', 'closed_time',
    'job_posting_url', 'application_url', 'posting_domain',
    'description', 'skills_desc',
    'company_name', 'work_type',
    'fips'
]
df = df.drop(columns=cols_to_drop)

In [218]:
df_b = pd.read_sql_table('benefits', schema='raw', con=engine)
df_b = (
    df_b
    .rename(columns={'type': 'benefit_type'})
    .astype({'job_id': str})
    .drop_duplicates()
)
df_b.to_sql('benefits', schema='cleaned', con=engine, if_exists='replace', index=False)

943

In [219]:
df_ji = pd.read_sql_table('job_industries', schema='raw', con=engine)
df_ji = (
    df_ji
    .astype({'job_id': str, 'industry_id': str})
    .drop_duplicates(subset=['job_id','industry_id'])
)
df_ji.to_sql('job_industries', schema='cleaned', con=engine, if_exists='replace', index=False)

808

In [220]:
df_js = pd.read_sql_table('job_skills', schema='raw', con=engine)
df_js['job_id']    = df_js['job_id'].astype(str)
df_js['skill_abr'] = df_js['skill_abr'].str.upper()
df_js = df_js.drop_duplicates(subset=['job_id','skill_abr'])
df_js.to_sql('job_skills', schema='cleaned', con=engine, if_exists='replace', index=False)

768

In [221]:
df_s = pd.read_sql_table('salaries', schema='raw', con=engine)

def to_annual(row):
    val = row['med_salary'] if not pd.isna(row['med_salary']) else row['max_salary']
    pp  = row['pay_period']
    if pd.isna(val):
        return np.nan
    if pp == 'HOURLY':
        return val * 40 * 52
    if pp == 'MONTHLY':
        return val * 12
    if pp == 'WEEKLY':
        return val * 52
    if pp == 'BIWEEKLY':
        return val * 26
    return val

df_s['salary_annual'] = df_s.apply(to_annual, axis=1)
df_s = (
    df_s
    .assign(job_id=df_s['job_id'].astype(str))
    .groupby('job_id', as_index=False)['salary_annual']
    .median()
)
df_s.to_sql('salaries', schema='cleaned', con=engine, if_exists='replace', index=False)

785

In [222]:
df_c = pd.read_sql_table('companies', schema='raw', con=engine)
df_c['company_id'] = df_c['company_id'].astype(str)

median_size = df_c['company_size'].median()
df_c['company_size'] = df_c['company_size'].fillna(median_size)
for col in ['city','state','zip_code']:
    df_c[col] = df_c[col].fillna('')

df_c = df_c[['company_id','name','company_size','city','state','country','zip_code']]
df_c.to_sql('companies', schema='cleaned', con=engine, if_exists='replace', index=False)

473

In [223]:
df_ec = pd.read_sql_table('employee_counts', schema='raw', con=engine)
df_ec['company_id'] = df_ec['company_id'].astype(str)
df_ec['time_recorded'] = pd.to_datetime(df_ec['time_recorded'], unit='s', errors='coerce')
df_ec = (
    df_ec
    .sort_values('time_recorded')
    .drop_duplicates(subset=['company_id'], keep='last')
    [['company_id','employee_count','follower_count']]
)
df_ec.to_sql('employee_counts', schema='cleaned', con=engine, if_exists='replace', index=False)


473

In [224]:
df_i = pd.read_sql_table('industries', schema='raw', con=engine)
df_i['industry_id']   = df_i['industry_id'].astype(str)
df_i['industry_name'] = df_i['industry_name'].fillna('Unknown')
df_i = df_i.drop_duplicates(subset=['industry_id'])
df_i.to_sql('industries', schema='cleaned', con=engine, if_exists='replace', index=False)

422

In [225]:
df_sk = pd.read_sql_table('skills', schema='raw', con=engine)
df_sk = (
    df_sk
    .astype({'skill_abr': str, 'skill_name': str})
    .drop_duplicates(subset=['skill_abr'])
)
df_sk.to_sql('skills', schema='cleaned', con=engine, if_exists='replace', index=False)


35

In [226]:
df.to_sql(
    name='postings',
    schema='cleaned',
    con=engine,
    if_exists='replace',
    index=False
)

df_b.to_sql(
    name='benefits',
    schema='cleaned',
    con=engine,
    if_exists='replace',
    index=False
)

df_ji.to_sql(
    name='job_industries',
    schema='cleaned',
    con=engine,
    if_exists='replace',
    index=False
)

df_js.to_sql(
    name='job_skills',
    schema='cleaned',
    con=engine,
    if_exists='replace',
    index=False
)

df_s.to_sql(
    name='salaries',
    schema='cleaned',
    con=engine,
    if_exists='replace',
    index=False
)

df_c.to_sql(
    name='companies',
    schema='cleaned',
    con=engine,
    if_exists='replace',
    index=False
)

df_ec.to_sql(
    name='employee_counts',
    schema='cleaned',
    con=engine,
    if_exists='replace',
    index=False
)

df_i.to_sql(
    name='industries',
    schema='cleaned',
    con=engine,
    if_exists='replace',
    index=False
)

df_sk.to_sql(
    name='skills',
    schema='cleaned',
    con=engine,
    if_exists='replace',
    index=False
)


35

In [227]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 123849 entries, 0 to 123848
Data columns (total 19 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   job_id                      123849 non-null  object        
 1   title                       123849 non-null  object        
 2   location                    123849 non-null  object        
 3   company_id                  123849 non-null  object        
 4   views                       122160 non-null  float64       
 5   formatted_work_type         123849 non-null  object        
 6   applies                     23320 non-null   float64       
 7   remote_allowed              123849 non-null  bool          
 8   application_type            123849 non-null  object        
 9   formatted_experience_level  123849 non-null  object        
 10  sponsored                   123849 non-null  bool          
 11  zip_code                    123849 non-

In [228]:
df.describe()

Unnamed: 0,views,applies,listed_hour,salary_annual
count,122160.0,23320.0,123849.0,36059.0
mean,14.62,10.59,14.34,229282.45
std,85.9,29.05,7.3,5541950.63
min,1.0,1.0,0.0,1.0
25%,3.0,1.0,8.0,56160.0
50%,4.0,3.0,17.0,90000.0
75%,8.0,8.0,21.0,141439.5
max,9975.0,967.0,23.0,572000000.0


In [229]:
df.head()

Unnamed: 0,job_id,title,location,company_id,views,formatted_work_type,applies,remote_allowed,application_type,formatted_experience_level,sponsored,zip_code,listed_date,listed_hour,listed_wday,salary_annual,city,state,country
0,3884433322,Process Engineer,"Jim Thorpe, PA",36519.0,7.0,Full-time,2.0,False,ComplexOnsiteApply,Associate,False,18229.0,2024-04-05,19,Friday,90000.0,Jim Thorpe,PA,United States
1,3884433337,Accounting and Finance Consulting Opportunities,"Houston, TX",5235.0,5.0,Contract,1.0,False,ComplexOnsiteApply,Mid-Senior level,False,77002.0,2024-04-05,19,Friday,,Houston,TX,United States
2,3884433350,Account Sales Executive- Fresno/ Clovis,"Fresno County, CA",488066.0,5.0,Full-time,,False,ComplexOnsiteApply,Associate,False,93650.0,2024-04-05,19,Friday,,Fresno County,CA,United States
3,3884433360,2024-2025 Grade 1 Teacher Goynes ES,"Las Vegas, NV",9025.0,5.0,Full-time,,False,OffsiteApply,Entry level,False,89101.0,2024-04-05,20,Friday,,Las Vegas,NV,United States
4,3884433363,Healthcare QA,"Danvers, MA",3785467.0,9.0,Contract,,False,ComplexOnsiteApply,Not specified,False,1923.0,2024-04-05,19,Friday,,Danvers,MA,United States


In [230]:
df['country'].unique()

array(['United States', '', 'ON', 'Netherlands', 'QC', 'The Gambia'],
      dtype=object)

In [231]:
df.isna().sum()

job_id                             0
title                              0
location                           0
company_id                         0
views                           1689
formatted_work_type                0
applies                       100529
remote_allowed                     0
application_type                   0
formatted_experience_level         0
sponsored                          0
zip_code                           0
listed_date                        0
listed_hour                        0
listed_wday                        0
salary_annual                  87790
city                               0
state                              0
country                            0
dtype: int64

In [232]:
city_counts = (
    df['city']
      .fillna('')
      .value_counts()
      .reset_index()
      .rename(columns={'index': 'city', 'city': 'count'})
)

In [233]:
print(city_counts)

               city  count
0                     8125
1          New York   3404
2           Chicago   1836
3           Houston   1776
4           Atlanta   1607
...             ...    ...
6426           Wise      1
6427  Scotch Plains      1
6428    Bay Minette      1
6429      New Roads      1
6430      Lindstrom      1

[6431 rows x 2 columns]


In [234]:
state_counts = (
    df['state']
      .fillna('')
      .value_counts()
      .reset_index()
      .rename(columns={'index': 'state', 'state': 'count'})
)

In [235]:
print(state_counts)

   state  count
0         17266
1     CA  11700
2     TX  10666
3     NY   6187
4     FL   5961
5     NC   4954
6     IL   4494
7     PA   4158
8     VA   3678
9     OH   3574
10    MA   3531
11    GA   3430
12    NJ   3289
13    MI   2865
14    WA   2723
15    AZ   2524
16    CO   2337
17    MD   1977
18    MO   1947
19    TN   1887
20    WI   1860
21    MN   1851
22    IN   1818
23    SC   1625
24    OR   1262
25    CT   1191
26    KY   1179
27    LA   1117
28    AL   1028
29    DC    995
30    IA    995
31    UT    975
32    KS    947
33    NV    911
34    OK    798
35    AR    668
36    NE    607
37    NH    559
38    NM    502
39    HI    432
40    WV    416
41    ID    413
42    MS    387
43    ME    384
44    DE    322
45    RI    306
46    MT    236
47    ND    235
48    AK    209
49    VT    181
50    SD    167
51    WY    125


In [236]:
country_counts = (
    df['country']
      .fillna('')
      .value_counts()
      .reset_index()
      .rename(columns={'index': 'country', 'country': 'count'})
)

In [None]:
print(country_counts)

         country   count
0  United States  118605
1                   5240
2             ON       1
3    Netherlands       1
4             QC       1
5     The Gambia       1


: 