## **PROJECT - NOTEBOOK #1: Raw Data**

> 🚧 Run this notebook only once to migrate the data to your DB (It will take a few minutes)

This notebook sets up our data pipeline by configuring the environment, importing essential libraries and create a SQLAlchemy engine, then loading raw data from the CSV file into a Pandas DataFrame, transferring this data into a MySQL database, and verifying the transfer with a simple query.

---

In [2]:
import pandas as pd
import numpy as np

In [3]:
pd.set_option('display.float_format', '{:.2f}'.format)

In [4]:
df_postings = pd.read_csv('../data/postings.csv')

In [5]:
df_postings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 31 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      123849 non-null  int64  
 1   company_name                122130 non-null  object 
 2   title                       123849 non-null  object 
 3   description                 123842 non-null  object 
 4   max_salary                  29793 non-null   float64
 5   pay_period                  36073 non-null   object 
 6   location                    123849 non-null  object 
 7   company_id                  122132 non-null  float64
 8   views                       122160 non-null  float64
 9   med_salary                  6280 non-null    float64
 10  min_salary                  29793 non-null   float64
 11  formatted_work_type         123849 non-null  object 
 12  applies                     23320 non-null   float64
 13  original_liste

In [6]:
df_postings.describe()

Unnamed: 0,job_id,max_salary,company_id,views,med_salary,min_salary,applies,original_listed_time,remote_allowed,expiry,closed_time,listed_time,sponsored,normalized_salary,zip_code,fips
count,123849.0,29793.0,122132.0,122160.0,6280.0,29793.0,23320.0,123849.0,15246.0,123849.0,1073.0,123849.0,123849.0,36073.0,102977.0,96434.0
mean,3896402138.07,91939.42,12204012.34,14.62,22015.62,64910.85,10.59,1713152338799.94,1.0,1716213036608.15,1712927892086.67,1713204445996.68,0.0,205327.04,50400.49,28713.88
std,84043545.16,701110.14,25541431.66,85.9,52255.87,495973.79,29.05,484820878.41,0.0,2321393898.08,362289347.41,398912198.59,0.0,5097626.76,30252.23,16015.93
min,921716.0,1.0,1009.0,1.0,0.0,1.0,1.0,1701810533000.0,1.0,1712903448000.0,1712345932000.0,1711317014000.0,0.0,0.0,1001.0,1003.0
25%,3894586595.0,48.28,14352.0,3.0,18.94,37.0,1.0,1712862876000.0,1.0,1715480718000.0,1712669760000.0,1712885544000.0,0.0,52000.0,24112.0,13121.0
50%,3901998406.0,80000.0,226965.0,4.0,25.5,60000.0,3.0,1713395039000.0,1.0,1716042264000.0,1712669910000.0,1713407593000.0,0.0,81500.0,48059.0,29183.0
75%,3904707077.0,140000.0,8047188.0,8.0,2510.5,100000.0,8.0,1713478324000.0,1.0,1716088229000.0,1713282569000.0,1713483628000.0,0.0,125000.0,78201.0,42077.0
max,3906267224.0,120000000.0,103472979.0,9975.0,750000.0,85000000.0,967.0,1713572803000.0,1.0,1729124796000.0,1713562107000.0,1713572816000.0,0.0,535600000.0,99901.0,56045.0


In [7]:
df_postings.head()

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1713397508000.0,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1712857887000.0,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1713277614000.0,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,This position requires a baseline understandin...,1712895812000.0,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,1713451943000.0,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0


In [8]:
missing_counts = df_postings.isna().sum()
missing_perc   = (missing_counts / len(df_postings) * 100).round(2)
pd.DataFrame({
    'missing_count': missing_counts,
    'missing_perc':   missing_perc
})

Unnamed: 0,missing_count,missing_perc
job_id,0,0.0
company_name,1719,1.39
title,0,0.0
description,7,0.01
max_salary,94056,75.94
pay_period,87776,70.87
location,0,0.0
company_id,1717,1.39
views,1689,1.36
med_salary,117569,94.93


In [9]:
percentiles = [0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99]
cols = ['min_salary', 'max_salary', 'med_salary', 'normalized_salary', 'views', 'applies']
df_postings[cols].quantile(percentiles)

Unnamed: 0,min_salary,max_salary,med_salary,normalized_salary,views,applies
0.01,13.92,16.0,12.0,130.0,1.0,1.0
0.05,16.7,20.0,15.0,33280.0,2.0,1.0
0.25,37.0,48.28,18.94,52000.0,3.0,1.0
0.5,60000.0,80000.0,25.5,81500.0,4.0,3.0
0.75,100000.0,140000.0,2510.5,125000.0,8.0,8.0
0.95,166080.0,245000.0,120000.0,200000.0,48.0,45.0
0.99,247696.8,352148.0,225000.0,300808.0,177.0,133.81


In [10]:
print("Duplicated:", df_postings.duplicated().sum())

Duplicated: 0


In [11]:
cat_cols = [
    'pay_period',
    'formatted_work_type',
    'application_type',
    'work_type',
    'formatted_experience_level',
    'currency',
    'compensation_type'
]

for col in cat_cols:
    print(f"=== {col} ===")
    print(df_postings[col].value_counts(dropna=False).head(10))
    print()

=== pay_period ===
NaN         87776
YEARLY      20628
HOURLY      14741
MONTHLY       518
WEEKLY        177
BIWEEKLY        9
Name: pay_period, dtype: int64

=== formatted_work_type ===
Full-time     98814
Contract      12117
Part-time      9696
Temporary      1190
Internship      983
Volunteer       562
Other           487
Name: formatted_work_type, dtype: int64

=== application_type ===
OffsiteApply          84607
ComplexOnsiteApply    31049
SimpleOnsiteApply      8192
UnknownApply              1
Name: application_type, dtype: int64

=== work_type ===
FULL_TIME     98814
CONTRACT      12117
PART_TIME      9696
TEMPORARY      1190
INTERNSHIP      983
VOLUNTEER       562
OTHER           487
Name: work_type, dtype: int64

=== formatted_experience_level ===
Mid-Senior level    41489
Entry level         36708
NaN                 29409
Associate            9826
Director             3746
Internship           1449
Executive            1222
Name: formatted_experience_level, dtype: int64

===

In [12]:
top_locations = df_postings['location'].value_counts().head(20)
print("=== Top 20 locations ===")
print(top_locations)

# Remote allowed
remote_counts = df_postings['remote_allowed'].value_counts(dropna=False)
remote_pct = (remote_counts.get(1.0, 0) / len(df_postings) * 100).round(2)
print("\n=== Remote allowed counts ===")
print(remote_counts)
print(f"\nRemote allowed % of total postings: {remote_pct}%")


=== Top 20 locations ===
United States                      8125
New York, NY                       2756
Chicago, IL                        1834
Houston, TX                        1762
Dallas, TX                         1383
Atlanta, GA                        1363
Boston, MA                         1176
Austin, TX                         1083
Charlotte, NC                      1075
Phoenix, AZ                        1059
Washington, DC                      985
Los Angeles, CA                     972
San Francisco, CA                   884
New York City Metropolitan Area     837
Seattle, WA                         818
San Diego, CA                       790
Denver, CO                          787
Philadelphia, PA                    711
Tampa, FL                           659
Miami, FL                           643
Name: location, dtype: int64

=== Remote allowed counts ===
NaN     108603
1.00     15246
Name: remote_allowed, dtype: int64

Remote allowed % of total postings: 12.31%


In [13]:
df_benefits = pd.read_csv('../data/jobs/benefits.csv')
df_benefits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67943 entries, 0 to 67942
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   job_id    67943 non-null  int64 
 1   inferred  67943 non-null  int64 
 2   type      67943 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.6+ MB


In [14]:
print("Duplicados:", df_benefits.duplicated().sum())

Duplicados: 0


In [15]:
df_benefits.describe()

Unnamed: 0,job_id,inferred
count,67943.0,67943.0
mean,3896219574.65,0.59
std,98172917.62,0.49
min,23221523.0,0.0
25%,3898160860.0,0.0
50%,3902347593.0,1.0
75%,3904718586.5,1.0
max,3906267117.0,1.0


In [16]:
df_benefits.isna().sum()

job_id      0
inferred    0
type        0
dtype: int64

In [17]:
df_job_industries = pd.read_csv('../data/jobs/job_industries.csv')
df_job_industries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164808 entries, 0 to 164807
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   job_id       164808 non-null  int64
 1   industry_id  164808 non-null  int64
dtypes: int64(2)
memory usage: 2.5 MB


In [18]:
print("Duplicados:", df_job_industries.duplicated().sum())

Duplicados: 0


In [19]:
df_job_industries.describe()

Unnamed: 0,job_id,industry_id
count,164808.0,164808.0
mean,3897074144.95,196.16
std,76249300.5,594.23
min,921716.0,1.0
25%,3894876310.0,17.0
50%,3902342291.5,44.0
75%,3904718771.25,96.0
max,3906267224.0,3253.0


In [20]:
df_job_industries.isna().sum()

job_id         0
industry_id    0
dtype: int64

In [21]:
df_job_skills = pd.read_csv('../data/jobs/job_skills.csv')
df_job_skills.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213768 entries, 0 to 213767
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   job_id     213768 non-null  int64 
 1   skill_abr  213768 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.3+ MB


In [22]:
print("Duplicados:", df_job_skills.duplicated().sum())

Duplicados: 0


In [23]:
df_job_skills.describe()

Unnamed: 0,job_id
count,213768.0
mean,3896849267.04
std,78349015.8
min,921716.0
25%,3894660713.25
50%,3902323142.5
75%,3904715490.0
max,3906267224.0


In [24]:
df_job_skills.isna().sum()

job_id       0
skill_abr    0
dtype: int64

In [25]:
df_salaries = pd.read_csv('../data/jobs/salaries.csv')
df_salaries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40785 entries, 0 to 40784
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   salary_id          40785 non-null  int64  
 1   job_id             40785 non-null  int64  
 2   max_salary         33947 non-null  float64
 3   med_salary         6838 non-null   float64
 4   min_salary         33947 non-null  float64
 5   pay_period         40785 non-null  object 
 6   currency           40785 non-null  object 
 7   compensation_type  40785 non-null  object 
dtypes: float64(3), int64(2), object(3)
memory usage: 2.5+ MB


In [26]:
df_salaries.describe()

Unnamed: 0,salary_id,job_id,max_salary,med_salary,min_salary
count,40785.0,40785.0,33947.0,6838.0,33947.0
mean,20393.0,3895563848.87,96209.87,21370.3,65085.41
std,11773.76,94966718.0,658737.34,51338.56,465061.24
min,1.0,921716.0,1.0,0.0,1.0
25%,10197.0,3894608085.0,50.0,18.5,39.0
50%,20393.0,3901980104.0,85000.0,25.0,62300.0
75%,30589.0,3904576109.0,142500.0,2207.0,100000.0
max,40785.0,3906267224.0,120000000.0,750000.0,85000000.0


In [27]:
df_salaries.isna().sum()

salary_id                0
job_id                   0
max_salary            6838
med_salary           33947
min_salary            6838
pay_period               0
currency                 0
compensation_type        0
dtype: int64

In [28]:
n_rows       = len(df_salaries)
n_unique_sid = df_salaries['salary_id'].nunique()
print(f"Rows: {n_rows}, Unique salary_id: {n_unique_sid}")


Rows: 40785, Unique salary_id: 40785


In [29]:
df_companies = pd.read_csv('../data/companies/companies.csv')
df_companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24473 entries, 0 to 24472
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   company_id    24473 non-null  int64  
 1   name          24473 non-null  object 
 2   description   24177 non-null  object 
 3   company_size  21699 non-null  float64
 4   state         24451 non-null  object 
 5   country       24473 non-null  object 
 6   city          24472 non-null  object 
 7   zip_code      24445 non-null  object 
 8   address       24451 non-null  object 
 9   url           24473 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 1.9+ MB


In [30]:
print("Duplicados:", df_companies.duplicated().sum())

Duplicados: 0


In [31]:
df_companies.describe()

Unnamed: 0,company_id,company_size
count,24473.0,21699.0
mean,20522391.62,3.35
std,31659289.15,1.9
min,1009.0,1.0
25%,165404.0,2.0
50%,2738154.0,3.0
75%,26241420.0,5.0
max,103472979.0,7.0


In [32]:
df_companies.isna().sum()

company_id         0
name               0
description      296
company_size    2774
state             22
country            0
city               1
zip_code          28
address           22
url                0
dtype: int64

In [33]:
print("Duplicados:", df_companies.duplicated().sum())

Duplicados: 0


In [34]:
df_emp_counts = pd.read_csv('../data/companies/employee_counts.csv')
df_emp_counts.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35787 entries, 0 to 35786
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   company_id      35787 non-null  int64
 1   employee_count  35787 non-null  int64
 2   follower_count  35787 non-null  int64
 3   time_recorded   35787 non-null  int64
dtypes: int64(4)
memory usage: 1.1 MB


In [35]:
df_emp_counts.describe()

Unnamed: 0,company_id,employee_count,follower_count,time_recorded
count,35787.0,35787.0,35787.0,35787.0
mean,16682540.69,6715.87,201261.63,1713163272.62
std,29247215.54,29400.98,1114732.82,399086.9
min,1009.0,0.0,0.0,1712346173.0
25%,60596.5,56.0,2738.0,1712861427.0
50%,1339209.0,418.0,16178.0,1713393030.0
75%,15440915.0,2945.0,74129.5,1713471600.0
max,103472979.0,751125.0,32702835.0,1713572859.0


In [36]:
df_emp_counts.isna().sum()

company_id        0
employee_count    0
follower_count    0
time_recorded     0
dtype: int64

In [37]:
print("Duplicados:", df_emp_counts.duplicated().sum())

Duplicados: 0


In [38]:
df_industries = pd.read_csv('../data/mappings/industries.csv')
df_industries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   industry_id    422 non-null    int64 
 1   industry_name  388 non-null    object
dtypes: int64(1), object(1)
memory usage: 6.7+ KB


In [39]:
print("Duplicados:", df_industries.duplicated().sum())

Duplicados: 0


In [40]:
df_industries.describe()

Unnamed: 0,industry_id
count,422.0
mean,1342.31
std,1212.02
min,1.0
25%,108.25
50%,1161.5
75%,2279.5
max,3253.0


In [41]:
df_industries.isna().sum()

industry_id       0
industry_name    34
dtype: int64

In [42]:
df_skills = pd.read_csv('../data/mappings/skills.csv')
df_skills.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   skill_abr   35 non-null     object
 1   skill_name  35 non-null     object
dtypes: object(2)
memory usage: 692.0+ bytes


In [43]:
df_skills.describe()

Unnamed: 0,skill_abr,skill_name
count,35,35
unique,35,35
top,ART,Art/Creative
freq,1,1


In [44]:
df_skills.isna().sum()

skill_abr     0
skill_name    0
dtype: int64

In [45]:
dfs = {
    'postings': df_postings,
    'benefits': df_benefits,
    'job_industries': df_job_industries,
    'job_skills': df_job_skills,
    'salaries': df_salaries
}

total_postings = df_postings['job_id'].nunique()
coverage = []
for name, df in dfs.items():
    if 'job_id' in df.columns:
        n_records = len(df)
        n_unique = df['job_id'].nunique()
        pct = (n_unique / total_postings * 100)
        coverage.append({
            'dataset':      name,
            'rows':         n_records,
            'unique_job_id':n_unique,
            '% of postings': pct
        })

print(pd.DataFrame(coverage))

          dataset    rows  unique_job_id  % of postings
0        postings  123849         123849         100.00
1        benefits   67943          30023          24.24
2  job_industries  164808         127125         102.65
3      job_skills  213768         126807         102.39
4        salaries   40785          40785          32.93


In [46]:
dfs_comp = {
    'postings': df_postings,
    'companies': df_companies,
    'employee_counts': df_emp_counts
}

total_companies = df_companies['company_id'].nunique()
comp_cov = []
for name, df in dfs_comp.items():
    n_records = len(df)
    n_unique = df['company_id'].nunique()
    pct = (n_unique / total_companies * 100)
    comp_cov.append({
        'dataset':         name,
        'rows':            n_records,
        'unique_company':  n_unique,
        '% of companies':  pct
    })

print(pd.DataFrame(comp_cov))

           dataset    rows  unique_company  % of companies
0         postings  123849           24474          100.00
1        companies   24473           24473          100.00
2  employee_counts   35787           24473          100.00


In [47]:
print("=== pay_period en postings vs salaries ===")
print(df_postings['pay_period'].value_counts(dropna=False))
print(df_salaries['pay_period'].value_counts(dropna=False))

print("\n=== currency en postings vs salaries ===")
print(df_postings['currency'].value_counts(dropna=False))
print(df_salaries['currency'].value_counts(dropna=False))

=== pay_period en postings vs salaries ===
NaN         87776
YEARLY      20628
HOURLY      14741
MONTHLY       518
WEEKLY        177
BIWEEKLY        9
Name: pay_period, dtype: int64
YEARLY      23768
HOURLY      16289
MONTHLY       539
WEEKLY        180
BIWEEKLY        9
Name: pay_period, dtype: int64

=== currency en postings vs salaries ===
NaN    87776
USD    36058
EUR        6
CAD        3
BBD        2
AUD        2
GBP        2
Name: currency, dtype: int64
USD    40770
EUR        6
CAD        3
BBD        2
AUD        2
GBP        2
Name: currency, dtype: int64


In [48]:
print("=== Unique industry_id en job_industries vs industries ===")
print("job_industries:", df_job_industries['industry_id'].nunique())
print("industries   :", df_industries['industry_id'].nunique())

print("\n=== Unique skill_abr en job_skills vs skills ===")
print("job_skills:", df_job_skills['skill_abr'].nunique())
print("skills    :", df_skills['skill_abr'].nunique())

=== Unique industry_id en job_industries vs industries ===
job_industries: 422
industries   : 422

=== Unique skill_abr en job_skills vs skills ===
job_skills: 35
skills    : 35


In [51]:
from sqlalchemy import create_engine

engine = create_engine('postgresql://root:root@localhost:5432/linkedin')

files = {
    'postings':       '../data/postings.csv',
    'benefits':       '../data/jobs/benefits.csv',
    'job_industries': '../data/jobs/job_industries.csv',
    'job_skills':     '../data/jobs/job_skills.csv',
    'salaries':       '../data/jobs/salaries.csv',
    'companies':      '../data/companies/companies.csv',
    'employee_counts':'../data/companies/employee_counts.csv',
    'industries':     '../data/mappings/industries.csv',
    'skills':         '../data/mappings/skills.csv'
}

for table, path in files.items():
    df = pd.read_csv(path)
    df.to_sql(
        name=table,
        con=engine,
        schema='raw',
        if_exists='replace',
        index=False
    )
    print(f"Cargado raw.{table}")


Cargado raw.postings
Cargado raw.benefits
Cargado raw.job_industries
Cargado raw.job_skills
Cargado raw.salaries
Cargado raw.companies
Cargado raw.employee_counts
Cargado raw.industries
Cargado raw.skills
