In [2]:
import os

import pandas as pd
from sqlalchemy import create_engine

# Connection URL for the demo MySQL database. It is read from the
# ETL_DEMO_DB_URL environment variable so that real credentials never need to
# be written into the notebook; when the variable is unset we fall back to the
# local, passwordless root account the demo was originally written against.
DB_URL = os.environ.get(
    "ETL_DEMO_DB_URL",
    "mysql+pymysql://root:@localhost/etl_demo",
)
engine = create_engine(DB_URL)

# Source data for the pipeline; the path is relative to the notebook's
# working directory.
df_raw = pd.read_csv("employees_1000.csv")


In [3]:
# Stage the CSV rows unchanged in MySQL as `employees_raw`.
# if_exists="replace" recreates the table on each run (the author marked this
# setting as intended for the first load only); index=False keeps the
# DataFrame index from being written as an extra column.
df_raw.to_sql("employees_raw", engine, if_exists="replace", index=False)


1000

# Build employees_extended: a cleaned copy of employees_raw in which missing
# values are filled in and a tenure column is derived from hire_date.
DROP TABLE IF EXISTS employees_extended;

CREATE TABLE employees_extended AS
SELECT
    r.name,

    /* Missing numeric values fall back to the table-wide average */
    COALESCE(r.age, stats.avg_age)       AS age,
    COALESCE(r.salary, stats.avg_salary) AS salary,

    /* Missing categorical values get an explicit placeholder label */
    COALESCE(r.city, 'Unknown')       AS city,
    COALESCE(r.department, 'Unknown') AS department,

    /* Missing hire dates default to 2015-01-01 */
    COALESCE(r.hire_date, '2015-01-01') AS hire_date,

    /* Tenure in years: days between the (defaulted) hire date and today,
       divided by 365.25 */
    TIMESTAMPDIFF(DAY, COALESCE(r.hire_date, '2015-01-01'), CURDATE()) / 365.25
        AS years_of_service

FROM employees_raw AS r
/* Single-row derived table holding the global averages used above */
CROSS JOIN (
    SELECT
        AVG(age)    AS avg_age,
        AVG(salary) AS avg_salary
    FROM employees_raw
) AS stats;


# Create the BI table: one row per (department, city) combination with the
# head count, average salary and average tenure taken from employees_extended.
# Any previous version is dropped first so the script can be re-run; without
# the DROP, CREATE TABLE fails once employees_analytics already exists.
DROP TABLE IF EXISTS employees_analytics;

CREATE TABLE employees_analytics AS
SELECT
    department,
    city,
    COUNT(*) AS employee_count,
    AVG(salary) AS avg_salary,
    AVG(years_of_service) AS avg_tenure
FROM employees_extended
GROUP BY department, city;


In [6]:
# Pull at most 20 rows of the cleaned table back into pandas for a quick look.
df_extended = pd.read_sql(sql="SELECT * FROM employees_extended LIMIT 20", con=engine)

df_extended


Unnamed: 0,name,age,salary,city,department,hire_date,years_of_service
0,Person1,58.0,74327.0,Bergen,Unknown,2017-08-15,8.3943
1,Person2,48.0,98904.0,Tromsø,Support,2019-12-24,6.037
2,Person3,34.0,33797.0,Oslo,Marketing,2024-05-08,1.6646
3,Person4,27.0,77882.0,Bergen,Sales,2016-11-05,9.1691
4,Person5,40.0,43718.0,Unknown,Unknown,2019-12-25,6.0342
5,Person6,58.0,64560.0,Bergen,IT,2017-03-30,8.7721
6,Person7,39.945006,74248.210177,Stavanger,Unknown,2016-06-07,9.5825
7,Person8,42.0,52116.0,Stavanger,Unknown,2015-04-17,10.7242
8,Person9,30.0,66244.0,Stavanger,HR,2019-01-08,6.9952
9,Person10,30.0,31645.0,Kristiansand,Marketing,2014-10-24,11.2033


In [7]:
# Load the whole aggregated (department, city) table for inspection.
df_analytics = pd.read_sql(sql="SELECT * FROM employees_analytics", con=engine)

df_analytics


Unnamed: 0,department,city,employee_count,avg_salary,avg_tenure
0,Engineering,Bergen,26,64514.716559,8.999315
1,Engineering,Kristiansand,27,75637.475582,7.034452
2,Engineering,Oslo,21,77023.581437,10.036971
3,Engineering,Stavanger,17,74009.764706,7.251782
4,Engineering,Tromsø,16,80031.401272,10.274981
5,Engineering,Trondheim,18,76714.602262,7.42795
6,Engineering,Unknown,13,77280.0,9.015108
7,Finance,Bergen,21,77580.31574,9.88899
8,Finance,Kristiansand,24,71014.567939,7.926538
9,Finance,Oslo,21,77148.696692,9.334257


In [8]:
# Show the three pipeline stages one after another: a heading followed by the
# result of the matching query, rendered with IPython's rich display.
print("RAW → EXTENDED → ANALYTICS")

for stage_heading, stage_query in [
    ("\nRAW sample:", "SELECT * FROM employees_raw LIMIT 5"),
    ("\nEXTENDED sample:", "SELECT * FROM employees_extended LIMIT 5"),
    ("\nANALYTICS:", "SELECT * FROM employees_analytics"),
]:
    print(stage_heading)
    display(pd.read_sql(stage_query, engine))


RAW → EXTENDED → ANALYTICS

RAW sample:


Unnamed: 0,name,age,salary,city,department,hire_date
0,Person1,58.0,74327.0,Bergen,,2017-08-15
1,Person2,48.0,98904.0,Tromsø,Support,2019-12-24
2,Person3,34.0,33797.0,Oslo,Marketing,2024-05-08
3,Person4,27.0,77882.0,Bergen,Sales,2016-11-05
4,Person5,40.0,43718.0,,,2019-12-25



EXTENDED sample:


Unnamed: 0,name,age,salary,city,department,hire_date,years_of_service
0,Person1,58.0,74327.0,Bergen,Unknown,2017-08-15,8.3943
1,Person2,48.0,98904.0,Tromsø,Support,2019-12-24,6.037
2,Person3,34.0,33797.0,Oslo,Marketing,2024-05-08,1.6646
3,Person4,27.0,77882.0,Bergen,Sales,2016-11-05,9.1691
4,Person5,40.0,43718.0,Unknown,Unknown,2019-12-25,6.0342



ANALYTICS:


Unnamed: 0,department,city,employee_count,avg_salary,avg_tenure
0,Engineering,Bergen,26,64514.716559,8.999315
1,Engineering,Kristiansand,27,75637.475582,7.034452
2,Engineering,Oslo,21,77023.581437,10.036971
3,Engineering,Stavanger,17,74009.764706,7.251782
4,Engineering,Tromsø,16,80031.401272,10.274981
5,Engineering,Trondheim,18,76714.602262,7.42795
6,Engineering,Unknown,13,77280.0,9.015108
7,Finance,Bergen,21,77580.31574,9.88899
8,Finance,Kristiansand,24,71014.567939,7.926538
9,Finance,Oslo,21,77148.696692,9.334257
