In [1]:
import pandas as pd
import os
from sqlalchemy import create_engine, text
from sqlalchemy.exc import ProgrammingError

In [2]:
companies_df = pd.read_csv(r'data\companies.csv', sep=';')
employees_df = pd.read_csv(r'data\employees.csv', sep=';')
functions_df = pd.read_csv(r'data\functions.csv', sep=';')
salaries_df = pd.read_csv(r'data\salaries.csv', sep=';')


In [3]:
companies_df.head()

Unnamed: 0,company_name,company_city,company_state,company_type,const_site_category
0,DM Company Head Quarters,Goiania,GOIAS,Administration,
1,DM Company Development Center,Goiania,GOIAS,Development Center,
2,The Haven at Rocky Ridge,Anapolis,GOIAS,Construction Site s,Residential
3,DM Company RH Center,Goiania,GOIAS,Administration,
4,DM Company Administration Center,Goiania,GOIAS,Administration,


In [4]:
employees_df.head()

Unnamed: 0,comp_code_emp,employee_code_emp,employee_name_emp,GEN(M_F),age
0,1,9875,Dawson Crawford,M,25
1,ELMO,206,Bryce Bennett,F,28
2,83,29417,Robert Lewis,M,24
3,83,18879,Terence Buck,M,35
4,83,18883,Kevin Walker,M,35


In [5]:
functions_df.head()

Unnamed: 0,function_code,function,function_group
0,25,Lawyer,Managers
1,202,Tax and Accounting Analyst,Administration
2,109,Senior Payroll Department Analyst,Administration
3,229,Engineering Analyst,Engineering
4,238,Financial Analyst,Administration


In [6]:
salaries_df.head()

Unnamed: 0,comp_code,comp_name,employee_id,employee_name,date,func_code,func,salary
0,83,DM Company Head Quarters,26193,Jacob Smith,01/01/2022 00:00,25,Lawyer,633556
1,83,DM Company Head Quarters,25322,Michael Johnson,01/01/2022 00:00,202,Tax and Accounting Analyst,261935
2,83,DM Company Head Quarters,27602,Matthew Williams,01/01/2022 00:00,109,Senior Payroll Department Analyst,122167
3,83,DM Company Head Quarters,27127,Joshua Brown,01/01/2022 00:00,13,Engineering Assistant,4000
4,83,DM Company Head Quarters,23007,Christopher Jones,01/01/2022 00:00,238,Financial Analyst,301252


In [7]:
# Create an SQLite engine
engine = create_engine('sqlite:///employee_data.db')

# Save dataframes to SQL
companies_df.to_sql('companies', engine, index=False, if_exists='replace')
employees_df.to_sql('employees', engine, index=False, if_exists='replace')
functions_df.to_sql('functions', engine, index=False, if_exists='replace')
salaries_df.to_sql('salaries', engine, index=False, if_exists='replace')

8049

In [8]:
display(pd.read_sql('SELECT * FROM employees\
            Limit 5;'
            , con=engine.connect()))

display(pd.read_sql('SELECT * FROM companies\
            Limit 5;'
            , con=engine.connect()))

display(pd.read_sql('SELECT * FROM functions\
            Limit 5;'
            , con=engine.connect()))

display(pd.read_sql('SELECT * FROM salaries\
            Limit 5;'
            , con=engine.connect()))

Unnamed: 0,comp_code_emp,employee_code_emp,employee_name_emp,GEN(M_F),age
0,1,9875,Dawson Crawford,M,25
1,ELMO,206,Bryce Bennett,F,28
2,83,29417,Robert Lewis,M,24
3,83,18879,Terence Buck,M,35
4,83,18883,Kevin Walker,M,35


Unnamed: 0,company_name,company_city,company_state,company_type,const_site_category
0,DM Company Head Quarters,Goiania,GOIAS,Administration,
1,DM Company Development Center,Goiania,GOIAS,Development Center,
2,The Haven at Rocky Ridge,Anapolis,GOIAS,Construction Site s,Residential
3,DM Company RH Center,Goiania,GOIAS,Administration,
4,DM Company Administration Center,Goiania,GOIAS,Administration,


Unnamed: 0,function_code,function,function_group
0,25,Lawyer,Managers
1,202,Tax and Accounting Analyst,Administration
2,109,Senior Payroll Department Analyst,Administration
3,229,Engineering Analyst,Engineering
4,238,Financial Analyst,Administration


Unnamed: 0,comp_code,comp_name,employee_id,employee_name,date,func_code,func,salary
0,83,DM Company Head Quarters,26193,Jacob Smith,01/01/2022 00:00,25,Lawyer,633556
1,83,DM Company Head Quarters,25322,Michael Johnson,01/01/2022 00:00,202,Tax and Accounting Analyst,261935
2,83,DM Company Head Quarters,27602,Matthew Williams,01/01/2022 00:00,109,Senior Payroll Department Analyst,122167
3,83,DM Company Head Quarters,27127,Joshua Brown,01/01/2022 00:00,13,Engineering Assistant,4000
4,83,DM Company Head Quarters,23007,Christopher Jones,01/01/2022 00:00,238,Financial Analyst,301252


# Exercise 1: Building a Comprehensive Dataset for Employee Analysis

In [9]:
# SQL query to create the combined table
query = """

CREATE TABLE consolidated_data AS 
    SELECT
        s.employee_id,
        DATE(STRFTIME('%Y-%m-%d', SUBSTR(s.date, 7, 4) || '-' || SUBSTR(s.date, 4, 2) || '-' || SUBSTR(s.date, 1, 2))) AS formatted_date,
        s.salary,
        s.func_code,
        s.comp_code,
        e.employee_name_emp AS employee_name,
        e.`GEN(M_F)` AS gender,
        e.age,
        f.function,
        f.function_group,
        c.company_name,
        c.company_city,
        c.company_state,
        c.company_type,
        c.const_site_category
    FROM salaries s
    LEFT JOIN employees e ON s.employee_id = e.employee_code_emp
    LEFT JOIN functions f ON s.func_code = f.function_code
    LEFT JOIN companies c ON s.comp_name = c.company_name;
"""

# Execute the query to create the new table
with engine.connect() as connection:
    connection.execute(text('DROP TABLE IF EXISTS consolidated_data;'))
    connection.execute(text(query))

# Load the new table into a Pandas DataFrame

df = pd.read_sql('SELECT * FROM consolidated_data', engine)

df

Unnamed: 0,employee_id,formatted_date,salary,func_code,comp_code,employee_name,gender,age,function,function_group,company_name,company_city,company_state,company_type,const_site_category
0,26193,2022-01-01,633556,25,83,Jacob Smith,M,38,Lawyer,Managers,DM Company Head Quarters,Goiania,GOIAS,Administration,
1,25322,2022-01-01,261935,202,83,Michael Johnson,M,30,Tax and Accounting Analyst,Administration,DM Company Head Quarters,Goiania,GOIAS,Administration,
2,27602,2022-01-01,122167,109,83,Matthew Williams,M,28,Senior Payroll Department Analyst,Administration,DM Company Head Quarters,Goiania,GOIAS,Administration,
3,27127,2022-01-01,4000,13,83,Joshua Brown,M,25,Engineering Assistant,Engineering,DM Company Head Quarters,Goiania,GOIAS,Administration,
4,23007,2022-01-01,301252,238,83,Christopher Jones,M,25,Financial Analyst,Administration,DM Company Head Quarters,Goiania,GOIAS,Administration,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8044,32436,2023-01-01,185416,3,1,Ana Stewart,F,34,Carpenter,Professionals,The Sanctuary at Briarcliff,Brasilia,Distrito Federal,Construction Site,Residential
8045,28178,2023-01-01,183182,3,1,Rosa Richards,F,20,Carpenter,Professionals,The Sanctuary at Briarcliff,Brasilia,Distrito Federal,Construction Site,Residential
8046,28807,2023-01-01,1870,3,1,Shawn Cole,M,34,Carpenter,Professionals,The Sanctuary at Briarcliff,Brasilia,Distrito Federal,Construction Site,Residential
8047,33084,2023-01-01,196751,3,1,Alaina Willis,F,30,Carpenter,Professionals,The Sanctuary at Briarcliff,Brasilia,Distrito Federal,Construction Site,Residential


# Exercise 2: Cleaning Data for Consistency and Quality

In [10]:
# 2. Remove all unwanted spaces from all text columns using TRIM

trim_query = """
UPDATE consolidated_data
SET
    employee_id = TRIM(employee_id),
    formatted_date = TRIM(formatted_date),
    func_code = TRIM(func_code),
    comp_code = TRIM(comp_code),
    employee_name = TRIM(employee_name),
    gender = TRIM(gender),
    function = TRIM(function),
    function_group = TRIM(function_group),
    company_name = TRIM(company_name),
    company_city = TRIM(company_city),
    company_state = TRIM(company_state),
    company_type = TRIM(company_type),
    const_site_category = TRIM(const_site_category);
"""

with engine.connect() as connection:
    with connection.begin():
        connection.execute(text(trim_query))

pd.read_sql('SELECT * FROM consolidated_data', engine)

Unnamed: 0,employee_id,formatted_date,salary,func_code,comp_code,employee_name,gender,age,function,function_group,company_name,company_city,company_state,company_type,const_site_category
0,26193,2022-01-01,633556,25,83,Jacob Smith,M,38,Lawyer,Managers,DM Company Head Quarters,Goiania,GOIAS,Administration,
1,25322,2022-01-01,261935,202,83,Michael Johnson,M,30,Tax and Accounting Analyst,Administration,DM Company Head Quarters,Goiania,GOIAS,Administration,
2,27602,2022-01-01,122167,109,83,Matthew Williams,M,28,Senior Payroll Department Analyst,Administration,DM Company Head Quarters,Goiania,GOIAS,Administration,
3,27127,2022-01-01,4000,13,83,Joshua Brown,M,25,Engineering Assistant,Engineering,DM Company Head Quarters,Goiania,GOIAS,Administration,
4,23007,2022-01-01,301252,238,83,Christopher Jones,M,25,Financial Analyst,Administration,DM Company Head Quarters,Goiania,GOIAS,Administration,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8044,32436,2023-01-01,185416,3,1,Ana Stewart,F,34,Carpenter,Professionals,The Sanctuary at Briarcliff,Brasilia,Distrito Federal,Construction Site,Residential
8045,28178,2023-01-01,183182,3,1,Rosa Richards,F,20,Carpenter,Professionals,The Sanctuary at Briarcliff,Brasilia,Distrito Federal,Construction Site,Residential
8046,28807,2023-01-01,1870,3,1,Shawn Cole,M,34,Carpenter,Professionals,The Sanctuary at Briarcliff,Brasilia,Distrito Federal,Construction Site,Residential
8047,33084,2023-01-01,196751,3,1,Alaina Willis,F,30,Carpenter,Professionals,The Sanctuary at Briarcliff,Brasilia,Distrito Federal,Construction Site,Residential


In [11]:
# 3. Check for NULL values and empty values.

check_null_query = """
SELECT *
FROM consolidated_data
WHERE employee_id IS NULL
OR formatted_date is NULL
OR salary IS NULL
OR func_code IS NULL
OR comp_code IS NULL
OR employee_name IS NULL
OR gender IS NULL
or age IS NULL
OR function IS NULL
OR function_group IS NULL
OR company_name IS NULL
OR company_city IS NULL
OR company_state IS NULL
OR company_type IS NULL
OR const_site_category IS NULL
;
"""

pd.read_sql(check_null_query, con=engine.connect())

Unnamed: 0,employee_id,formatted_date,salary,func_code,comp_code,employee_name,gender,age,function,function_group,company_name,company_city,company_state,company_type,const_site_category
0,26193,2022-01-01,633556,25,83,Jacob Smith,M,38,Lawyer,Managers,DM Company Head Quarters,Goiania,GOIAS,Administration,
1,25322,2022-01-01,261935,202,83,Michael Johnson,M,30,Tax and Accounting Analyst,Administration,DM Company Head Quarters,Goiania,GOIAS,Administration,
2,27602,2022-01-01,122167,109,83,Matthew Williams,M,28,Senior Payroll Department Analyst,Administration,DM Company Head Quarters,Goiania,GOIAS,Administration,
3,27127,2022-01-01,4000,13,83,Joshua Brown,M,25,Engineering Assistant,Engineering,DM Company Head Quarters,Goiania,GOIAS,Administration,
4,23007,2022-01-01,301252,238,83,Christopher Jones,M,25,Financial Analyst,Administration,DM Company Head Quarters,Goiania,GOIAS,Administration,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056,30754,2023-01-01,,236,2,Izaiah Bond,M,27,Painter B,Professionals,The Glades at Maplewood,Goiania,GOIAS,Construction Site,Commercial
1057,24877,2023-01-01,,7,2,Zackary Matthews,M,28,General Services Assistant,Assistants,The Glades at Maplewood,Goiania,GOIAS,Construction Site,Commercial
1058,32729,2023-01-01,,7,2,Julie Henderson,F,23,General Services Assistant,Assistants,The Glades at Maplewood,Goiania,GOIAS,Construction Site,Commercial
1059,30111,2023-01-01,,7,2,Arturo Fuller,M,25,General Services Assistant,Assistants,The Glades at Maplewood,Goiania,GOIAS,Construction Site,Commercial


In [12]:
delete_missing_query = """
DELETE FROM consolidated_data
WHERE employee_id IS NULL
OR formatted_date is NULL
OR salary IS NULL
OR func_code IS NULL
OR comp_code IS NULL
OR employee_name IS NULL
OR gender IS NULL
or age IS NULL
OR function IS NULL
OR function_group IS NULL
OR company_name IS NULL
OR company_city IS NULL
OR company_state IS NULL
OR company_type IS NULL
OR const_site_category IS NULL
"""

with engine.connect() as connection:
    with connection.begin():  # Начало транзакции
        connection.execute(text(delete_missing_query))

pd.read_sql(check_null_query, con=engine.connect())

Unnamed: 0,employee_id,formatted_date,salary,func_code,comp_code,employee_name,gender,age,function,function_group,company_name,company_city,company_state,company_type,const_site_category


# Exercise 3 : Calculating Current Employee Counts by Company

In [13]:
# SQL query to calculate current employee counts by company
employee_count_query = """
SELECT 
    company_name,
    COUNT(DISTINCT employee_id) AS employee_count
FROM consolidated_data
GROUP BY company_name
ORDER BY employee_count DESC;
"""

# Execute the query and fetch the result into a Pandas DataFrame
with engine.connect() as connection:
    employee_counts = pd.read_sql(employee_count_query, connection)

employee_counts

Unnamed: 0,company_name,employee_count
0,The Crossings at Falcon Point,252
1,The Parkview at Golden Gate,169
2,The Pines at Windward,156
3,Regional Hospital,155
4,The Terraces at Diamond Heights,100
5,The Glades at Maplewood,91
6,The Meadows at Sunset Ridge,86
7,The Greens at Fairway Hills,68
8,The Sanctuary at Briarcliff,53
9,The Oasis at Desert Springs,43


# Exercise 4 : Analyzing Employee Distribution by City and Over Time
- What is the total number of employees each city? Add a percentage column
- What is the total number of employees each month?
- What is the average number of employees each month?

In [14]:
# What is the total number of employees each city? Add a percentage column


pd.read_sql('''
            SELECT 
                company_city,
                COUNT(DISTINCT employee_id) AS total_employees,
                ROUND(100.0 * COUNT(DISTINCT employee_id) / SUM(COUNT(DISTINCT employee_id)) OVER (), 2) AS percentage
            FROM consolidated_data
            GROUP BY company_city
            ORDER BY total_employees DESC;''', con=engine.connect())

Unnamed: 0,company_city,total_employees,percentage
0,Goiania,622,59.52
1,Brasilia,371,35.5
2,Palmas,50,4.78
3,Goianiaa,2,0.19


In [17]:
# What is the total number of employees each month?


pd.read_sql('''
            SELECT 
                strftime('%Y-%m', formatted_date) AS month,
                COUNT(DISTINCT employee_id) AS total_employees
            FROM consolidated_data
            GROUP BY month
            ORDER BY month;''', con=engine.connect())

Unnamed: 0,month,total_employees
0,2022-01,438
1,2022-02,447
2,2022-03,451
3,2022-04,527
4,2022-05,533
5,2022-06,550
6,2022-07,525
7,2022-08,539
8,2022-09,555
9,2022-10,513


# Exercise 5 : Analyzing Employment Trends and Salary Metrics
- What is the minimum and maximum number of employees throughout all the months? In which months were they?
- What is the monthly average number of employees by function group?
- What is the annual average salary?

In [21]:
pd.read_sql('''
WITH monthly_employee_counts AS (
    SELECT 
        STRFTIME('%Y-%m', formatted_date) AS month,
        COUNT(DISTINCT employee_id) AS total_employees
    FROM consolidated_data
    GROUP BY month
),
min_max_employees AS (
    SELECT 
        MIN(total_employees) AS min_employees,
        MAX(total_employees) AS max_employees
    FROM monthly_employee_counts
)
SELECT 
    mm.min_employees,
    mm.max_employees,
    mmin.month AS min_month,
    mmax.month AS max_month
FROM min_max_employees mm
LEFT JOIN monthly_employee_counts mmin ON mm.min_employees = mmin.total_employees
LEFT JOIN monthly_employee_counts mmax ON mm.max_employees = mmax.total_employees;

''', con=engine.connect())

Unnamed: 0,min_employees,max_employees,min_month,max_month
0,438,581,2022-01,2022-11


In [26]:
# What is the monthly average number of employees by function group?

df = pd.read_sql('''
SELECT 
    STRFTIME('%Y-%m', formatted_date) AS month,  -- Extract year and month
    function_group,
    COUNT(DISTINCT employee_id) AS total_employees
FROM consolidated_data
GROUP BY month, function_group
ORDER BY month;
''', con=engine.connect())

df

Unnamed: 0,month,function_group,total_employees
0,2022-01,Administration,12
1,2022-01,Assistants,160
2,2022-01,Engineering,9
3,2022-01,Machine Operators,18
4,2022-01,Production Supervisors,23
...,...,...,...
86,2023-01,Engineering,7
87,2023-01,Machine Operators,18
88,2023-01,Production Supervisors,30
89,2023-01,Professionals,260


In [None]:
# What is the annual average salary?

pd.read_sql('''
SELECT 
    STRFTIME('%Y', formatted_date) AS year,  -- Extract the year from the date
    AVG(CAST(REPLACE(salary, ',', '.') AS REAL)) AS avg_salary  -- Convert salary to numeric and calculate average
FROM consolidated_data
GROUP BY year
ORDER BY year;
''', con=engine.connect())


Unnamed: 0,year,avg_salary
0,2022,1602.015782
1,2023,14991.707198
