In [1]:
import pandas as pd
from sqlalchemy import create_engine, text
from faker import Faker
import random

In [2]:
# Define the database URL
# db_url = "postgresql://postgres:postgres@localhost:5432/EmployeeManagement"
db_url = "postgresql://postgres:postgres@localhost:5432/DummyData"

In [3]:
# Create the engine object
engine = create_engine(db_url)


In [4]:
# Instantiate the Faker generator
fake = Faker()

In [5]:
# # Drop the existing tables

# with engine.connect() as connection:
#     connection.execute('DROP TABLE IF EXISTS contract_projects CASCADE')
#     connection.execute('DROP TABLE IF EXISTS work_locations CASCADE')
#     connection.execute('DROP TABLE IF EXISTS educations CASCADE')
#     connection.execute('DROP TABLE IF EXISTS clearances CASCADE')
#     connection.execute('DROP TABLE IF EXISTS employees CASCADE')
#     connection.execute('DROP TABLE IF EXISTS certifications CASCADE')

In [37]:
# Generate dummy data for contract projects
contract_projects = [{'contract_project_id': i+1, 'name': fake.job()} for i in range(10)]
contract_projects_df = pd.DataFrame(contract_projects)
contract_projects_df.to_sql('contract_projects', engine, if_exists='replace', index=False)

# Generate dummy data for work locations
work_locations = [{'work_location_id': i+1, 'name': fake.city()} for i in range(5)]
work_locations_df = pd.DataFrame(work_locations)
work_locations_df.to_sql('work_locations', engine, if_exists='replace', index=False)

# Generate dummy data for educations
educations = [{'education_id': i+1, 'level': level} for i, level in enumerate(["High School", "Bachelor's", "Master's", "Ph.D."])]
educations_df = pd.DataFrame(educations)
educations_df.to_sql('educations', engine, if_exists='replace', index=False)

# Generate dummy data for clearances
clearances = [{'clearance_id': i+1, 'level': level, 'status': status} for i, level in enumerate(["Confidential", "Secret", "Top Secret"])
              for status in ["Active", "Inactive"]]
clearances_df = pd.DataFrame(clearances)
clearances_df.to_sql('clearances', engine, if_exists='replace', index=False)

# Generate dummy data for certifications
certifications = [{'name': 'Certification 1', 'description': 'Description 1'},
                  {'name': 'Certification 2', 'description': 'Description 2'},
                  {'name': 'Certification 3', 'description': 'Description 3'},
                  # Add more certifications as needed
                 ]
certifications_df = pd.DataFrame(certifications)
certifications_df.to_sql('certifications', engine, if_exists='replace', index=False)

# Generate dummy data for employees
employees_data = []
for _ in range(150):
    contract_project_id = random.randint(1, len(contract_projects))
    work_location_id = random.randint(1, len(work_locations))
    years_of_experience = random.randint(0, 30)
    education_id = random.randint(1, len(educations))
    clearance_id = random.randint(1, len(clearances))
    origination_date = fake.date_between(start_date='-5y', end_date='today')
    reinvestigation_date = fake.date_between(start_date=origination_date, end_date='today')
    date_joined = fake.date_between(start_date='-5y', end_date='today')  # Random date joined
    terminate_date = fake.date_between(start_date=date_joined, end_date='today')  # Random terminate date
    name = fake.name()
    certification_id = random.randint(1, len(certifications))
    hubzone = random.choice(["Yes", "No"])  # Randomly assign "Yes" or "No" to HubZone column
    veterans = random.choice(["NA", "Army", "Navy", "Air Force", "Marines", "Coast Guard"])  # Randomly assign a branch or "NA" to veterans
    supervisor = fake.name()  # Random supervisor name
    job_title = fake.job()  # Random job title
    
    employees_data.append({
        'name': name,
        'contract_project_id': contract_project_id,
        'work_location_id': work_location_id,
        'years_of_experience': years_of_experience,
        'education_id': education_id,
        'clearance_id': clearance_id,
        'origination_date': origination_date,
        'reinvestigation_date': reinvestigation_date,
        'date_joined': date_joined,
        'terminate_date': terminate_date,
        'certification_id': certification_id,
        'hubzone': hubzone,
        'veterans': veterans,
        'supervisor': supervisor,
        'job_title': job_title
    })

employees_df = pd.DataFrame(employees_data)

# Add foreign key columns from related tables
employees_df['clearance_id'] = employees_df['clearance_id'].apply(lambda x: clearances_df.loc[x-1, 'clearance_id'])
employees_df['contract_project_id'] = employees_df['contract_project_id'].apply(lambda x: contract_projects_df.loc[x-1, 'contract_project_id'])
employees_df['education_id'] = employees_df['education_id'].apply(lambda x: educations_df.loc[x-1, 'education_id'])
employees_df['work_location_id'] = employees_df['work_location_id'].apply(lambda x: work_locations_df.loc[x-1, 'work_location_id'])
employees_df['certification_id'] = employees_df['certification_id'].apply(lambda x: certifications_df.loc[x-1, 'name'])

employees_df.to_sql('employees', engine, if_exists='replace', index=False)

print("Dummy data inserted successfully.")

Dummy data inserted successfully.


In [38]:
# Join employees table with related tables to retrieve the desired columns
query = text('''
    SELECT e.name AS employee_name,
           e.date_joined,
           e.terminate_date,
           e.veterans,
           e.supervisor,
           e.job_title,
           cp.name AS contract_project_name,
           wl.name AS work_location_name,
           e.years_of_experience,
           ed.level AS education_level,
           cl.level AS clearance_level,
           e.origination_date,
           e.reinvestigation_date,
           c.name AS certification_name,
           e.hubzone
    FROM employees e
    INNER JOIN contract_projects cp ON e.contract_project_id = cp.contract_project_id
    INNER JOIN work_locations wl ON e.work_location_id = wl.work_location_id
    INNER JOIN educations ed ON e.education_id = ed.education_id
    INNER JOIN clearances cl ON e.clearance_id = cl.clearance_id
    INNER JOIN certifications c ON e.certification_id = c.name
''')

# Establish a connection
with engine.connect() as connection:
    # Execute the query and fetch the results
    result = connection.execute(query)

    # Convert the result to a Pandas DataFrame
    result_df = pd.DataFrame(result.fetchall(), columns=[
        'Employee Name',
        'Date Joined',
        'Terminate Date',
        'Veterans',
        'Supervisor',
        'Job Title',
        'Contract Project',
        'Work Location',
        'Years of Experience',
        'Education Level',
        'Clearance Level',
        'Origination Date',
        'Reinvestigation Date',
        'Certification Name',
        'HubZone'
    ])

# Drop duplicate rows from the result_df DataFrame
result_df = result_df.drop_duplicates()

# Reset index
result_df.reset_index(drop=True, inplace=True)

# Display the DataFrame
print(result_df)

# Create a new table in the database with the result_df DataFrame
result_df.to_sql('employees_data', engine, if_exists='replace', index=False)

print("New table created and added to the database.")

            Employee Name Date Joined Terminate Date     Veterans  \
0        Nicole Mcfarland  2020-02-08     2020-02-20      Marines   
1             Morgan Cook  2023-03-05     2023-03-11           NA   
2             Sarah Smith  2022-04-05     2022-12-06  Coast Guard   
3           Sherry Molina  2020-11-02     2021-10-05           NA   
4             Renee Moore  2020-05-06     2020-07-15    Air Force   
..                    ...         ...            ...          ...   
145         Lindsay Dixon  2019-02-18     2020-10-24         Army   
146           Cindy Gomez  2018-09-14     2020-01-05         Navy   
147         Wayne Johnson  2020-02-06     2020-03-13  Coast Guard   
148         Ethan Stewart  2023-06-02     2023-06-26      Marines   
149  Alexander Ortega DVM  2022-01-18     2023-07-01         Navy   

           Supervisor                          Job Title  \
0         Monica Reid                 Charity fundraiser   
1       Tony Mitchell             Occupational hygi

In [39]:
# Create a new table in the database with the result_df DataFrame
result_df.to_sql('employeemanagementdb', engine, if_exists='replace', index=False)

print("New table created and added to the database.")

New table created and added to the database.


In [40]:
# Retrieve EmployeeManagementDB table:

# Query all employee names
query = "SELECT * FROM employeemanagementdb"
EmployeeManagementDB_df = pd.read_sql(query, engine)

# Display the result
EmployeeManagementDB_df

Unnamed: 0,Employee Name,Date Joined,Terminate Date,Veterans,Supervisor,Job Title,Contract Project,Work Location,Years of Experience,Education Level,Clearance Level,Origination Date,Reinvestigation Date,Certification Name,HubZone
0,Nicole Mcfarland,2020-02-08,2020-02-20,Marines,Monica Reid,Charity fundraiser,Animator,New Jamesfurt,24,Ph.D.,Confidential,2023-03-22,2023-07-02,Certification 1,Yes
1,Morgan Cook,2023-03-05,2023-03-11,,Tony Mitchell,Occupational hygienist,Animator,East Amyland,7,High School,Top Secret,2018-09-22,2021-07-27,Certification 1,Yes
2,Sarah Smith,2022-04-05,2022-12-06,Coast Guard,Sean Clark,Horticultural therapist,Animator,East Amyland,19,High School,Top Secret,2021-07-21,2022-11-25,Certification 1,No
3,Sherry Molina,2020-11-02,2021-10-05,,Scott Mejia,Chartered accountant,Animator,Jenkinsfurt,25,Bachelor's,Secret,2021-03-02,2023-06-08,Certification 3,No
4,Renee Moore,2020-05-06,2020-07-15,Air Force,Daniel Marsh,Interpreter,Animator,East Amyland,29,High School,Confidential,2019-06-01,2020-02-29,Certification 1,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,Lindsay Dixon,2019-02-18,2020-10-24,Army,Bridget Powell,Radiation protection practitioner,Forest/woodland manager,Jenkinsfurt,2,High School,Top Secret,2022-06-02,2023-03-06,Certification 3,Yes
146,Cindy Gomez,2018-09-14,2020-01-05,Navy,Jennifer Freeman,Air broker,Forest/woodland manager,New Jamesfurt,14,High School,Secret,2021-10-26,2023-02-12,Certification 1,Yes
147,Wayne Johnson,2020-02-06,2020-03-13,Coast Guard,Timothy Peters,Medical secretary,Forest/woodland manager,New Jamesfurt,4,High School,Top Secret,2021-11-02,2022-02-10,Certification 2,Yes
148,Ethan Stewart,2023-06-02,2023-06-26,Marines,Donald Church,Ophthalmologist,Forest/woodland manager,New Jamesfurt,25,High School,Confidential,2019-08-22,2019-11-19,Certification 2,Yes


In [41]:
# Save the DataFrame as a CSV file
result_df.to_csv('./Resources/employees_data.csv', index=False)

print("DataFrame saved as CSV file.")

DataFrame saved as CSV file.


# Query Examplesreset_index

## Retrieve all employees' names:

In [36]:
# Retrieve all employees' names:

# Query all employee names
query = "SELECT * FROM employees"
employees_names_df = pd.read_sql(query, engine)

# Display the result
employees_names_df

Unnamed: 0,name,contract_project_id,work_location_id,years_of_experience,education_id,clearance_id,origination_date,reinvestigation_date,date_joined,terminate_date,certification_id,hubzone,veterans,supervisor,job_title
0,Laura Lee,1,3,16,2,3,2020-03-09,2021-05-02,2022-02-11,2023-03-10,Certification 1,No,Navy,Mike Alvarado,Plant breeder/geneticist
1,Colton Gutierrez,7,1,11,4,2,2018-10-11,2019-10-19,2022-06-11,2022-11-10,Certification 3,Yes,Army,Ashley Goodwin,"Teacher, secondary school"
2,Marcus Jones,7,1,7,4,1,2023-04-05,2023-05-06,2022-10-22,2023-02-19,Certification 3,Yes,Coast Guard,Kimberly Baker,Lobbyist
3,Nathan Hunt,1,4,8,3,3,2021-11-24,2022-03-09,2021-09-24,2023-06-08,Certification 1,No,Air Force,Nina Ramirez,"Teacher, special educational needs"
4,Scott Nelson,1,2,19,2,2,2023-06-06,2023-06-26,2019-01-08,2022-06-15,Certification 1,Yes,Navy,Denise Fox,Planning and development surveyor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Thomas Schroeder,6,1,6,4,1,2020-11-04,2023-04-22,2021-08-13,2022-07-13,Certification 1,Yes,Army,Rebekah Smith,Orthoptist
996,Jennifer Garrett,10,2,17,4,2,2022-09-15,2023-01-22,2020-06-29,2023-06-12,Certification 3,Yes,Air Force,Frederick King V,Copy
997,Christine Todd,10,2,5,1,2,2019-12-23,2022-03-29,2021-07-31,2022-08-21,Certification 3,Yes,Army,Anna Vaughn,Applications developer
998,Jason Davis,5,1,11,1,2,2021-03-27,2022-09-03,2019-07-08,2019-12-24,Certification 2,Yes,Coast Guard,Stacey Ruiz,Minerals surveyor


## Retrieve the total number of employees:

In [11]:
# Retrieve the total number of employees:
query = """
SELECT COUNT(*) FROM employees;
"""

# Execute the query and fetch the data into a DataFrame
employee_count = pd.read_sql(query, engine)

# Display the result
print(employee_count)

   count
0   1000


## Retrieve all employees with their corresponding contract project details:

In [12]:
# Retrieve all employees with their corresponding contract project details:

query = """
SELECT e.name, cp.name AS contract_project_name
FROM employees e
JOIN contract_projects cp ON e.contract_project_id = cp.contract_project_id;
"""
contract_project = pd.read_sql(query, engine)

# Display the result
contract_project

Unnamed: 0,name,contract_project_name
0,Desiree Adams,"Engineer, automotive"
1,Penny Richardson,"Engineer, automotive"
2,Phillip Martinez,"Engineer, automotive"
3,Kristy Meyer PhD,"Engineer, automotive"
4,Darrell Foster,"Engineer, automotive"
...,...,...
995,Jeffrey Barnes,Careers adviser
996,Wanda Golden,Careers adviser
997,Rhonda Perez,Careers adviser
998,Steven Warren,Careers adviser


In [13]:
contract_project['contract_project_name'].unique()

array(['Engineer, automotive', 'Manufacturing engineer',
       'Medical laboratory scientific officer', 'Public house manager',
       'Conservation officer, nature', 'Site engineer',
       'Horticultural therapist', 'Airline pilot',
       'Community education officer', 'Careers adviser'], dtype=object)

## Retrieve all employees with their corresponding work location details:

In [14]:
# Retrieve all employees with their corresponding work location details:

# Query employees with more than 10 years of experience and their work locations
query = """
SELECT e.name, wl.name AS work_location_name
FROM employees e
JOIN work_locations wl ON e.work_location_id = wl.work_location_id;
"""
employee_locations = pd.read_sql(query, engine)

# Display the result
employee_locations

Unnamed: 0,name,work_location_name
0,Phillip Martinez,East Robert
1,Jason Thornton,East Robert
2,Edward Ramirez,East Robert
3,Thomas Fleming,East Robert
4,Tonya Carlson,East Robert
...,...,...
995,Joshua Flores,Lake Kimberly
996,Patricia Brown,Lake Kimberly
997,Benjamin Mason,Lake Kimberly
998,Rachel Stafford,Lake Kimberly


In [15]:
# Retrieve all employees with their corresponding education level:

query = """
SELECT e.name, ed.level AS education_level
FROM employees e
JOIN educations ed ON e.education_id = ed.education_id;
"""
education_level = pd.read_sql(query, engine)

# Display the result
education_level

Unnamed: 0,name,education_level
0,Samantha Jones,High School
1,Amy Allen,High School
2,Phillip Martinez,High School
3,Edward Ramirez,High School
4,Christopher Long,High School
...,...,...
995,Rachel Stafford,Ph.D.
996,Jeffrey Anthony,Ph.D.
997,Donald Thomas,Ph.D.
998,Andrew Doyle MD,Ph.D.


In [16]:
# Retrieve all employees with their corresponding clearance level and status:

query = """
SELECT e.name, c.level AS clearance_level, c.status AS clearance_status
FROM employees e
JOIN clearances c ON e.clearance_id = c.clearance_id;
"""
clearance_level = pd.read_sql(query, engine)

# Display the result
clearance_level

Unnamed: 0,name,clearance_level,clearance_status
0,Paige Burch,Confidential,Active
1,Crystal Henry,Confidential,Active
2,Desiree Adams,Confidential,Active
3,Samantha Jones,Confidential,Active
4,Amy Allen,Confidential,Active
...,...,...,...
1995,Patricia Brown,Top Secret,Inactive
1996,Gary Hicks,Top Secret,Inactive
1997,Dawn Chavez,Top Secret,Inactive
1998,Victoria Strong,Top Secret,Inactive


In [17]:
# Retrieve all employees who have a Top Secret clearance:

query = """
SELECT e.name
FROM employees e
JOIN clearances c ON e.clearance_id = c.clearance_id
WHERE c.level = 'Top Secret';
"""
top_clearance = pd.read_sql(query, engine)

# Display the result
top_clearance

Unnamed: 0,name
0,Dustin Rice DDS
1,Dustin Rice DDS
2,Victoria Strong
3,Victoria Strong
4,Dawn Chavez
...,...
667,Phillip Martinez
668,Brett Turner
669,Brett Turner
670,Denise Marshall


In [18]:
# Retrieve all employees who are assigned to a specific contract project:

# query = """
# SELECT e.name
# FROM employees e
# JOIN contract_projects cp ON e.contract_project_id = cp.contract_project_id
# WHERE cp.name = 'Contract Project Name';
# """


query = """
SELECT e.name
FROM employees e
JOIN contract_projects cp ON e.contract_project_id = cp.contract_project_id
WHERE cp.name = 'Microbiologist';

"""
specific_contract_project = pd.read_sql(query, engine)

# Display the result
specific_contract_project

Unnamed: 0,name


In [19]:
# Retrieve the average years of experience for employees:
query = """
SELECT AVG(years_of_experience) AS average_experience
FROM employees;
"""
average_experience = pd.read_sql(query, engine)

# Display the result
average_experience

Unnamed: 0,average_experience
0,15.455


In [20]:
# Retrieve the count of employees per education level:

query = """
SELECT ed.level AS education_level, COUNT(*) AS employee_count
FROM employees e
JOIN educations ed ON e.education_id = ed.education_id
GROUP BY ed.level;
"""
employee_education_count = pd.read_sql(query, engine)

# Display the result
employee_education_count

Unnamed: 0,education_level,employee_count
0,Ph.D.,262
1,Master's,250
2,High School,228
3,Bachelor's,260


In [21]:
# Retrieve the employees who have an active clearance:
query = """
SELECT e.name
FROM employees e
JOIN clearances c ON e.clearance_id = c.clearance_id
WHERE c.status = 'Active';
"""
active_clearance= pd.read_sql(query, engine)

# Display the result
active_clearance

Unnamed: 0,name
0,Dustin Rice DDS
1,Katie Cochran
2,Andrew Doyle MD
3,Donald Thomas
4,Jeffrey Anthony
...,...
995,Desiree Adams
996,Crystal Henry
997,Denise Marshall
998,Paige Burch


In [22]:
# Retrieve the employees who have more than 10 years of experience:

query = """
SELECT e.name, e.years_of_experience
FROM employees e
WHERE e.years_of_experience > 10;
"""
over_10_years = pd.read_sql(query, engine)

# Display the result
over_10_years

Unnamed: 0,name,years_of_experience
0,Katie Cochran,19
1,Andrew Doyle MD,20
2,Jeffrey Anthony,16
3,Victoria Strong,18
4,Samuel White,27
...,...,...
653,Sherry Smith,27
654,Desiree Adams,30
655,Crystal Henry,27
656,Denise Marshall,21


In [23]:
# Retrieve the count of employees per work location:

query = """
SELECT wl.name AS work_location, COUNT(*) AS employee_count
FROM employees e
JOIN work_locations wl ON e.work_location_id = wl.work_location_id
GROUP BY wl.name;
"""
employee_location_count = pd.read_sql(query, engine)

# Display the result
employee_location_count

Unnamed: 0,work_location,employee_count
0,East Robert,207
1,South Kirsten,176
2,Priceville,199
3,East Joelside,207
4,Lake Kimberly,211


In [24]:
# Retrieve the average years of experience per contract project:

query = """
SELECT cp.name AS contract_project, AVG(e.years_of_experience) AS average_experience
FROM employees e
JOIN contract_projects cp ON e.contract_project_id = cp.contract_project_id
GROUP BY cp.name;
"""
average_years_exp_contract = pd.read_sql(query, engine)

# Display the result
average_years_exp_contract

Unnamed: 0,contract_project,average_experience
0,Manufacturing engineer,15.603604
1,Medical laboratory scientific officer,15.61
2,Community education officer,14.940594
3,Airline pilot,16.122449
4,Site engineer,16.895349
5,Careers adviser,15.122449
6,"Conservation officer, nature",15.051546
7,Public house manager,14.77551
8,"Engineer, automotive",14.784483
9,Horticultural therapist,15.947368


In [25]:
# Retrieve the employees with their contract project, work location, and education level:

query = """
SELECT e.name, cp.name AS contract_project, wl.name AS work_location, ed.level AS education_level
FROM employees e
JOIN contract_projects cp ON e.contract_project_id = cp.contract_project_id
JOIN work_locations wl ON e.work_location_id = wl.work_location_id
JOIN educations ed ON e.education_id = ed.education_id;
"""
contract_project_location_education = pd.read_sql(query, engine)

# Display the result
contract_project_location_education

Unnamed: 0,name,contract_project,work_location,education_level
0,Tammy Johnson,Public house manager,East Joelside,High School
1,Danielle Hall,Careers adviser,Priceville,High School
2,Samantha Jones,Public house manager,East Joelside,High School
3,Heather Wiley,Medical laboratory scientific officer,East Joelside,High School
4,Cory Williams,Medical laboratory scientific officer,East Joelside,High School
...,...,...,...,...
995,Sarah Herring,Public house manager,East Joelside,Ph.D.
996,Fernando Calderon,Public house manager,Priceville,Ph.D.
997,Megan Buckley,Careers adviser,East Joelside,Ph.D.
998,James White,Manufacturing engineer,Priceville,Ph.D.
