##### Setting up the environment and installing the required libraries 

In [7]:
%pip install psycopg2-binary pandas matplotlib seaborn faker sqlalchemy

Note: you may need to restart the kernel to use updated packages.


##### importing required libraries 

In [8]:
from faker import Faker
import pandas as pd
import random
from datetime import datetime

##### Generate & Populate Data

In [9]:
from datetime import date


fake = Faker()
positions = ['Software Engineer', 'Data Analyst', 'IT Support', 'DevOps Engineer', 'Cloud Architect']


start_range = date(2015, 1, 1)
end_range = date(2024, 12, 31)


employee_data = []
for _ in range(50):
    name = fake.name()
    position = random.choice(positions)
    start_date = fake.date_between(start_date=start_range, end_date=end_range)
    salary = random.randint(60000, 200000)
    employee_data.append((name, position, start_date, salary))

df_employees = pd.DataFrame(employee_data, columns=['name', 'position', 'start_date', 'salary'])
df_employees.head()

Unnamed: 0,name,position,start_date,salary
0,Makayla Irwin,Software Engineer,2023-01-30,142809
1,Joseph Brown,Software Engineer,2015-02-17,178628
2,Andrea Thomas,Software Engineer,2016-08-27,77648
3,Andrew Martin,DevOps Engineer,2020-03-10,90083
4,Amy Wilson,Data Analyst,2016-07-05,131568


##### Conneting the database 

In [10]:
import psycopg2
conn = psycopg2.connect("postgresql://neondb_owner:npg_AYSBaf8TUhF3@ep-sweet-dust-a8827t3a-pooler.eastus2.azure.neon.tech/neondb?sslmode=require")
cur = conn.cursor()


for row in employee_data:
    cur.execute("""
        INSERT INTO employees (name, position, start_date, salary)
        VALUES (%s, %s, %s, %s)
    """, row)

conn.commit()
cur.close()
conn.close()

print("50 employee records inserted into the database.")
print(df_employees.head())

50 employee records inserted into the database.
            name           position  start_date  salary
0  Makayla Irwin  Software Engineer  2023-01-30  142809
1   Joseph Brown  Software Engineer  2015-02-17  178628
2  Andrea Thomas  Software Engineer  2016-08-27   77648
3  Andrew Martin    DevOps Engineer  2020-03-10   90083
4     Amy Wilson       Data Analyst  2016-07-05  131568


#####   Data Collection: Created a neon account, created new project in that and wrote a query to create employees table and conected through connetion string. 

##### Data cleaning: using isnull() method and checking the missing values 

In [11]:
print(df_employees.head())

            name           position  start_date  salary
0  Makayla Irwin  Software Engineer  2023-01-30  142809
1   Joseph Brown  Software Engineer  2015-02-17  178628
2  Andrea Thomas  Software Engineer  2016-08-27   77648
3  Andrew Martin    DevOps Engineer  2020-03-10   90083
4     Amy Wilson       Data Analyst  2016-07-05  131568


##### Data transformation: creating a new column called years_of_service 

In [12]:

df_employees['start_year'] = pd.DatetimeIndex(df_employees['start_date']).year


df_employees['years_of_service'] = datetime.now().year - df_employees['start_year']

df_employees[['start_date', 'start_year', 'years_of_service']].head()

Unnamed: 0,start_date,start_year,years_of_service
0,2023-01-30,2023,2
1,2015-02-17,2015,10
2,2016-08-27,2016,9
3,2020-03-10,2020,5
4,2016-07-05,2016,9
