In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Set a seed for NumPy's random number generator so that results are reproducible (same every time you run it)
np.random.seed(42)

# Define how many rows (records) you want in your dataset
num_rows = 120

# Create a dictionary where each key represents a column in the DataFrame and the value is a list of values
dict_data = {
    # Employee_ID: A sequence of unique IDs from 1001 to 1120 (120 IDs total)
    'Employee_ID': list(range(1001, 1001 + num_rows)),

    # Name: Randomly pick 120 names from the list of 10 provided names
    'Name': np.random.choice(
        ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Hannah', 'Ian', 'Jane'],
        num_rows).tolist(),  # Convert the NumPy array to a regular Python list

    # Age: Randomly select 120 values from a list that includes:
    #  - 110 integers between 22 and 59 (inclusive)
    #  - 10 np.nan (missing values), mixed in for realism
    'Age': np.random.choice(
        list(np.append(np.random.randint(22, 60, 110), [np.nan]*10)),
        num_rows).tolist(),

    # Gender: Randomly pick from 'Male', 'Female', 'Other', and some NaN values to simulate missing gender data
    'Gender': np.random.choice(
        ['Male', 'Female', 'Other', np.nan],
        num_rows).tolist(),

    # Department: Randomly choose from the list of departments, with some NaN values to simulate missing entries
    'Department': np.random.choice(
        ['Sales', 'Marketing', 'HR', 'Finance', 'IT', np.nan],
        num_rows).tolist(),

    # City: Randomly assign one of five African cities to each employee
    'City': np.random.choice(
        ['Nairobi', 'Kampala', 'Lagos', 'Accra', 'Johannesburg'],
        num_rows).tolist(),

    # Salary: Generate 110 salary values from a normal distribution centered at 50,000
    # Add 10 np.nan values to simulate missing salary data
    # Round the numbers to 2 decimal places and convert to list
    'Salary': np.round(np.random.choice(
        np.append(np.random.normal(50000, 15000, 110), [np.nan]*10),
        num_rows), 2).tolist(),

    # Performance_Score: Generate 110 random scores between 1.0 and 5.0, add 10 NaNs
    # Round to 1 decimal place to simulate performance review scores
    'Performance_Score': np.round(np.random.choice(
        np.append(np.random.uniform(1.0, 5.0, 110), [np.nan]*10),
        num_rows), 1).tolist(),

    # Join_Date: Randomly select 120 dates from a range between Jan 1, 2015 and Dec 31, 2022
    # Convert them to datetime format and to a list
    'Join_Date': pd.to_datetime(np.random.choice(
        pd.date_range('2015-01-01', '2022-12-31'),
        num_rows)).tolist(),

    # Active: Randomly assign either True or False to represent if the employee is currently active
    'Active': np.random.choice([True, False], num_rows).tolist()
}

# Convert the dictionary of lists into a pandas DataFrame
df = pd.DataFrame(dict_data)

# Create 5 rows where all columns are NaN (completely empty rows)
# This is useful for practicing how to drop rows with all missing data
empty_rows = {col: [np.nan]*5 for col in df.columns}
empty_df = pd.DataFrame(empty_rows)

# Append the 5 empty rows to the original 120-row DataFrame
df = pd.concat([df, empty_df], ignore_index=True)

# Show the first 10 rows of the DataFrame to check how the data looks
df.head(10)

  df = pd.concat([df, empty_df], ignore_index=True)


Unnamed: 0,Employee_ID,Name,Age,Gender,Department,City,Salary,Performance_Score,Join_Date,Active
0,1001.0,Grace,38.0,Other,Sales,Nairobi,76330.11,,2017-04-12,1.0
1,1002.0,David,41.0,Female,IT,Nairobi,50078.66,2.4,2015-11-09,1.0
2,1003.0,Hannah,47.0,Other,IT,Accra,67691.6,5.0,2022-09-07,0.0
3,1004.0,Eva,27.0,Other,Sales,Nairobi,36523.78,4.1,2020-02-02,1.0
4,1005.0,Grace,51.0,Female,IT,Johannesburg,50528.95,2.3,2020-06-30,0.0
5,1006.0,Jane,53.0,Female,IT,Lagos,50078.66,4.1,2020-06-29,0.0
6,1007.0,Charlie,,Female,,Lagos,60045.09,4.0,2021-10-26,1.0
7,1008.0,Grace,27.0,Male,IT,Nairobi,78142.56,4.2,2019-11-26,1.0
8,1009.0,Hannah,51.0,,HR,Accra,30432.96,1.0,2015-11-24,0.0
9,1010.0,Eva,,Male,Finance,Accra,25903.31,3.0,2022-09-09,0.0
