In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np

# Step 2: Create Sample Employee Data (New Records)
data = {
    "Employee_ID": [11, 12, 13, 14, 15, 16, 17, 17],
    "Name": ["Alice", "Bob", "Charlie", None, "Eva", "Frank", "George", "George"],
    "Department": ["Finance", "IT", None, "HR", "IT", "Finance", None, None],
    "Salary": [72000, None, 68000, 59000, None, 75000, 62000, 62000],
    "Joining_Date": ["2022-01-10", "2021-03-15", "2020-07-20", None, "2023-06-01", "2022-09-12", "2021-11-05", "2021-11-05"]
}

df = pd.DataFrame(data)
print("Original Data:\n", df)

# Step 3: Identify Missing Values
print("\nMissing Values:\n", df.isnull().sum())

# Step 4: Fill Missing Department & Salary Values
df["Department"].fillna("Unknown", inplace=True)
df["Salary"].fillna(df["Salary"].median(), inplace=True)
print("\nAfter Filling Missing Values:\n", df)

# Step 5: Convert Joining_Date to Date Format
df["Joining_Date"] = pd.to_datetime(df["Joining_Date"])
print("\nData Types After Conversion:\n", df.dtypes)

# Step 6: Remove Duplicate Employee Records
df = df.drop_duplicates()
print("\nAfter Removing Duplicates:\n", df)

# Step 7: Rename Columns
df = df.rename(columns={"Salary": "Annual_Salary", "Joining_Date": "Start_Date"})
print("\nFinal Cleaned Employee Data:\n", df.head())


Original Data:
    Employee_ID     Name Department   Salary Joining_Date
0           11    Alice    Finance  72000.0   2022-01-10
1           12      Bob         IT      NaN   2021-03-15
2           13  Charlie       None  68000.0   2020-07-20
3           14     None         HR  59000.0         None
4           15      Eva         IT      NaN   2023-06-01
5           16    Frank    Finance  75000.0   2022-09-12
6           17   George       None  62000.0   2021-11-05
7           17   George       None  62000.0   2021-11-05

Missing Values:
 Employee_ID     0
Name            1
Department      3
Salary          2
Joining_Date    1
dtype: int64

After Filling Missing Values:
    Employee_ID     Name Department   Salary Joining_Date
0           11    Alice    Finance  72000.0   2022-01-10
1           12      Bob         IT  65000.0   2021-03-15
2           13  Charlie    Unknown  68000.0   2020-07-20
3           14     None         HR  59000.0         None
4           15      Eva         I

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Department"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Salary"].fillna(df["Salary"].median(), inplace=True)
