In [3]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np

# Step 2: Create Sample Employee Attendance Data
data = {
    "Employee_ID": [201, 202, 203, 204, 205, 206, 207, 207],
    "Employee_Name": ["Alice", "Bob", "Charlie", None, "Eva", "Frank", "George", "George"],
    "Date": ["2024-08-01", "2024-08-01", "2024-08-01", "2024-08-01", None, "2024-08-02", "2024-08-02", "2024-08-02"],
    "Check_In": ["09:00", "09:15", None, "08:50", "09:05", None, "09:10", "09:10"],
    "Check_Out": ["17:00", "17:05", "16:50", None, "17:10", "17:00", None, "17:00"]
}

df = pd.DataFrame(data)
print("Original Attendance Data:\n", df)

# Step 3: Identify Missing Values
print("\nMissing Values:\n", df.isnull().sum())

# Step 4: Drop Rows with Missing Date or Employee_ID (Critical Data)
df = df.dropna(subset=["Employee_ID", "Date"])
print("\nAfter Dropping Critical Missing Values:\n", df)

# Step 5: Fill Missing Check-In and Check-Out Times
df["Check_In"].fillna("09:00", inplace=True)  # Default check-in time
df["Check_Out"].fillna("17:00", inplace=True)  # Default check-out time
df["Employee_Name"].fillna("Unknown", inplace=True)  # Fill missing employee names

print("\nAfter Filling Missing Values:\n", df)


# Step 6: Convert Date and Time Columns to Proper Data Types
df["Date"] = pd.to_datetime(df["Date"])
df["Check_In"] = pd.to_datetime(df["Date"].astype(str) + " " + df["Check_In"])
df["Check_Out"] = pd.to_datetime(df["Date"].astype(str) + " " + df["Check_Out"])

print("\nData Types After Conversion:\n", df.dtypes)

# Step 7: Remove Duplicate Records
df = df.drop_duplicates()
print("\nAfter Removing Duplicates:\n", df)

# Step 8: Rename Columns for Clarity
df = df.rename(columns={
    "Check_In": "Clock_In",
    "Check_Out": "Clock_Out"
})

print("\nFinal Cleaned Attendance Data:\n", df.head())


Original Attendance Data:
    Employee_ID Employee_Name        Date Check_In Check_Out
0          201         Alice  2024-08-01    09:00     17:00
1          202           Bob  2024-08-01    09:15     17:05
2          203       Charlie  2024-08-01     None     16:50
3          204          None  2024-08-01    08:50      None
4          205           Eva        None    09:05     17:10
5          206         Frank  2024-08-02     None     17:00
6          207        George  2024-08-02    09:10      None
7          207        George  2024-08-02    09:10     17:00

Missing Values:
 Employee_ID      0
Employee_Name    1
Date             1
Check_In         2
Check_Out        2
dtype: int64

After Dropping Critical Missing Values:
    Employee_ID Employee_Name        Date Check_In Check_Out
0          201         Alice  2024-08-01    09:00     17:00
1          202           Bob  2024-08-01    09:15     17:05
2          203       Charlie  2024-08-01     None     16:50
3          204          N

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Check_In"].fillna("09:00", inplace=True)  # Default check-in time
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Check_Out"].fillna("17:00", inplace=True)  # Default check-out time
The behavior will change in pandas 3.0. This inplace method will never work because the in