In [1]:
import pandas as pd

# Read the raw employee export and print a quick preview (shape + first rows).
df = pd.read_csv("employee_data.csv")
print("Initial shape:", df.shape)
print(df.head())

# 1. Standardize column names, one step at a time:
#    trim surrounding whitespace and lower-case the headers,
df.columns = df.columns.str.strip().str.lower()
#    turn the remaining spaces into underscores,
df.columns = df.columns.str.replace(" ", "_")
#    and drop any parentheses.
df.columns = df.columns.str.replace(r"[()]", "", regex=True)
print("\nColumns after rename:", df.columns.tolist())

# 2. Check missing values
print("\nMissing values:\n", df.isnull().sum())

# Example handling: normalise gender labels and fill missing ones with "unknown"
if "gender" in df.columns:
    # Record which rows are missing *before* casting: astype(str) turns NaN
    # into the literal text "nan", after which fillna() has nothing to fill.
    gender_missing = df["gender"].isna()
    gender = df["gender"].astype(str).str.strip().str.lower()
    gender = gender.replace({"m": "male", "f": "female"})
    # Assign the finished column back in a single step. Calling
    # fillna(inplace=True) on df["gender"] acts on a temporary Series under
    # pandas Copy-on-Write and would not update df.
    df["gender"] = gender.mask(gender_missing, "unknown")

# Example handling: coerce salary to numeric and drop rows left without one
if "salary" in df.columns:
    # errors="coerce" turns unparseable salaries into NaN, so those rows are
    # dropped together with the ones that were missing to begin with.
    df = df.assign(
        salary=pd.to_numeric(df["salary"], errors="coerce")
    ).dropna(subset=["salary"])

# 3. Remove duplicate rows
# drop_duplicates() is called without a subset, so only rows that are
# identical in every column are removed.
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"\nRemoved {before - after} duplicate rows.")

# 4. Convert data types
# NOTE(review): the output recorded for this cell shows headers such as
# `EmpID` and `StartDate`; the renaming in step 1 would turn those into
# `empid` / `startdate`, so neither guard below would match that file --
# confirm the intended column names.
if "emp_id" in df.columns:
    # Unparseable IDs become missing; the nullable Int64 dtype keeps the
    # remaining values as integers alongside those gaps.
    emp_id_numeric = pd.to_numeric(df["emp_id"], errors="coerce")
    df["emp_id"] = emp_id_numeric.astype("Int64")
if "hire_date" in df.columns:
    # Unparseable dates become NaT instead of raising.
    hire_date_parsed = pd.to_datetime(df["hire_date"], errors="coerce")
    df["hire_date"] = hire_date_parsed

print("\nData types:\n", df.dtypes)

# 5. Save cleaned file (index=False leaves the row index out of the CSV)
df.to_csv("employee_data_cleaned.csv", index=False)
print("\nCleaned file saved as employee_data_cleaned.csv")


Initial shape: (3000, 26)
   EmpID FirstName LastName  StartDate ExitDate                    Title  \
0   3427     Uriah  Bridges  20-Sep-19      NaN  Production Technician I   
1   3428     Paula    Small  11-Feb-23      NaN  Production Technician I   
2   3429    Edward     Buck  10-Dec-18      NaN       Area Sales Manager   
3   3430   Michael  Riordan  21-Jun-21      NaN       Area Sales Manager   
4   3431   Jasmine    Onque  29-Jun-19      NaN       Area Sales Manager   

        Supervisor                        ADEmail BusinessUnit EmployeeStatus  \
0     Peter Oneill    uriah.bridges@bilearner.com         CCDR         Active   
1  Renee Mccormick      paula.small@bilearner.com           EW         Active   
2   Crystal Walker      edward.buck@bilearner.com           PL         Active   
3   Rebekah Wright  michael.riordan@bilearner.com         CCDR         Active   
4        Jason Kim    jasmine.onque@bilearner.com          TNS         Active   

   ...              Division  