In [1]:
# === Employee Data Analysis ===

import pandas as pd
import numpy as np
import os

def _strip_text_values(column):
    """Return ``column`` with leading/trailing whitespace removed from its strings.

    For object-dtype columns only elements that are actual ``str`` instances
    are stripped; every other element (numbers, None/NaN, ...) is passed
    through unchanged. Calling ``.str.strip()`` directly on such a column
    would replace the non-string elements with NaN.

    Parameters
    ----------
    column : pd.Series
        A text-like column (object dtype or pandas string dtype).

    Returns
    -------
    pd.Series
        A new Series; ``column`` itself is not modified.
    """
    if pd.api.types.is_object_dtype(column.dtype):
        return column.map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    # Dedicated string dtypes hold only strings / missing values, so the
    # vectorised accessor is safe here.
    return column.str.strip()


def clean_employee_data(frame):
    """Return a cleaned copy of ``frame``.

    The steps run in an order chosen so that each one sees normalised input:

    1. Strip surrounding whitespace from text columns, so rows that differ
       only by padding (``'HR '`` vs ``'HR'``) compare equal.
    2. Drop exact duplicate rows (the original index labels of the kept rows
       are preserved).
    3. Fill missing values in numeric columns with that column's mean,
       computed on the de-duplicated rows so repeated rows are not counted
       more than once.

    Parameters
    ----------
    frame : pd.DataFrame
        Raw data. It is not modified.

    Returns
    -------
    pd.DataFrame
        The cleaned data.
    """
    cleaned = frame.copy()

    # 'string' additionally selects columns stored with pandas' dedicated
    # string dtype, which an 'object'-only selection can skip.
    text_cols = cleaned.select_dtypes(include=['object', 'string']).columns
    for text_col in text_cols:
        cleaned[text_col] = _strip_text_values(cleaned[text_col])

    cleaned = cleaned.drop_duplicates()

    # NOTE(review): every numeric column is mean-filled, including
    # identifier-like ones such as 'Employee ID' if they contain gaps;
    # confirm that is intended for the real dataset.
    number_cols = cleaned.select_dtypes(include=[np.number]).columns
    if len(number_cols) > 0:
        cleaned[number_cols] = cleaned[number_cols].fillna(
            cleaned[number_cols].mean()
        )
    return cleaned


# Set the path to your actual file (update if needed)
file_path = 'Employee_data.csv'

# === Try loading your dataset ===
# isfile (rather than exists) so that a directory with this name falls back
# to the sample data instead of failing inside read_csv.
if os.path.isfile(file_path):
    employee_data = pd.read_csv(file_path)
    print("✅ Dataset loaded successfully.")
else:
    print("⚠️ File not found. Using sample data instead.")
    # Small sample frame that deliberately contains padded strings and
    # missing values so the cleaning steps below have something to do.
    data = {
        'Employee ID': [1, 2, 3, 4, 5],
        'Name': ['Alice', 'Bob ', ' Charlie', 'Diana', 'Evan'],
        'Age': [29, 34, 42, 28, np.nan],
        'Department': ['HR ', 'Finance', 'IT', 'Finance', ' HR'],
        'Salary': [60000, 70000, 80000, 75000, 62000],
        'Years at Company': [2, 4, 5, 3, 1],
        'Performance Score': [3.5, 4.0, 3.9, 4.2, np.nan],
        'Attrition': ['No', 'Yes', 'No', 'No', 'Yes']
    }
    employee_data = pd.DataFrame(data)

# === Preview dataset ===
print("\n📄 First few rows of the dataset:")
print(employee_data.head())

# === Check for missing values ===
print("\n🔍 Missing values in each column:")
print(employee_data.isnull().sum())

# === Clean: strip whitespace -> remove duplicates -> fill numeric gaps ===
before = len(employee_data)
employee_data = clean_employee_data(employee_data)
after = len(employee_data)
# Kept at module level for any later code that refers to the numeric columns.
numeric_cols = employee_data.select_dtypes(include=[np.number]).columns
print(f"\n🧹 Removed {before - after} duplicate rows.")

# === Print cleaned dataset ===
print("\n✅ Cleaned Dataset Preview:")
print(employee_data)

# === Summary ===
print("\n📊 Dataset Summary:")
print(employee_data.describe(include='all'))


⚠️ File not found. Using sample data instead.

📄 First few rows of the dataset:
   Employee ID      Name   Age Department  Salary  Years at Company  \
0            1     Alice  29.0        HR    60000                 2   
1            2      Bob   34.0    Finance   70000                 4   
2            3   Charlie  42.0         IT   80000                 5   
3            4     Diana  28.0    Finance   75000                 3   
4            5      Evan   NaN         HR   62000                 1   

   Performance Score Attrition  
0                3.5        No  
1                4.0       Yes  
2                3.9        No  
3                4.2        No  
4                NaN       Yes  

🔍 Missing values in each column:
Employee ID          0
Name                 0
Age                  1
Department           0
Salary               0
Years at Company     0
Performance Score    1
Attrition            0
dtype: int64

🧹 Removed 0 duplicate rows.

✅ Cleaned Dataset Preview:
   Empl