In [None]:
import pandas as pd
edf=pd.read_csv("employee_data_raw.csv")
print("\n" + "=" *60)
print("Employee Data - Raw")
print("=" *60)

# Data Quality Checks
print("\nData Types:\n", edf.dtypes)
print("\nData set shape", edf.shape)
print("\n Memory Usage:\n", edf.memory_usage())









Employee Data - Raw

Data Types:
 Employee_ID            object
Name                   object
Age                   float64
Department             object
Position               object
Salary                float64
Join_Date              object
City                   object
Email                  object
Performance_Rating    float64
dtype: object

Data set shape (200, 10)

 Memory Usage:
 Index                  132
Employee_ID           1600
Name                  1600
Age                   1600
Department            1600
Position              1600
Salary                1600
Join_Date             1600
City                  1600
Email                 1600
Performance_Rating    1600
dtype: int64

Missing Data:
 Age                    8
Salary                12
Email                 10
Performance_Rating     7
dtype: int64


In [None]:
# Missing Data Information
missing_data = edf.isnull().sum()
missing_percentage = (missing_data / len(edf)) * 100
missing_info = pd.DataFrame({'Missing Values': missing_data, 'Percentage': missing_percentage})
print("\nMissing Data Information:\n", missing_info)



Missing Data Information:
                     Missing Values  Percentage
Employee_ID                      0         0.0
Name                             0         0.0
Age                              8         4.0
Department                       0         0.0
Position                         0         0.0
Salary                          12         6.0
Join_Date                        0         0.0
City                             0         0.0
Email                           10         5.0
Performance_Rating               7         3.5


In [10]:
# Duplicate Records
duplicate=edf.duplicated().sum()
print(f"\nNumber of Duplicate Records: {duplicate}")


Number of Duplicate Records: 0


In [11]:
# Data Range Analysis
edf.describe()

Unnamed: 0,Age,Salary,Performance_Rating
count,192.0,188.0,193.0
mean,42.703125,104451.457447,2.896373
std,13.944182,41814.762228,1.436009
min,0.0,5000.0,1.0
25%,32.0,78077.0,2.0
50%,43.0,106676.5,3.0
75%,54.0,134847.25,4.0
max,64.0,197796.0,5.0


In [None]:
# Data Cleaning
import numpy as np
df_clean=edf.copy()

#Clean name column by removing leading/trailing spaces
df_clean['Name'] = df_clean['Name'].str.strip()
df_clean['Department'] = df_clean['Department'].str.title()
df_clean['Age']=df_clean['Age'].replace(0,np.nan)
df_clean['Age']=df_clean['Age'].fillna(df_clean['Age'].median())
df_clean['Salary']=df_clean['Salary'].replace(0,np.nan)
df_clean['Salary']=df_clean['Salary'].fillna(df_clean['Salary'].mean())

missing_email=df_clean['Email'].isnull()
for idx in df_clean[missing_email].index:
    name=df_clean.loc[idx,'Name'].replace(" ",".").lower()
    df_clean.loc[idx,'Email']=f"{name}@company.com"


df_clean['Performance_Rating']=df_clean['Performance_Rating'].fillna(df_clean['Performance_Rating'].mean())





In [31]:
def age_group(age):
    if age < 30:
        return '20-29'
    elif age < 40:
        return '30-39'
    elif age < 50:
        return '40-49'
    else:
        return '50+'
    
df_clean['Age_Group']=df_clean['Age'].apply(age_group)    

def calculate_performance_level(rating):
    if rating >= 4.5:
        return 'Excellent'
    elif rating >= 3.5:
        return 'Good'
    elif rating >= 2.5:
        return 'Average'
    else:
        return 'Below Average'

df_clean['performance_level']=df_clean['Performance_Rating'].apply(calculate_performance_level)
df_clean.to_csv("employee_data_cleaned.csv",index=False)