In [1]:
import os

import numpy as np
import pandas as pd

# Location of the raw data. The folder defaults to the original local
# directory but can be redirected with the HR_DATA_DIR environment variable,
# so the notebook can run on another machine without editing this cell.
DATA_DIR = os.environ.get(
    "HR_DATA_DIR",
    r"C:\Users\SKY\Desktop\JOB prep\Repostiores folders\Project-3 HR Analytics\DATA",
)
# Kept as a plain string so any later cell that treats file_path as text still works.
file_path = os.path.join(DATA_DIR, "WA_Fn-UseC_-HR-Employee-Attrition.csv")

# Load the raw CSV into the DataFrame used by every later cell.
df = pd.read_csv(file_path)

# Structural overview: (rows, columns), then per-column non-null counts and dtypes.
print("Dataset Shape:", df.shape)
df.info()


Dataset Shape: (1470, 35)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14

In [4]:
# Missing-value audit: count null cells per column, then report the totals.
null_counts = df.isna().sum()
total_missing = null_counts.sum()
n_rows, n_cols = df.shape

# Only columns that actually contain nulls are listed.
print("Null/Missing values per column:")
print(null_counts.loc[null_counts > 0])

if total_missing == 0:
    print("No missing values found in the dataset")

# Share of null cells relative to every cell in the frame (rows * columns).
missing_pct = (total_missing / (n_rows * n_cols)) * 100
print(f"\nTotal missing values: {total_missing}")
print(f"Percentage of missing data: {missing_pct:.2f}%")


Null/Missing values per column:
Series([], dtype: int64)
No missing values found in the dataset

Total missing values: 0
Percentage of missing data: 0.00%


In [6]:
# Checking For Empty Strings
# A value counts as "empty" when it is '' or consists only of whitespace
# (e.g. ' '): both look missing to a reader even though they are not NaN.
blank_counts = {}
for col in df.select_dtypes(include=['object']).columns:
    # astype(str) lets the .str accessor work even if an object column holds
    # non-string values; strip() folds whitespace-only values into ''.
    empty_count = df[col].astype(str).str.strip().eq('').sum()
    if empty_count > 0:
        blank_counts[col] = empty_count

# empty_strings stays a one-row DataFrame (column -> count of empty values),
# or a completely empty DataFrame when nothing was found.
empty_strings = pd.DataFrame([blank_counts]) if blank_counts else pd.DataFrame()
if not empty_strings.empty:
    print("Empty strings found:")
    print(empty_strings)
else:
    print("No empty strings found")

No empty strings found


In [7]:
# Duplicate audit, part 1: rows that repeat an earlier row in every column.
repeated_row_mask = df.duplicated()
duplicate_rows = repeated_row_mask.sum()
print(f"Number of duplicate rows: {duplicate_rows}")
if duplicate_rows > 0:
    print("Duplicate rows found:")
    print(df[repeated_row_mask])

# Duplicate audit, part 2: EmployeeNumber values that occur more than once.
employee_ids = df['EmployeeNumber']
duplicate_employees = employee_ids.duplicated().sum()
print(f"Duplicate employee numbers: {duplicate_employees}")
if duplicate_employees > 0:
    print("Duplicate employee numbers found:")
    # keep=False marks every occurrence, so all clashing rows are shown side by side.
    clashing_rows = df[employee_ids.duplicated(keep=False)]
    print(clashing_rows.sort_values('EmployeeNumber'))

Number of duplicate rows: 0
Duplicate employee numbers: 0


In [8]:
# CHECKING CATEGORICAL DATA CONSISTENCY
# Print the distinct labels of every object-typed column so inconsistent
# spellings stand out; for a few selected columns also print the counts per label.
distribution_cols = ['Attrition', 'Gender', 'Over18', 'OverTime']
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    unique_values = df[col].unique()
    print(f"\n{col}: {len(unique_values)} unique values")
    print(f"Values: {list(unique_values)}")
    if col in distribution_cols:
        value_counts = df[col].value_counts()
        # to_dict() yields plain Python ints, so the line reads
        # {'No': 1233, ...} rather than {'No': np.int64(1233), ...}.
        print(f"Distribution: {value_counts.to_dict()}")


Attrition: 2 unique values
Values: ['Yes', 'No']
Distribution: {'No': np.int64(1233), 'Yes': np.int64(237)}

BusinessTravel: 3 unique values
Values: ['Travel_Rarely', 'Travel_Frequently', 'Non-Travel']

Department: 3 unique values
Values: ['Sales', 'Research & Development', 'Human Resources']

EducationField: 6 unique values
Values: ['Life Sciences', 'Other', 'Medical', 'Marketing', 'Technical Degree', 'Human Resources']

Gender: 2 unique values
Values: ['Female', 'Male']
Distribution: {'Male': np.int64(882), 'Female': np.int64(588)}

JobRole: 9 unique values
Values: ['Sales Executive', 'Research Scientist', 'Laboratory Technician', 'Manufacturing Director', 'Healthcare Representative', 'Manager', 'Sales Representative', 'Research Director', 'Human Resources']

MaritalStatus: 3 unique values
Values: ['Single', 'Married', 'Divorced']

Over18: 1 unique values
Values: ['Y']
Distribution: {'Y': np.int64(1470)}

OverTime: 2 unique values
Values: ['Yes', 'No']
Distribution: {'No': np.int64(

In [10]:
# Constant-column audit: a column with exactly one distinct value is reported
# together with that value (read from its first row).
constant_cols = [name for name in df.columns if df[name].nunique() == 1]

for name in constant_cols:
    first_value = df[name].iloc[0]
    print(f"{name}: constant value = {first_value}")

if len(constant_cols) == 0:
    print("No constant columns found")

EmployeeCount: constant value = 1
Over18: constant value = Y
StandardHours: constant value = 80


In [13]:
# SAVING DATASET
# Write the checked copy next to the input file as "<input name>_checked<ext>".
# Deriving the name from file_path keeps the input and output locations in
# sync instead of repeating the absolute folder a second time.
input_root, input_ext = os.path.splitext(file_path)
output_path = f"{input_root}_checked{input_ext}"

try:
    df.to_csv(output_path, index=False)
except Exception as e:
    # Best-effort save: report the problem and let the notebook continue.
    print(f"Error saving file: {str(e)}")
else:
    # Summary is printed only after a successful write, so a failure in these
    # lines can never be reported as a save error.
    print(f"Dataset saved successfully at: {output_path}")
    print(f"Dataset shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum():,} bytes")


Dataset saved successfully at: C:\Users\SKY\Desktop\JOB prep\Repostiores folders\Project-3 HR Analytics\DATA\WA_Fn-UseC_-HR-Employee-Attrition_checked.csv
Dataset shape: (1470, 35)
Memory usage: 1,065,501 bytes
