In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from cleaner import DataCleaner  #using custom DataCleaner class

warnings.filterwarnings("ignore")

In [2]:
# read the staff CSV (path is relative to the notebook's working directory)
# and preview the first five rows
df = pd.read_csv(filepath_or_buffer='Existing Staff.csv')
df.head(n=5)

Unnamed: 0,S/N,Job Title,Department,Age,Gender,Marital Status,Years of Service,Salary
0,1,HR Specialist,Human Resources,25,Male,Married,2,537.25
1,2,Project Manager,Sales & Marketing,53,Male,Single,8,154.13
2,3,Billing Specialist,IT & Software,44,Female,Married,8,368.54
3,4,Marketing Analyst,Data Analytics,37,Female,Married,7,269.92
4,5,Product Manager,IT & Software,30,Male,Single,4,131.17


In [3]:
# column dtypes, non-null counts and memory usage of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   S/N               3500 non-null   int64  
 1   Job Title         3500 non-null   object 
 2   Department        3500 non-null   object 
 3   Age               3500 non-null   int64  
 4   Gender            3500 non-null   object 
 5   Marital Status    3500 non-null   object 
 6   Years of Service  3500 non-null   int64  
 7   Salary            3500 non-null   float64
dtypes: float64(1), int64(3), object(4)
memory usage: 218.9+ KB


In [5]:
# check duplicates
# Full-row comparison. Every row carries its own 'S/N' value (it looks like a
# running serial number: 1, 2, 3, ... in the preview), so this count can only be
# non-zero when a serial number itself is repeated.
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

# Compare again with 'S/N' left out, so a staff record that was entered twice
# under two different serial numbers is also counted.
record_columns = [column for column in df.columns if column != 'S/N']
duplicates_ignoring_sn = df.duplicated(subset=record_columns)
print(f"Number of duplicate rows (ignoring S/N): {duplicates_ignoring_sn.sum()}")

Number of duplicate rows: 0


In [9]:
# DataCleaner is the project's custom cleaning class; it is imported with the
# other dependencies in the first cell of the notebook.

# instantiate the cleaner
cleaner = DataCleaner(df, verbose=True)
# clean the data
df_clean = cleaner.clean()
# generate a report
report = cleaner.report()

# sanity check: the returned frame must have the shape the report says it has
assert df_clean.shape == report['final_shape'], (
    f"cleaned frame is {df_clean.shape}, report says {report['final_shape']}"
)

# flag columns whose actual dtype differs from the dtype listed in the report,
# so a silent re-cast after the downcasting step does not go unnoticed
dtype_mismatches = {
    column: (reported, str(df_clean[column].dtype))
    for column, reported in report['type_conversions'].items()
    if column in df_clean.columns and str(df_clean[column].dtype) != reported
}
if dtype_mismatches:
    print(f"Reported vs actual dtypes differ: {dtype_mismatches}")

Starting data cleaning process...
Removed 0 duplicate rows
Cleaned text in column: Job Title
Cleaned text in column: Department
Cleaned text in column: Gender
Cleaned text in column: Marital Status
Downcasted S/N from int64 to int16
Converted Department to category dtype
Downcasted Age from int64 to int8
Converted Gender to category dtype
Converted Marital Status to category dtype
Downcasted Years of Service from int64 to int8
Downcasted Salary from float64 to float32
Capped 46 outliers in Salary to [-7.38, 706.16]

=== DATA CLEANING SUMMARY ===
Initial dimensions: (3500, 8)
Final dimensions: (3500, 8)
Missing values handled: 0 → 0

Outlier handling:
- Salary: 46 capped (bounds: [-7.38, 706.16])

Data type conversions:
- S/N: int16
- Department: category
- Age: int8
- Gender: category
- Marital Status: category
- Years of Service: int8
- Salary: float32

Data cleaning completed successfully!


In [7]:
# preview the first five cleaned rows
df_clean.head(n=5)

Unnamed: 0,S/N,Job Title,Department,Age,Gender,Marital Status,Years of Service,Salary
0,1,hr specialist,human resources,25,male,married,2,537.25
1,2,project manager,sales & marketing,53,male,single,8,154.130005
2,3,billing specialist,it & software,44,female,married,8,368.540009
3,4,marketing analyst,data analytics,37,female,married,7,269.920013
4,5,product manager,it & software,30,male,single,4,131.169998


In [10]:
# show the DataCleaner signature and docstring (constructor options such as
# outlier_method, iqr_multiplier and category_threshold)
help(DataCleaner)

Help on class DataCleaner in module cleaner:

class DataCleaner(builtins.object)
 |  DataCleaner(
 |      df: pandas.core.frame.DataFrame,
 |      drop_duplicates: bool = True,
 |      handle_missing: str = 'auto',
 |      text_cleanup: bool | list = True,
 |      numeric_outliers: bool = True,
 |      outlier_method: str = 'cap',
 |      iqr_multiplier: float = 1.5,
 |      category_threshold: int = 10,
 |      verbose: bool = True,
 |      missing_categorical_value: str = 'MISSING'
 |  )
 |
 |  Comprehensive data cleaning pipeline for pandas DataFrames using OOP approach
 |
 |  Features:
 |  - Duplicate removal
 |  - Text standardization
 |  - Missing value handling
 |  - Data type optimization
 |  - Outlier management
 |  - Detailed cleaning reports
 |
 |  Usage:
 |  >>> cleaner = DataCleaner(df)
 |  >>> cleaned_df = cleaner.clean()
 |  >>> report = cleaner.report()
 |
 |  Methods defined here:
 |
 |  __init__(
 |      self,
 |      df: pandas.core.frame.DataFrame,
 |      drop_dupl

In [11]:
# display the dict returned by cleaner.report(): shapes, duplicates removed,
# missing-value counts, dtype conversions and outlier bounds
report

{'initial_shape': (3500, 8),
 'duplicates_removed': 0,
 'missing_values_initial': np.int64(0),
 'missing_values_final': np.int64(0),
 'type_conversions': {'S/N': 'int16',
  'Department': 'category',
  'Age': 'int8',
  'Gender': 'category',
  'Marital Status': 'category',
  'Years of Service': 'int8',
  'Salary': 'float32'},
 'outliers': {'Salary': {'method': 'capped',
   'count': 46,
   'lower_bound': np.float64(-7.3799896240234375),
   'upper_bound': np.float64(706.1599884033203)}},
 'rows_removed_outliers': 0,
 'final_shape': (3500, 8)}

In [8]:
# column dtypes and memory usage after cleaning
# NOTE(review): in the output recorded below Salary is float64, while the
# cleaning log and the report list it as float32 — confirm which step changes
# the dtype back.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   S/N               3500 non-null   int16   
 1   Job Title         3500 non-null   object  
 2   Department        3500 non-null   category
 3   Age               3500 non-null   int8    
 4   Gender            3500 non-null   category
 5   Marital Status    3500 non-null   category
 6   Years of Service  3500 non-null   int8    
 7   Salary            3500 non-null   float64 
dtypes: category(3), float64(1), int16(1), int8(2), object(1)
memory usage: 79.3+ KB


In [12]:
# number of distinct departments in the cleaned data
df_clean.loc[:, 'Department'].nunique()

9