### Import Required Libraries

In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### Load dataset

In [2]:
# Read the raw employee-attrition records from the CSV next to the notebook
# and print the first five rows as a quick sanity check of the columns.
df = pd.read_csv("employee_attrition.csv")
print(df.head(n=5))

   Age   Salary  Experience  WorkLifeBalance Department  Gender Attrition
0   25  40000.0         2.0                3      Sales    Male       Yes
1   30  50000.0         5.0                4         HR  Female        No
2   45  80000.0        15.0                2         IT    Male        No
3   29      NaN         3.0                3    Finance  Female       Yes
4   38  60000.0        10.0                4      Sales  Female        No


### Explore dataset

In [3]:
# Print three quick diagnostics: per-column null counts, column dtypes,
# and the class balance of the target column.
for heading, summary in (
    ("\nMissing values:\n", df.isna().sum()),
    ("\nData types:\n", df.dtypes),
    ("\nAttrition distribution:\n", df["Attrition"].value_counts()),
):
    print(heading, summary)


Missing values:
 Age                0
Salary             1
Experience         1
WorkLifeBalance    0
Department         0
Gender             0
Attrition          0
dtype: int64

Data types:
 Age                  int64
Salary             float64
Experience         float64
WorkLifeBalance      int64
Department          object
Gender              object
Attrition           object
dtype: object

Attrition distribution:
 Attrition
No     6
Yes    4
Name: count, dtype: int64


### Handle missing values

In [5]:
# Replace missing entries in the numeric columns with that column's mean.
# fit_transform returns a float array, so every listed column ends up float.
# NOTE(review): the imputer is fitted on the whole frame, i.e. before the
# train/test split that happens later in this notebook, so rows that end up
# in the test set contribute to the imputed means. Consider fitting on the
# training rows only.
numeric_cols = ['Age', 'Salary', 'Experience']
imputer = SimpleImputer(strategy="mean")
imputed_values = imputer.fit_transform(df[numeric_cols])
df[numeric_cols] = imputed_values

print("\nDataset after Handling Missing Values:\n", df)


Dataset after Handling Missing Values:
     Age         Salary  Experience  WorkLifeBalance Department  Gender  \
0  25.0   40000.000000    2.000000                3      Sales    Male   
1  30.0   50000.000000    5.000000                4         HR  Female   
2  45.0   80000.000000   15.000000                2         IT    Male   
3  29.0   74777.777778    3.000000                3    Finance  Female   
4  38.0   60000.000000   10.000000                4      Sales  Female   
5  42.0  120000.000000   20.000000                1         IT    Male   
6  35.0   58000.000000   10.666667                3         HR  Female   
7  50.0  150000.000000   25.000000                2    Finance    Male   
8  28.0   45000.000000    4.000000                4      Sales  Female   
9  40.0   70000.000000   12.000000                3         IT    Male   

  Attrition  
0       Yes  
1        No  
2        No  
3       Yes  
4        No  
5       Yes  
6        No  
7        No  
8       Yes  
9   

### Handle categorical features

In [6]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column, so that level is represented by all of its
# sibling indicator columns being False.
categorical_cols = ['Department', 'Gender']
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
print(df)

    Age         Salary  Experience  WorkLifeBalance Attrition  Department_HR  \
0  25.0   40000.000000    2.000000                3       Yes          False   
1  30.0   50000.000000    5.000000                4        No           True   
2  45.0   80000.000000   15.000000                2        No          False   
3  29.0   74777.777778    3.000000                3       Yes          False   
4  38.0   60000.000000   10.000000                4        No          False   
5  42.0  120000.000000   20.000000                1       Yes          False   
6  35.0   58000.000000   10.666667                3        No           True   
7  50.0  150000.000000   25.000000                2        No          False   
8  28.0   45000.000000    4.000000                4       Yes          False   
9  40.0   70000.000000   12.000000                3        No          False   

   Department_IT  Department_Sales  Gender_Male  
0          False              True         True  
1          False   

### Remove outliers (IQR method for Salary & Experience)

In [7]:
# Drop rows that fall outside the 1.5 * IQR fences, one column at a time.
# The frame is filtered inside the loop, so the quartiles for the second
# column are computed on the rows that survived the first column's filter.
for col in ['Salary', 'Experience']:
    Q1, Q3 = (df[col].quantile(q) for q in (0.25, 0.75))
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    # between() is inclusive on both ends, matching >= lower and <= upper.
    df = df[df[col].between(lower_fence, upper_fence)]

### Define features and target

In [8]:
# Separate the predictors from the label and turn the label into 0/1.
target_codes = {'No': 0, 'Yes': 1}
X = df.drop(columns="Attrition")
# Encode target; any label that is not a key of target_codes maps to NaN.
y = df["Attrition"].map(target_codes)

### Train-test split (80/20)

In [10]:
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio of
# the target similar in both partitions; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining Set Size: {X_train.shape}")
print(f"Testing Set Size: {X_test.shape}")


Training Set Size: (6, 8)
Testing Set Size: (2, 8)


### StandardScaler for Salary, Experience

In [11]:
# Standardize Salary and Experience. The scaler learns its statistics from
# the training rows only and then applies them unchanged to the test rows.
standardized_cols = ['Salary', 'Experience']
scaler = StandardScaler()
train_standardized = scaler.fit_transform(X_train[standardized_cols])
X_train[standardized_cols] = train_standardized
test_standardized = scaler.transform(X_test[standardized_cols])
X_test[standardized_cols] = test_standardized

print("\nStandardized Data (First 3 rows):\n", X_train.head(3))


Standardized Data (First 3 rows):
     Age    Salary  Experience  WorkLifeBalance  Department_HR  Department_IT  \
8  28.0 -1.432938   -0.936257                4          False          False   
2  45.0  1.308208    1.535462                2          False           True   
1  30.0 -1.041346   -0.711556                4           True          False   

   Department_Sales  Gender_Male  
8              True        False  
2             False         True  
1             False        False  


### MinMaxScaler for Age

In [12]:
# Rescale Age with a min-max scaler. The range is learned from the training
# rows only and reused as-is for the test rows.
age_col = ['Age']
minmax = MinMaxScaler()
train_age_scaled = minmax.fit_transform(X_train[age_col])
X_train[age_col] = train_age_scaled
test_age_scaled = minmax.transform(X_test[age_col])
X_test[age_col] = test_age_scaled

print("\nNormalized Data (First 3 rows):\n", X_train.head(3))


Normalized Data (First 3 rows):
         Age    Salary  Experience  WorkLifeBalance  Department_HR  \
8  0.000000 -1.432938   -0.936257                4          False   
2  1.000000  1.308208    1.535462                2          False   
1  0.117647 -1.041346   -0.711556                4           True   

   Department_IT  Department_Sales  Gender_Male  
8          False              True        False  
2           True             False         True  
1          False             False        False  


### Check preprocessed data

In [13]:
# Final look at the first five rows of the fully preprocessed training features.
print("\nProcessed Training Data Sample:\n", X_train.head(n=5))


Processed Training Data Sample:
         Age    Salary  Experience  WorkLifeBalance  Department_HR  \
8  0.000000 -1.432938   -0.936257                4          False   
2  1.000000  1.308208    1.535462                2          False   
1  0.117647 -1.041346   -0.711556                4           True   
3  0.058824  0.899212   -1.160959                3          False   
4  0.588235 -0.258161    0.411953                4          False   

   Department_IT  Department_Sales  Gender_Male  
8          False              True        False  
2           True             False         True  
1          False             False        False  
3          False             False        False  
4          False              True        False  


### Save processed training set

In [14]:
# Re-attach the encoded target to the preprocessed training features and
# write the combined table to disk.
#
# pd.concat(..., axis=1) aligns its inputs on index labels. X_train and
# y_train come out of the same train_test_split call and therefore carry the
# same original row labels, so they must be concatenated as-is. Resetting the
# index of only one of them would pair features with the wrong targets and
# introduce NaN-padded rows.
processed_data = pd.concat([X_train, y_train], axis=1)

# Alignment guard: a label mismatch would grow the frame beyond X_train.
assert len(processed_data) == len(X_train), "features and target are misaligned"

# index=False: the original row labels are not written to the file.
processed_data.to_csv("employee_attrition_processed.csv", index=False)
print("\n Processed data saved as employee_attrition_processed.csv")


 Processed data saved as employee_attrition_processed.csv
