In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# LOAD DATA
# Read the attrition CSV into a DataFrame; the path is named so it is easy to spot and change.
DATA_PATH = "/content/Employee_Attrition_Cleaned.csv"
data = pd.read_csv(DATA_PATH)

In [4]:
# BASIC EDA
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray
# "None" line after the summary.
data.info()
# Class balance of the target column.
print(data['Attrition'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1470 non-null   float64
 1   Attrition                 1470 non-null   object 
 2   BusinessTravel            1470 non-null   object 
 3   DailyRate                 1470 non-null   float64
 4   Department                1470 non-null   object 
 5   DistanceFromHome          1470 non-null   float64
 6   Education                 1470 non-null   float64
 7   EducationField            1470 non-null   object 
 8   EnvironmentSatisfaction   1470 non-null   float64
 9   Gender                    1470 non-null   object 
 10  HourlyRate                1470 non-null   float64
 11  JobInvolvement            1470 non-null   float64
 12  JobLevel                  1470 non-null   float64
 13  JobRole                   1470 non-null   object 
 14  JobSatis

In [5]:
# HANDLE MISSING VALUES
# Forward-fill: each missing cell takes the value from the nearest non-missing
# row above it in the same column. DataFrame.ffill() is used instead of
# fillna(method='ffill') because the `method` argument of fillna is deprecated
# and raises a FutureWarning. The result is rebound to `data` rather than
# using inplace=True, so re-running the cell is harmless.
data = data.ffill()

  data.fillna(method='ffill', inplace=True)


In [6]:
# FEATURE ENGINEERING EXAMPLE
# Derived column: YearsAtCompany minus YearsSinceLastPromotion, row by row.
data['PromotionGap'] = data['YearsAtCompany'].sub(data['YearsSinceLastPromotion'])

In [7]:
# DEFINE FEATURES & TARGET
# Features: every column except the label.
X = data.drop(columns='Attrition')
# Target: 1 where Attrition is exactly 'Yes', 0 for any other value.
y = data['Attrition'].eq('Yes').astype('int64')

In [8]:
# IDENTIFY CATEGORICAL & NUMERICAL COLUMN
# Split the feature names by dtype so each group can be given its own
# transformer. Columns of any other dtype end up in neither group.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns

In [9]:
# PREPROCESSING PIPELINE
# Numerical columns are standardised; categorical columns are one-hot encoded,
# with categories not seen during fit ignored at transform time.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [10]:
# SPLIT DATA
# Hold out 20% of the rows for testing, stratified on y, with a fixed seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [11]:
# LOGISTIC REGRESSION PIPELINE
# Two steps: the column preprocessor, then a logistic-regression classifier
# allowed up to 1000 solver iterations.
logreg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

In [12]:
# TRAIN LOGISTIC REGRESSION
# Fit on the training split, predict the held-out split, and print the
# per-class precision/recall/F1 report.
y_pred_logreg = logreg_pipeline.fit(X_train, y_train).predict(X_test)
logreg_report = classification_report(y_test, y_pred_logreg)
print("Logistic Regression Classification Report:\n", logreg_report)

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92       247
           1       0.62      0.32      0.42        47

    accuracy                           0.86       294
   macro avg       0.75      0.64      0.67       294
weighted avg       0.84      0.86      0.84       294



In [13]:
# DECISION TREE PIPELINE
# Same preprocessing step, followed by a decision tree seeded for
# reproducible fits.
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

In [14]:
# HYPERPARAMETER TUNING FOR DECISION TREE
# Candidate values for tree depth and minimum split size. The keys use the
# '<step>__<param>' form to address the pipeline's 'classifier' step.
param_grid = {
    'classifier__max_depth': [3, 5, 10, None],
    'classifier__min_samples_split': [2, 5, 10],
}

# Exhaustive search over the grid with 5-fold cross-validation, scored by F1.
grid_search = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best parameters for Decision Tree:", grid_search.best_params_)
# Evaluate the fitted search object on the held-out split.
y_pred_dt = grid_search.predict(X_test)
dt_report = classification_report(y_test, y_pred_dt)
print("Decision Tree Classification Report:\n", dt_report)

Best parameters for Decision Tree: {'classifier__max_depth': 3, 'classifier__min_samples_split': 2}
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.96      0.91       247
           1       0.48      0.21      0.29        47

    accuracy                           0.84       294
   macro avg       0.67      0.58      0.60       294
weighted avg       0.80      0.84      0.81       294

