In [3]:
import pandas as pd

data = pd.read_csv("attrition_availabledata_04.csv")

# Display the structure of the dataset
print(data.info())
print(data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2940 entries, 0 to 2939
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   hrs                      2940 non-null   float64
 1   absences                 2940 non-null   float64
 2   JobInvolvement           2940 non-null   float64
 3   PerformanceRating        2940 non-null   float64
 4   EnvironmentSatisfaction  2940 non-null   float64
 5   JobSatisfaction          2940 non-null   float64
 6   WorkLifeBalance          2940 non-null   float64
 7   Age                      2940 non-null   float64
 8   BusinessTravel           2940 non-null   object 
 9   Department               2940 non-null   object 
 10  DistanceFromHome         2940 non-null   float64
 11  Education                2940 non-null   float64
 12  EducationField           2940 non-null   object 
 13  EmployeeCount            2940 non-null   float64
 14  EmployeeID              

In [5]:
# Basic information about the data
print("Shape of the dataset:", data.shape)
print("Data Types:\n", data.dtypes.value_counts())
print("Missing Values:\n", data.isnull().sum())

# Identify constant columns
constant_cols = [col for col in data.columns if data[col].nunique() == 1]
print("Constant Columns:", constant_cols)

# Check the target variable distribution
attrition_dist = data["Attrition"].value_counts(normalize=True) * 100
print("Attrition Class Distribution:\n", attrition_dist)


Shape of the dataset: (2940, 31)
Data Types:
 float64    23
object      8
Name: count, dtype: int64
Missing Values:
 hrs                        0
absences                   0
JobInvolvement             0
PerformanceRating          0
EnvironmentSatisfaction    0
JobSatisfaction            0
WorkLifeBalance            0
Age                        0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
Attrition    

In [11]:
# Drop constant columns
data_cleaned = data.drop(columns=["EmployeeCount", "Over18", "StandardHours"])

# Separate features and target variable
X = data_cleaned.drop(columns=["Attrition"])
y = data_cleaned["Attrition"].map({"Yes": 1, "No": 0})

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Training set shape: (2352, 27)
Test set shape: (588, 27)


In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns
numerical_cols = X_train.select_dtypes(exclude=["object"]).columns

# Define preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score
import time

# Models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=4),
    "KNN": KNeighborsClassifier(),
    "Dummy": DummyClassifier(strategy="most_frequent"),
}

# Train and evaluate
results = []
for name, model in models.items():
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    
    # Measure training time
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    # Evaluate
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    bal_acc = balanced_accuracy_score(y_test, y_pred)
    
    # Save results
    results.append({
        "Model": name,
        "Accuracy": acc,
        "Balanced Accuracy": bal_acc,
        "Training Time (s)": training_time,
    })

# Display results
results_df = pd.DataFrame(results)
print(results_df)


           Model  Accuracy  Balanced Accuracy  Training Time (s)
0  Decision Tree  0.923469           0.843932           0.041796
1            KNN  0.852041           0.624240           0.007456
2          Dummy  0.828231           0.500000           0.007996
