In [1]:
# =============================================================================
# NOTEBOOK 1: DATA PREPROCESSING
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Banner marking the start of this stage in the cell output.
banner_rule = "=" * 50
print("ðŸ“Š NOTEBOOK 1: DATA PREPROCESSING")
print(banner_rule)

# Read the HR workbook into a DataFrame.
df = pd.read_excel('/content/IBM HR Dataset.xlsx')

# Structural summary: dimensions, column names, per-column null counts.
print("Dataset Overview:")
column_names = df.columns.tolist()
null_counts = df.isnull().sum()
print(f"Shape: {df.shape}")
print(f"Columns: {column_names}")
print(f"Missing values:\n{null_counts}")

# dtype of every column as pandas inferred it from the workbook.
print("\nData Types:")
print(df.dtypes)

# Row count per value of the label column.
print("\nTarget Variable Distribution:")
attrition_counts = df['Attrition'].value_counts()
print(attrition_counts)

# Columns this notebook treats as unnecessary. errors='ignore' lets the drop
# proceed even when some of these names are absent from the loaded frame.
columns_to_drop = ['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber']
df_clean = df.drop(columns=columns_to_drop, errors='ignore')

print(f"\nAfter cleaning: {df_clean.shape}")

# Partition the remaining columns by dtype. The label column 'Attrition' is
# left out of the categorical list so it is not handled as a feature.
object_frame = df_clean.select_dtypes(include=['object'])
numeric_frame = df_clean.select_dtypes(include=[np.number])
categorical_cols = [name for name in object_frame.columns if name != 'Attrition']
numerical_cols = list(numeric_frame.columns)

print(f"\nCategorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

# Encode categorical variables
# Work on a copy so df_clean keeps the original string values.
df_encoded = df_clean.copy()
label_encoders = {}  # original column name -> fitted LabelEncoder

for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) gives the encoder uniform string input; note that it also
    # turns a missing value into the literal category 'nan'.
    df_encoded[col + '_Encoded'] = le.fit_transform(df_encoded[col].astype(str))
    label_encoders[col] = le
    # Report the number of classes the encoder actually learned. nunique()
    # skips NaN, so it would under-count for a column with missing values.
    print(f"Encoded {col}: {len(le.classes_)} categories")

# Remove original categorical columns; only the '<name>_Encoded' versions remain.
df_encoded = df_encoded.drop(columns=categorical_cols)

# Prepare features and target
# Every column except the label is a model input.
all_features = [col for col in df_encoded.columns if col != 'Attrition']
X = df_encoded[all_features]

# Map the Yes/No label to 1/0. Series.map yields NaN for any value that is not
# a key of the mapping, so stop here with a clear message instead of letting
# NaN labels flow into the stratified split below.
attrition_codes = {'Yes': 1, 'No': 0}
y = df_encoded['Attrition'].map(attrition_codes)
unmapped_mask = y.isna()
if unmapped_mask.any():
    unexpected_labels = df_encoded.loc[unmapped_mask, 'Attrition'].unique().tolist()
    raise ValueError(
        f"Unexpected 'Attrition' values (expected 'Yes' or 'No'): {unexpected_labels}"
    )

print(f"\nFinal feature set: {len(all_features)} features")
print(f"Target distribution:\n{y.value_counts()}")

# Train-test split
# 20% of the rows are held out with a fixed seed; stratify=y is passed so the
# split is stratified on the target.
split_result = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_result

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# y is 0/1, so sum / length is the share of positive (attrition) rows.
train_attrition_pct = y_train.sum() / len(y_train) * 100
print(f"Training attrition rate: {train_attrition_pct:.2f}%")
test_attrition_pct = y_test.sum() / len(y_test) * 100
print(f"Test attrition rate: {test_attrition_pct:.2f}%")

# Scale numerical features
# Distinct-value counts are measured on the training split only. Numeric
# columns with more than two distinct values are standardised; columns with
# exactly two are only counted and left unscaled.
train_cardinality = [
    (col, X_train[col].nunique())
    for col in numerical_cols
    if col in X_train.columns
]
numerical_to_scale = [col for col, n_unique in train_cardinality if n_unique > 2]
binary_cols = [col for col, n_unique in train_cardinality if n_unique == 2]

print(f"\nNumerical features to scale: {len(numerical_to_scale)}")
print(f"Binary features (not scaled): {len(binary_cols)}")

# The scaler is fitted on the training rows and then applied unchanged to the
# test rows. Copies keep X_train / X_test themselves unmodified.
scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_train_scaled[numerical_to_scale] = scaler.fit_transform(X_train[numerical_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[numerical_to_scale] = scaler.transform(X_test[numerical_to_scale])

print("âœ… Data preprocessing completed successfully!")

# Save processed data
import joblib

# File name -> object to persist. The names are relative, so the files are
# written to the current working directory, in the order listed.
artifacts = {
    'X_train_scaled.pkl': X_train_scaled,
    'X_test_scaled.pkl': X_test_scaled,
    'y_train.pkl': y_train,
    'y_test.pkl': y_test,
    'scaler.pkl': scaler,
    'label_encoders.pkl': label_encoders,
}
for filename, payload in artifacts.items():
    joblib.dump(payload, filename)

print("âœ… Processed data saved to disk!")

📊 NOTEBOOK 1: DATA PREPROCESSING
Dataset Overview:
Shape: (1470, 35)
Columns: ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
Missing values:
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber