In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, precision_score, recall_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load dataset
# The CSV location defaults to the original author's local path. Set the
# ATTRITION_CSV environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    'ATTRITION_CSV',
    '/Users/salmakaffafy/Desktop/Employee_Attrition/Employee-Attrition-Analysis-and-Turnover-Predictions/Attrition Dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv',
)
df = pd.read_csv(DATA_PATH)

# 1. Data Exploration
print(df.describe())  # Statistical summary

# Histogram for Age. Series.hist() returns a matplotlib Axes, so wrapping it in
# print() only emitted the Axes repr; draw the figure and show it instead.
age_ax = df['Age'].hist()
age_ax.set(title='Age distribution', xlabel='Age', ylabel='Count')
plt.show()

# 2. Data Cleaning

# Handle missing values: fill gaps in Age with the column mean.
# NOTE(review): the imputation mean and the income quartiles below are computed
# on the full dataset, i.e. before the train/test split further down.
imputer = SimpleImputer(strategy='mean')
age_imputed = imputer.fit_transform(df[['Age']])
df['Age'] = age_imputed

# Remove outliers in MonthlyIncome: keep only rows inside the inclusive range
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
q1, q3 = df['MonthlyIncome'].quantile([0.25, 0.75])
iqr = q3 - q1
income_lower = q1 - 1.5*iqr
income_upper = q3 + 1.5*iqr
df = df[df['MonthlyIncome'].between(income_lower, income_upper)]

# 3. Data Encoding
# One-hot encode the Department and BusinessTravel columns; every other column
# of df is carried over unchanged.
one_hot_columns = ['Department', 'BusinessTravel']
df_encoded = pd.get_dummies(df, columns=one_hot_columns)

# 4. Data Labeling
# Replace the Attrition strings with integer class codes. LabelEncoder numbers
# the classes in sorted order, so 'No' -> 0 and 'Yes' -> 1 when those are the
# only values present.
label_encoder = LabelEncoder()
attrition_codes = label_encoder.fit_transform(df['Attrition'])
df_encoded['Attrition'] = attrition_codes

# Train-test split for classification
X = df_encoded.drop('Attrition', axis=1)
y = df_encoded['Attrition']
# Stratify on the target so the train and test partitions keep the same
# Attrition class ratio; without it a random 80/20 split can leave the test
# set with a different share of positives, which distorts the per-class
# metrics (precision / recall / F1) computed on it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardization for numerical values.
# The scaler is fitted on the training partition only and then applied to the
# test partition. Only these two columns are scaled; the resulting arrays
# contain nothing else from X.
numeric_features = ['Age', 'MonthlyIncome']
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_features])
X_test_scaled = scaler.transform(X_test[numeric_features])


In [None]:
# Candidate classifiers, keyed by the display name that is used in the printed
# reports and plot titles. Insertion order determines evaluation order.
models = dict([
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression(max_iter=10000, random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Naive Bayes', GaussianNB()),
])

# Per-model metrics are collected here: {model name: {metric name: value}}
results = dict()

# Train and evaluate models
# Each model is fitted on the scaled training features, then scored on both
# partitions. Results are stored in `results[model_name]` and echoed to stdout.
for model_name, model in models.items():
    print(f'Training {model_name}...')

    # Train model
    model.fit(X_train_scaled, y_train)

    # Predictions
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Use the classes the model was fitted on as the label set everywhere, so
    # predict_proba columns line up with log_loss and the confusion matrix has
    # a fixed shape even if a partition happens to lack one class.
    class_labels = model.classes_

    # Metrics
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    train_loss = log_loss(y_train, model.predict_proba(X_train_scaled), labels=class_labels)
    test_loss = log_loss(y_test, model.predict_proba(X_test_scaled), labels=class_labels)
    # zero_division=0 yields the same 0.0 score scikit-learn would report when
    # a model predicts no positive samples, but without the
    # UndefinedMetricWarning that the default setting emits.
    f1 = f1_score(y_test, y_test_pred, zero_division=0)
    precision = precision_score(y_test, y_test_pred, zero_division=0)
    recall = recall_score(y_test, y_test_pred, zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_test_pred, labels=class_labels)

    # Scalar metrics, in the order they are reported
    scalar_metrics = {
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Train Loss': train_loss,
        'Test Loss': test_loss,
        'F1 Score': f1,
        'Precision': precision,
        'Recall': recall,
    }

    # Store results
    results[model_name] = {**scalar_metrics, 'Confusion Matrix': conf_matrix}

    # Print results
    print(f'{model_name} Results:')
    for metric_name, metric_value in scalar_metrics.items():
        print(f'{metric_name}: {metric_value:.4f}')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('\n')

# Plot Confusion Matrices
# One annotated heatmap per evaluated model, drawn on its own 6x4 figure.
for model_name, metrics in results.items():
    fig, ax = plt.subplots(figsize=(6,4))
    sns.heatmap(metrics['Confusion Matrix'], annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{model_name} - Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

# Summary
# One row per model, one column per metric. Each model's column in the
# intermediate frame mixes floats with a confusion-matrix array, so after the
# transpose every column is object dtype. infer_objects() converts the scalar
# metric columns back to float so they format consistently and work with
# numeric operations; the 'Confusion Matrix' column stays object.
summary = pd.DataFrame(results).transpose().infer_objects()
print(summary)