In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, f1_score

#Load data
df = pd.read_csv('cleaned_data.csv')

In [None]:
#Ensure all attributes are numeric

#Identify any other columns that are still non-numeric
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols

['department', 'salary']

In [None]:
#Inspect unique values in the columns of 'department'
df['department'].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [6]:
#Map 'department' column

department_mapping = {
    'sales': 0,
    'accounting': 1,
    'hr': 2,
    'technical': 3,
    'support': 4,
    'management': 5,
    'IT': 6,
    'product_mng': 7,
    'marketing': 8,
    'RandD': 9
}

df['department'] = df['department'].map(department_mapping)
df['department'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
#Inspect unique values in the columns of 'salary'
df['salary'].unique()

array(['low', 'medium', 'high'], dtype=object)

In [8]:
salary_mapping = {
    'low': 0,
    'medium': 1,
    'high': 2
}

df['salary'] = df['salary'].map(salary_mapping)
df['salary'].unique()

array([0, 1, 2])

In [9]:
x= df.drop('left', axis=1)
y= df['left']

#Define train/test splits
splits = [(0.85, 0.15), (0.75, 0.25), (0.65, 0.35)]
random_state = 42

#Loop over each split, train and evaluate
for train_pct, test_pct in splits:
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_pct, random_state=random_state, shuffle=True
    )

    permutation = np.random.permutation(len(x_train))
    x_train_shuffled = x_train.iloc[permutation].reset_index(drop=True)
    y_train_shuffled = y_train.iloc[permutation].reset_index(drop=True)

    #Initialize the classifier
    clf = SGDClassifier(random_state=random_state, max_iter=1000, tol=1e-3)

    #Setup an out-of-sample prediction for every training data point
    y_train_cv_pred = cross_val_predict(clf, x_train_shuffled, y_train_shuffled, cv=5, n_jobs= -1)

    #Evaluate the training set performance

    clf.fit(x_train_shuffled, y_train_shuffled)

    y_test_pred = clf.predict(x_test)

    # Calculate evaluation metrics for the test set.
    cm = confusion_matrix(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)


    # Display the results.
    print("\nTest Set Results:")
    print("Confusion Matrix:")
    print(cm)
    print("Precision: {:.4f}".format(precision))
    print("F1 Score: {:.4f}".format(f1))
    print("============================================\n")


Test Set Results:
Confusion Matrix:
[[1426  103]
 [ 220   90]]
Precision: 0.4663
F1 Score: 0.3579


Test Set Results:
Confusion Matrix:
[[2525    9]
 [ 531    0]]
Precision: 0.0000
F1 Score: 0.0000


Test Set Results:
Confusion Matrix:
[[3513   33]
 [ 740    5]]
Precision: 0.1316
F1 Score: 0.0128

