## Part 1: Preprocessing

In [122]:
# Import our dependencies
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers
from tensorflow.keras.models import Model

# Load the employee attrition dataset from the hosted CSV and preview
# its first five rows.
attrition_df = pd.read_csv(
    filepath_or_buffer='https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv',
)
attrition_df.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [123]:
# Count the distinct values found in each column (missing values are not
# counted as a distinct value).
attrition_df.nunique(axis=0, dropna=True)

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [124]:
# Build the target frame y_df from the Attrition and Department columns,
# then preview its first five rows.
y_df = attrition_df.loc[:, ['Attrition', 'Department']]
y_df.head(5)


Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [125]:
# Column names used as the X (feature) data. MaritalStatus is the only
# non-numeric column in this selection (dtype object), so it must be
# encoded before the features can be scaled.
features = [
    'Age',
    'JobSatisfaction',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'TotalWorkingYears',
    'HourlyRate',
    'NumCompaniesWorked',
    'WorkLifeBalance',
    'MaritalStatus',
    'PerformanceRating',
]

# Create x_df from the selected columns
x_df = attrition_df[features]

# Show the data types for x_df
x_df.dtypes

Unnamed: 0,0
Age,int64
JobSatisfaction,int64
YearsAtCompany,int64
YearsInCurrentRole,int64
TotalWorkingYears,int64
HourlyRate,int64
NumCompaniesWorked,int64
WorkLifeBalance,int64
MaritalStatus,object
PerformanceRating,int64


In [126]:
# Split the features and targets into training and testing sets.
# train_test_split is already imported in the notebook's first cell, so it
# is not re-imported here. 20% of the rows are held out for testing, and
# random_state is fixed so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x_df,
    y_df,
    test_size=0.2,
    random_state=1,
)

In [127]:
# One-hot encode the MaritalStatus feature.
# sparse_output=False returns dense arrays; handle_unknown='ignore' encodes
# a category that was not seen during fitting as an all-zero row instead of
# raising an error.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the training split only, then reuse the fitted encoder for the
# test split so both splits get identical columns and nothing is learned
# from the test rows.
marital_encoded_train = encoder.fit_transform(x_train[['MaritalStatus']])
marital_encoded_test = encoder.transform(x_test[['MaritalStatus']])

# Column names generated by the encoder (MaritalStatus_<category>)
feature_names = encoder.get_feature_names_out(['MaritalStatus'])
print("\nEncoded feature names:")
print(feature_names)

# Wrap the encoded arrays in DataFrames that carry each split's original
# row index, so they line up row-for-row when concatenated column-wise
# onto x_train / x_test.
marital_encoded_train_df = pd.DataFrame(
    marital_encoded_train,
    columns=feature_names,
    index=x_train.index,
)

marital_encoded_test_df = pd.DataFrame(
    marital_encoded_test,
    columns=feature_names,
    index=x_test.index,
)


Encoded feature names:
['MaritalStatus_Divorced' 'MaritalStatus_Married' 'MaritalStatus_Single']


In [128]:
# Replace the raw MaritalStatus column with its one-hot encoded columns.
#
# The cell overwrites x_train / x_test, so it must be safe to run more than
# once: besides MaritalStatus, any encoded columns appended by a previous
# run are dropped first. errors='ignore' makes dropping an absent column a
# no-op, so the first run and every re-run produce the same frames (the
# original version raised a KeyError on the second run).
x_train = pd.concat(
    [
        x_train.drop(
            columns=['MaritalStatus', *marital_encoded_train_df.columns],
            errors='ignore',
        ),
        marital_encoded_train_df,
    ],
    axis=1,
)
x_test = pd.concat(
    [
        x_test.drop(
            columns=['MaritalStatus', *marital_encoded_test_df.columns],
            errors='ignore',
        ),
        marital_encoded_test_df,
    ],
    axis=1,
)

# Verify the combination worked
print("\nNew X_train columns (should include encoded marital status):")
print(x_train.columns)


New X_train columns (should include encoded marital status):
Index(['Age', 'JobSatisfaction', 'YearsAtCompany', 'YearsInCurrentRole',
       'TotalWorkingYears', 'HourlyRate', 'NumCompaniesWorked',
       'WorkLifeBalance', 'PerformanceRating', 'MaritalStatus_Divorced',
       'MaritalStatus_Married', 'MaritalStatus_Single'],
      dtype='object')


In [129]:
# Print the column dtypes of both splits, training split first.
for heading, frame in (
    ("Current x_train data types:", x_train),
    ("\nCurrent x_test data types:", x_test),
):
    print(heading)
    print(frame.dtypes)

Current x_train data types:
Age                         int64
JobSatisfaction             int64
YearsAtCompany              int64
YearsInCurrentRole          int64
TotalWorkingYears           int64
HourlyRate                  int64
NumCompaniesWorked          int64
WorkLifeBalance             int64
PerformanceRating           int64
MaritalStatus_Divorced    float64
MaritalStatus_Married     float64
MaritalStatus_Single      float64
dtype: object

Current x_test data types:
Age                         int64
JobSatisfaction             int64
YearsAtCompany              int64
YearsInCurrentRole          int64
TotalWorkingYears           int64
HourlyRate                  int64
NumCompaniesWorked          int64
WorkLifeBalance             int64
PerformanceRating           int64
MaritalStatus_Divorced    float64
MaritalStatus_Married     float64
MaritalStatus_Single      float64
dtype: object


In [130]:
# Standardise the features. The scaling statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(x_train)

# Scaled training data
x_train_scaled = scaler.transform(x_train)

# Scaled testing data (transformed with the training-split statistics)
x_test_scaled = scaler.transform(x_test)

In [131]:
# One-hot encode the Department target. The encoder is fitted on the
# training targets only and then used to transform both splits.
department_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
department_encoder.fit(y_train[['Department']])

department_encoded_train = department_encoder.transform(y_train[['Department']])
department_encoded_test = department_encoder.transform(y_test[['Department']])

In [132]:
# One-hot encode the Attrition target. The encoder is fitted on the
# training targets only and then used to transform both splits.
attrition_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
attrition_encoder.fit(y_train[['Attrition']])

attrition_encoded_train = attrition_encoder.transform(y_train[['Attrition']])
attrition_encoded_test = attrition_encoder.transform(y_test[['Attrition']])

## Create, Compile, and Train the Model

In [133]:
# Width of the model input: the number of columns in the scaled training data
number_input_features = x_train_scaled.shape[-1]

# Number of department classes learned by the department encoder
num_dept_categories = len(department_encoder.categories_[0])

display(number_input_features, num_dept_categories)

12

3

In [134]:
# Seed Python's `random`, NumPy and TensorFlow before the first layer is
# created, so that weight initialisation, dropout and batch shuffling are
# repeatable from one run of the notebook to the next. The value matches
# the random_state used for the train/test split.
# NOTE(review): some GPU kernels may still be non-deterministic unless
# tf.config.experimental.enable_op_determinism() is also enabled — confirm
# if bit-exact reruns are required.
tf.keras.utils.set_random_seed(1)

# Create the input layer: one value per scaled feature column
input_layer = layers.Input(shape=(number_input_features,))

# Create at least two shared layers. Each shared stage is
# Dense -> BatchNormalization -> Dropout; `x` ends up holding the output of
# the shared trunk that both prediction branches build on.
x = layers.Dense(64, activation='relu')(input_layer)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.3)(x)

x = layers.Dense(32, activation='relu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.2)(x)

In [135]:
# Department head: a 16-unit hidden layer with batch normalisation on top
# of the shared trunk, followed by a softmax with one unit per department
# class. The output layer is named 'department'; that name is the key used
# for this output's loss, metrics and targets.
dept_hidden = layers.Dense(units=16, activation='relu')(x)
dept_hidden = layers.BatchNormalization()(dept_hidden)
dept_output = layers.Dense(
    units=num_dept_categories,
    activation='softmax',
    name='department',
)(dept_hidden)

In [136]:
# Attrition head: a 16-unit hidden layer with batch normalisation on top of
# the shared trunk, followed by two sigmoid units (one per one-hot encoded
# attrition class). Each sigmoid unit is squashed independently, so the two
# outputs are not constrained to sum to 1. The output layer is named
# 'attrition'; that name is the key used for this output's loss, metrics
# and targets.
attrition_hidden = layers.Dense(units=16, activation='relu')(x)
attrition_hidden = layers.BatchNormalization()(attrition_hidden)
attrition_output = layers.Dense(
    units=2,
    activation='sigmoid',
    name='attrition',
)(attrition_hidden)

In [137]:
# Assemble the two-headed model: one shared input, two outputs
# (department first, attrition second).
model = Model(inputs=input_layer, outputs=[dept_output, attrition_output])

# Compile with one loss and one accuracy metric per output. The keys are
# the names given to the two output layers.
model.compile(
    optimizer='adam',
    loss=dict(
        department='categorical_crossentropy',
        attrition='binary_crossentropy',
    ),
    metrics=dict(
        department=['accuracy'],
        attrition=['accuracy'],
    ),
)

# Print the layer-by-layer summary
model.summary()

In [138]:
# Fit the model on the scaled training features, supplying one target
# array per named output.
# - validation_split=0.2 reserves a fifth of the training rows for
#   validation.
# - EarlyStopping watches val_loss, stops once it has failed to improve for
#   10 consecutive epochs, and restores the weights of the best epoch.
history = model.fit(
    x_train_scaled,
    dict(
        department=department_encoded_train,
        attrition=attrition_encoded_train,
    ),
    batch_size=32,
    epochs=50,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
        ),
    ],
    verbose=1,
)


Epoch 1/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - attrition_accuracy: 0.5607 - department_accuracy: 0.3445 - loss: 2.2917 - val_attrition_accuracy: 0.7373 - val_department_accuracy: 0.4703 - val_loss: 1.7026
Epoch 2/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_accuracy: 0.5998 - department_accuracy: 0.3953 - loss: 1.9695 - val_attrition_accuracy: 0.8093 - val_department_accuracy: 0.4873 - val_loss: 1.6364
Epoch 3/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - attrition_accuracy: 0.6197 - department_accuracy: 0.4367 - loss: 1.7951 - val_attrition_accuracy: 0.8178 - val_department_accuracy: 0.5593 - val_loss: 1.5677
Epoch 4/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_accuracy: 0.6979 - department_accuracy: 0.4434 - loss: 1.6710 - val_attrition_accuracy: 0.8347 - val_department_accuracy: 0.5932 - val_loss: 1.5033
Epoch 5/50
[1m30/30[0

In [139]:
# Score the trained model on the held-out test split, supplying one target
# array per named output. The returned values are kept in test_results.
test_results = model.evaluate(
    x=x_test_scaled,
    y=dict(
        department=department_encoded_test,
        attrition=attrition_encoded_test,
    ),
    verbose=1,
)

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_accuracy: 0.7835 - department_accuracy: 0.6315 - loss: 1.2984 


In [144]:
# Print the accuracy for both department and attrition
# test_results holds the values returned by model.evaluate(); index 0 is
# reported as the total loss and indices 1 and 2 as the per-output
# accuracies.
# NOTE(review): reading index 1 as attrition and index 2 as department is
# an assumption about the order Keras returns the metrics in (the model's
# outputs are declared department-first) — TODO confirm, e.g. by calling
# evaluate(..., return_dict=True) and reading the values by name.
print("\nTest Results:")
print(f"Total Loss: {test_results[0]:.4f}")
print(f"Attrition Accuracy: {test_results[1]:.4f}")
print(f"Department Accuracy: {test_results[2]:.4f}")


Test Results:
Total Loss: 1.2390
Attrition Accuracy: 0.8027
Department Accuracy: 0.6395


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

1. No, accuracy alone doesn't tell the full story, especially when we also look at our loss numbers. Looking at just the accuracy percentage can hide important problems in how our model is performing. We need additional metrics (for example precision, recall, or F1 score) to understand whether our model is actually learning meaningful patterns or just making obvious guesses. This matters most for attrition: in many companies one outcome is far more common than the other, so a model that always predicts the majority class will look 'accurate' by default.
2. For the department output (which has more than two possible classes), we used Softmax because it is good at handling multiple categories and gives us one probability per class, with the probabilities adding up to 100%. For attrition (which is just yes/no), we used Sigmoid because it squashes each output to a value between 0 and 1, which works well for these true/false type predictions.
3. First, we could do better analysis at the start to understand how our different pieces of data relate to each other - this would help us pick better features for our model to learn from. Also, since we have more "No" answers than "Yes" answers in our attrition data, we could use class weights to help balance this out and make our model pay more attention to the less common cases.

