## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [50]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [51]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]


In [52]:
# Create a list of at least 10 column names to use as X data
selected_columns = [
    'Education',
    'Age',
    'DistanceFromHome',
    'JobSatisfaction',
    'OverTime',
    'StockOptionLevel',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsSinceLastPromotion',
    'NumCompaniesWorked'
]

# Create X_df using your selected columns
X_df = attrition_df[selected_columns]

# Show the data types for X_df
X_df.dtypes



Unnamed: 0,0
Education,int64
Age,int64
DistanceFromHome,int64
JobSatisfaction,int64
OverTime,object
StockOptionLevel,int64
WorkLifeBalance,int64
YearsAtCompany,int64
YearsSinceLastPromotion,int64
NumCompaniesWorked,int64


In [53]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)


In [54]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
# Convert "OverTime" to numeric in both training and testing sets
X_train['OverTime'] = X_train['OverTime'].map({'No': 0, 'Yes': 1}).astype(int)
X_test['OverTime'] = X_test['OverTime'].map({'No': 0, 'Yes': 1}).astype(int)

X_df.loc[:, 'OverTime'] = X_df['OverTime'].map({'No': 0, 'Yes': 1})
X_df['OverTime'].value_counts()

Unnamed: 0_level_0,count
OverTime,Unnamed: 1_level_1
0,1054
1,416


In [57]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_train_scaled = scaler.fit_transform(X_train)

# Scale the training and testing data
X_test_scaled = scaler.transform(X_test)


In [61]:
# Create a OneHotEncoder for the Department column
from sklearn.preprocessing import OneHotEncoder
department_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder to the training data
department_encoder.fit(y_train[['Department']])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_department_encoded = department_encoder.transform(y_train[['Department']])
y_test_department_encoded = department_encoder.transform(y_test[['Department']])
y_train_department_encoded


array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [66]:
# Create a OneHotEncoder for the Attrition column
attrition_encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
attrition_encoder.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_encoded = attrition_encoder.transform(y_train[['Attrition']])
y_test_encoded = attrition_encoder.transform(y_test[['Attrition']])

y_train_encoded[:]

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

## Create, Compile, and Train the Model

In [72]:
# Find the number of columns in the X training data
from tensorflow.keras.layers import Input, Dense
input_dim = X_train_scaled.shape[1]

# Create the input layer
input_layer = Input(shape=(input_dim,))

# Create at least two shared layers
shared_layer_1 = Dense(units=64, activation='relu')(input_layer)
shared_layer_2 = Dense(units=32, activation='relu')(shared_layer_1)

In [74]:
# Create a branch for Department
# with a hidden layer and an output layer
# Create the hidden layer
department_hidden_layer = Dense(units=16, activation='relu')(shared_layer_2)

# Create the output layer
department_output_layer = Dense(units=3, activation='softmax', name='department_output')(department_hidden_layer)


In [76]:
# Create a branch for Attrition
# with a hidden layer and an output layer
# Create the hidden layer
attrition_hidden_layer = Dense(units=16, activation='relu')(shared_layer_2)

# Create the output layer
attrition_output_layer = Dense(units=1, activation='sigmoid', name='attrition_output')(attrition_hidden_layer)


In [113]:
# Create the model
model = Model(inputs=input_layer, outputs=[department_output_layer, attrition_output_layer])

# Compile the model

model.compile(
    optimizer='adam',
    loss={
        'department_output': 'categorical_crossentropy',
        'attrition_output': 'binary_crossentropy'
    },
    metrics={
        'department_output': 'accuracy',
        'attrition_output': 'accuracy'
    }
)



# Summarize the model
model.summary()

In [114]:
# Train the model
history = model.fit(
    X_train_scaled,
    {'department_output': y_train_department_encoded, 'attrition_output': y_train_encoded},
    validation_data=(X_test_scaled, {'department_output': y_test_department_encoded, 'attrition_output': y_test_encoded}),
    epochs=100,
    batch_size=34,
    verbose=1
)

Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 12ms/step - attrition_output_accuracy: 0.9675 - department_output_accuracy: 0.9220 - loss: 0.3361 - val_attrition_output_accuracy: 0.8333 - val_department_output_accuracy: 0.5476 - val_loss: 3.0840
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.9754 - department_output_accuracy: 0.9159 - loss: 0.3209 - val_attrition_output_accuracy: 0.8367 - val_department_output_accuracy: 0.5510 - val_loss: 3.0648
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.9728 - department_output_accuracy: 0.9217 - loss: 0.3255 - val_attrition_output_accuracy: 0.8231 - val_department_output_accuracy: 0.5476 - val_loss: 3.0756
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.9810 - department_output_accuracy: 0.9112 - loss: 0.3529 - val_

In [116]:
# Evaluate the model with the testing data
test_results = model.evaluate(X_test_scaled, [y_test_department_encoded, y_test_encoded], verbose=1)

# Print test_results to see the output structure
print("Test Results (Raw):", test_results)
print("Length of Test Results:", len(test_results))


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8211 - department_output_accuracy: 0.5510 - loss: 4.0031 
Test Results (Raw): [3.7975194454193115, 0.819727897644043, 0.5612244606018066]
Length of Test Results: 3


In [98]:
# Evaluate the model with the testing data

test_results = model.evaluate(X_test_scaled, [y_test_department_encoded, y_test_encoded], verbose=1)

print(f"Test Results: {test_results}")
overall_loss = test_results[0]
department_output_loss = test_results[1]
attrition_output_loss = test_results[2]



[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.8371 - department_output_accuracy: 0.5484 - loss: 2.8875 
Test Results: [2.8383493423461914, 0.8333333134651184, 0.5476190447807312]


In [119]:
# Extract and print accuracy for each output
overall_loss = test_results[0]
attrition_output_accuracy = test_results[1]
department_output_accuracy = test_results[2]

print(f"Department predictions accuracy: {department_output_accuracy}")
print(f"Attrition predictions accuracy: {attrition_output_accuracy}")




Department predictions accuracy: 0.5612244606018066
Attrition predictions accuracy: 0.819727897644043


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Accuracy may not be the best metric if there's class imbalance; precision, recall, or F1-score could offer more insight.
2. Softmax was used for department prediction as it's a multi-class problem, while sigmoid was used for binary attrition prediction
3. The model could be improved with deeper layers, feature engineering, regularization, class weights, and hyperparameter tuning.