## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [3]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [4]:
# Create a list of at least 10 column names to use as X data
X_data = ['Age', 'Education', 'HourlyRate', 'JobLevel', 'MaritalStatus',
          'NumCompaniesWorked', 'PercentSalaryHike', 'TotalWorkingYears',
          'WorkLifeBalance', 'YearsAtCompany'
]

# Create X_df using your selected columns
X_df = attrition_df[X_data]

# Show the data types for X_df
X_df.dtypes


Unnamed: 0,0
Age,int64
Education,int64
HourlyRate,int64
JobLevel,int64
MaritalStatus,object
NumCompaniesWorked,int64
PercentSalaryHike,int64
TotalWorkingYears,int64
WorkLifeBalance,int64
YearsAtCompany,int64


In [5]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=1)

In [6]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
X_numeric = pd.get_dummies(X_df)
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Displaying the first 5 rows of the coverted data
X_numeric.head()

Unnamed: 0,Age,Education,HourlyRate,JobLevel,NumCompaniesWorked,PercentSalaryHike,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,41,2,94,2,8,11,8,1,6,False,False,True
1,49,1,61,2,1,23,10,3,10,False,True,False
2,37,2,92,1,6,15,7,3,0,False,False,True
3,33,4,56,1,1,11,8,3,8,False,True,False
4,27,1,40,1,9,12,6,3,2,False,True,False


In [7]:
# Checking the data type after encoding before scaling the data
X_train.dtypes

Unnamed: 0,0
Age,int64
Education,int64
HourlyRate,int64
JobLevel,int64
NumCompaniesWorked,int64
PercentSalaryHike,int64
TotalWorkingYears,int64
WorkLifeBalance,int64
YearsAtCompany,int64
MaritalStatus_Divorced,bool


In [8]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Create a OneHotEncoder for the Department column
dept_OHE = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder to the training data
dept_ecoded_train = dept_OHE.fit_transform(y_train[['Department']])

# Create two new variables by applying the encoder
# to the training and testing data
dept_ecoded_test = dept_OHE.transform(y_test[['Department']])

dept_ecoded_train
dept_ecoded_test

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [10]:
# Create a OneHotEncoder for the Attrition column
attrition_OHE = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder to the training data
attrition_encoded_train = attrition_OHE.fit_transform(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data
attrition_encoded_test = attrition_OHE.transform(y_test[['Attrition']])

attrition_encoded_train
attrition_encoded_test


array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.

## Create, Compile, and Train the Model

In [11]:
# Find the number of columns in the X training data
X_columns = X_train_scaled.shape[1]

# Create the input layer
input_layer = layers.Input(shape=(X_columns,), name='input_features')

# Create at least two shared layers
shared_layer1 = layers.Dense(64, activation='relu', name='shared1')(input_layer)
shared_layer2 = layers.Dense(32, activation='relu', name='shared2')(shared_layer1)

In [12]:
# Create a branch for Department
# with a hidden layer and an output layer
num_classes_department = dept_ecoded_train.shape[1]

# Create the hidden layer
department_hidden_layer = layers.Dense(32, activation='relu', name='department_hidden')(shared_layer2)

# Create the output layer
department_output_layer = layers.Dense(num_classes_department, activation='softmax', name='department_output')(department_hidden_layer)


In [13]:
# Create a branch for Attrition
# with a hidden layer and an output layer
num_classes_attrition = attrition_encoded_train.shape[1]

# Create the hidden layer
attrition_hidden_layer = layers.Dense(32, activation='relu', name='attrition_hidden')(shared_layer2)

# Create the output layer
attrition_output_layer = layers.Dense(num_classes_attrition, activation='softmax', name='attrition_output')(attrition_hidden_layer)


In [15]:
# Create the model
model = Model(inputs=input_layer, outputs=[department_output_layer, attrition_output_layer])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy',
                    'attrition_output': 'categorical_crossentropy'},
              metrics={
                  'department_output': [CategoricalAccuracy(name='department_accuracy')],
                  'attrition_output': [CategoricalAccuracy(name='attrition_accuracy')]
              }
)

# Summarize the model
model.summary()

In [16]:
print(X_train.shape)             # Should be (1102, num_features)
print(dept_ecoded_train.shape)   # Should be (1102, num_classes_for_department)
print(attrition_encoded_train.shape)  # Should be (1102, num_classes_for_attrition)

(1102, 12)
(1102, 3)
(1102, 2)


In [17]:
# Train the model
history = model.fit(
    X_train_scaled,
    {
        'department_output': dept_ecoded_train,
        'attrition_output': attrition_encoded_train
    },
    epochs=100,
    batch_size=32,
    verbose=1,
    validation_data=(X_test_scaled, {'department_output': dept_ecoded_test, 'attrition_output': attrition_encoded_test})
)


Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - attrition_output_attrition_accuracy: 0.8530 - department_output_department_accuracy: 0.5858 - loss: 1.5068 - val_attrition_output_attrition_accuracy: 0.8152 - val_department_output_department_accuracy: 0.6413 - val_loss: 1.3073
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - attrition_output_attrition_accuracy: 0.8595 - department_output_department_accuracy: 0.6617 - loss: 1.1905 - val_attrition_output_attrition_accuracy: 0.8152 - val_department_output_department_accuracy: 0.6413 - val_loss: 1.2293
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - attrition_output_attrition_accuracy: 0.8530 - department_output_department_accuracy: 0.6522 - loss: 1.1706 - val_attrition_output_attrition_accuracy: 0.8098 - val_department_output_department_accuracy: 0.6413 - val_loss: 1.2233
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [18]:
# Evaluate the model with the testing data
eval_results = model.evaluate(X_test_scaled, {'department_output': dept_ecoded_test, 'attrition_output': attrition_encoded_test})
print(eval_results)

test_loss = eval_results[0]
dept_loss = eval_results[1]
attrition_loss = eval_results[2]

print(f"Test Loss: {test_loss}")
print(f"Department Loss: {dept_loss}")
print(f"Attrition Loss: {attrition_loss}")

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_attrition_accuracy: 0.7709 - department_output_department_accuracy: 0.5852 - loss: 2.7290 
[2.6916165351867676, 0.7771739363670349, 0.5896739363670349]
Test Loss: 2.6916165351867676
Department Loss: 0.7771739363670349
Attrition Loss: 0.5896739363670349


In [19]:
# Print the accuracy for both department and attrition
dept_pred = model.predict(X_test_scaled)[0]
attrition_pred = model.predict(X_test_scaled)[1]

dept_accuracy = np.mean(np.argmax(dept_pred, axis=1) == np.argmax(dept_ecoded_test, axis=1))
attrition_accuracy = np.mean(np.argmax(attrition_pred, axis=1) == np.argmax(attrition_encoded_test, axis=1))

print(f"Department Accuracy: {dept_accuracy}")
print(f"Attrition Accuracy: {attrition_accuracy}")

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Department Accuracy: 0.5896739130434783
Attrition Accuracy: 0.7771739130434783


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. No accuracy is not the best metric to use on this data. I feel that some of the columns of data have imbalances which causes a skewl in the accuracy data. Using the Precision, recall, and F1-score metrics could better for this data in order to better measure how the different columns will affect the output of the Attrition and Department target variables.
2. I chose Softmax for both the department and attrition outputs. Softmax was used for multi-class classification in the department_output to present the different probabilities and also for the three classes to add up to 1. For attrition_output, softmax was also used to get probabilities of each class. Even though the attrition_output could have been a binary representation and sigmoid could have been used but we need to see the ratio between the Yes and No results.
3.The model could be improved by getting more data. The features in the data could be modified to help with the refinement of the data versus the target variables. There could be hyperparameter tuning to best optimize the model. Finally other models could be used like RBM or CNN to see if these models could analyze the data better.