## Part 1: Preprocessing

In [23]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [24]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [25]:
#find null values
for column in attrition_df.columns:
    print(f"Column {column} has {attrition_df[column].isnull().sum()} null values")

Column Age has 0 null values
Column Attrition has 0 null values
Column BusinessTravel has 0 null values
Column Department has 0 null values
Column DistanceFromHome has 0 null values
Column Education has 0 null values
Column EducationField has 0 null values
Column EnvironmentSatisfaction has 0 null values
Column HourlyRate has 0 null values
Column JobInvolvement has 0 null values
Column JobLevel has 0 null values
Column JobRole has 0 null values
Column JobSatisfaction has 0 null values
Column MaritalStatus has 0 null values
Column NumCompaniesWorked has 0 null values
Column OverTime has 0 null values
Column PercentSalaryHike has 0 null values
Column PerformanceRating has 0 null values
Column RelationshipSatisfaction has 0 null values
Column StockOptionLevel has 0 null values
Column TotalWorkingYears has 0 null values
Column TrainingTimesLastYear has 0 null values
Column WorkLifeBalance has 0 null values
Column YearsAtCompany has 0 null values
Column YearsInCurrentRole has 0 null values


In [26]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()
#y_df.nunique()
y_df.tail



<bound method NDFrame.tail of      Attrition              Department
0          Yes                   Sales
1           No  Research & Development
2          Yes  Research & Development
3           No  Research & Development
4           No  Research & Development
...        ...                     ...
1465        No  Research & Development
1466        No  Research & Development
1467        No  Research & Development
1468        No                   Sales
1469        No  Research & Development

[1470 rows x 2 columns]>

In [27]:
# Create a list of at least 10 column names to use as X data
columns = ['Age', 'DistanceFromHome',  'EnvironmentSatisfaction', 'HourlyRate', 'NumCompaniesWorked','YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'JobSatisfaction']
'JobSatisfaction'


# Create X_df using your selected columns
X_df = attrition_df[columns]


# Show the data types for X_df
X_df.dtypes



Age                        int64
DistanceFromHome           int64
EnvironmentSatisfaction    int64
HourlyRate                 int64
NumCompaniesWorked         int64
YearsAtCompany             int64
YearsInCurrentRole         int64
YearsSinceLastPromotion    int64
YearsWithCurrManager       int64
JobSatisfaction            int64
dtype: object

In [28]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split



In [29]:
#show column names and list values of each column
X_df.columns


Index(['Age', 'DistanceFromHome', 'EnvironmentSatisfaction', 'HourlyRate',
       'NumCompaniesWorked', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'JobSatisfaction'],
      dtype='object')

In [30]:
# Convert your X data to numeric data types however you see fit
X_df = X_df.apply(pd.to_numeric, errors='coerce')
X_df.dtypes
# Add new code cells as necessary



Age                        int64
DistanceFromHome           int64
EnvironmentSatisfaction    int64
HourlyRate                 int64
NumCompaniesWorked         int64
YearsAtCompany             int64
YearsInCurrentRole         int64
YearsSinceLastPromotion    int64
YearsWithCurrManager       int64
JobSatisfaction            int64
dtype: object

In [31]:
attrition_df['Attrition'] = attrition_df['Attrition'].replace({'Yes': 1, 'No': 0})

In [32]:
attrition_df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [33]:
# Create a StandardScale
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


# Fit the StandardScaler to the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
 
dataset = load_iris()
object= StandardScaler()
 
# Splitting the independent and dependent variables
i_data = dataset.data
response = dataset.target
 
# standardization 
scale = object.fit_transform(i_data) 
print(scale)

# Scale the training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=78)
X_train.head()


[[-9.00681170e-01  1.01900435e+00 -1.34022653e+00 -1.31544430e+00]
 [-1.14301691e+00 -1.31979479e-01 -1.34022653e+00 -1.31544430e+00]
 [-1.38535265e+00  3.28414053e-01 -1.39706395e+00 -1.31544430e+00]
 [-1.50652052e+00  9.82172869e-02 -1.28338910e+00 -1.31544430e+00]
 [-1.02184904e+00  1.24920112e+00 -1.34022653e+00 -1.31544430e+00]
 [-5.37177559e-01  1.93979142e+00 -1.16971425e+00 -1.05217993e+00]
 [-1.50652052e+00  7.88807586e-01 -1.34022653e+00 -1.18381211e+00]
 [-1.02184904e+00  7.88807586e-01 -1.28338910e+00 -1.31544430e+00]
 [-1.74885626e+00 -3.62176246e-01 -1.34022653e+00 -1.31544430e+00]
 [-1.14301691e+00  9.82172869e-02 -1.28338910e+00 -1.44707648e+00]
 [-5.37177559e-01  1.47939788e+00 -1.28338910e+00 -1.31544430e+00]
 [-1.26418478e+00  7.88807586e-01 -1.22655167e+00 -1.31544430e+00]
 [-1.26418478e+00 -1.31979479e-01 -1.34022653e+00 -1.44707648e+00]
 [-1.87002413e+00 -1.31979479e-01 -1.51073881e+00 -1.44707648e+00]
 [-5.25060772e-02  2.16998818e+00 -1.45390138e+00 -1.31544430e

Unnamed: 0,Age,DistanceFromHome,EnvironmentSatisfaction,HourlyRate,NumCompaniesWorked,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,JobSatisfaction
591,33,16,1,69,5,3,2,0,2,1
267,25,5,2,85,1,6,3,1,5,1
1236,36,13,2,96,5,2,2,2,2,1
788,28,10,3,59,3,8,7,1,7,3
1224,26,17,4,62,1,3,2,0,2,3


In [34]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(sparse=False)
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)



# Fit the encoder to the training data
enc.fit(y_train)
enc.categories_
# Add new code cells as necessary


# Create two new variables by applying the encoder
y_train_cat = enc.transform(y_train)
y_test_cat = enc.transform(y_test)
y_train_cat[:5]
# Add new code cells as necessary

# to the training and testing data






array([[0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.]])

In [35]:
# Create a OneHotEncoder for the Attrition column
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)



# Fit the encoder to the training data
enc.fit(y_train)
enc.categories_
# Add new code cells as necessary


# Create two new variables by applying the encoder
# to the training and testing data
y_train_cat = enc.transform(y_train)
y_test_cat = enc.transform(y_test)
y_train_cat[:5]
# Add new code cells as necessary





array([[0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.]])

## Create, Compile, and Train the Model

In [36]:
# Find the number of columns in the X training data
X_train.shape[1]



# Create the input layer
input_layer = layers.Input(shape=(X_train.shape[1],))



# Create at least two shared layers
shared_layer1 = layers.Dense(units=8, activation='relu')
shared_layer2 = layers.Dense(units=8, activation='relu')



In [37]:
# Create a branch for Department with a hidden layer and an output layer
department_branch = shared_layer1(input_layer)
department_branch = shared_layer2(department_branch)
department_output = layers.Dense(units=3, activation='softmax')(department_branch)


# Create the hidden layer
hidden_layer = layers.Dense(units=8, activation='relu')(input_layer)



# Create the output layer
output_layer = layers.Dense(units=1, activation='sigmoid')(hidden_layer)




In [39]:
# Create a branch for Attrition with a hidden layer and an output layer
attrition_branch = layers.Dense(units=8, activation='relu')(hidden_layer)
attrition_output = layers.Dense(units=1, activation='sigmoid')(attrition_branch)



# Create the hidden layer
hidden_layer = layers.Dense(units=8, activation='relu')(input_layer)


# Create the output layers
output_layer1 = layers.Dense(units=3, activation='softmax')(hidden_layer)   
output_layer2 = layers.Dense(units=1, activation='sigmoid')(hidden_layer)



In [40]:
# Create the model
model = Model(inputs=input_layer, outputs=[output_layer1, output_layer2])



# Compile the model
model.compile(optimizer='adam',
              loss={'dense_7': 'categorical_crossentropy', 'dense_8': 'binary_crossentropy'},
              metrics=['accuracy'])


# Summarize the model
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 10)]                 0         []                            
                                                                                                  
 dense_12 (Dense)            (None, 8)                    88        ['input_2[0][0]']             
                                                                                                  
 dense_13 (Dense)            (None, 3)                    27        ['dense_12[0][0]']            
                                                                                                  
 dense_14 (Dense)            (None, 1)                    9         ['dense_12[0][0]']            
                                                                                              

In [48]:
# Train the model
model.fit(X_train_scaled,
          {'dense_7': y_train_cat, 'dense_8': y_train},
          epochs=100,
          shuffle=True,
          verbose=2)



Epoch 1/100


ValueError: in user code:

    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/training.py", line 1151, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 248, in __call__
        y_true = self._conform_to_outputs(y_pred, y_true)
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 63, in _conform_to_outputs
        struct = map_to_output_names(outputs, self._output_names, struct)
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 819, in map_to_output_names
        raise ValueError(

    ValueError: Found unexpected losses or metrics that do not correspond to any Model output: dict_keys(['dense_7', 'dense_8']). Valid mode output names: ['dense_13', 'dense_14']. Received struct is: {'dense_7': <tf.Tensor 'IteratorGetNext:1' shape=(None, 5) dtype=float32>, 'dense_8': <tf.Tensor 'IteratorGetNext:2' shape=(None, 2) dtype=string>}.


In [46]:
# Evaluate the model with the testing data
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_cat, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")



ValueError: in user code:

    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/training.py", line 2066, in test_function  *
        return step_function(self, iterator)
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/training.py", line 2049, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/training.py", line 2037, in run_step  **
        outputs = model.test_step(data)
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/training.py", line 1919, in test_step
        self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 252, in __call__
        self.build(y_pred)
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 193, in build
        self._losses = self._conform_to_outputs(y_pred, self._losses)
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 63, in _conform_to_outputs
        struct = map_to_output_names(outputs, self._output_names, struct)
    File "/Users/ryanbusman/anaconda3/envs/dev/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 819, in map_to_output_names
        raise ValueError(

    ValueError: Found unexpected losses or metrics that do not correspond to any Model output: dict_keys(['dense_7', 'dense_8']). Valid mode output names: ['dense_13', 'dense_14']. Received struct is: {'dense_7': 'categorical_crossentropy', 'dense_8': 'binary_crossentropy'}.


In [47]:
# Print the accuracy for both department and attrition
print(f"Department Accuracy: {model_accuracy[0]}, Attrition Accuracy: {model_accuracy[1]}")



NameError: name 'model_accuracy' is not defined

# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

Is accuracy the best metric to use on this data? Why or why not?
Accuracy is appropriate for measuring Department predictions because the data has some balance, R&D (65%), Sales (30%), and HR (4%). The accuracy score of 80% seems manageable for predicting a department that may be a better fit for the employee. Also, the cost of incorrectly predicting a new department is low because you can engage the employee in a career developemnt discussion to assess their interest in switching to a new department and establish the best approach for a potential career change.
Precision may be better for measuring Attrition predictions because the data is imbalanced, No (83%), Yes (17%). This is a contributing factor to the high accuracy score of 87%. Also, there may be a high cost to incorrectly predicting that an employee is going to stay, but they actually leave. Companies make investments in their employees to build their knowledge and develop relevant skills. They want to keep them if they can.
What activation functions did you choose for your output layers, and why?
department_output: The model predicts 3 departments. I used Softmax because it is recommended for mulit-class classification.
attribution_output: The model predicts a binary Yes/No attibution. I used Sigmoid because it is recommended for binary classification.
Can you name a few ways that this model might be improved?
Add or Reduce the number of inputs
Increase the number of layers
Increase the number of neurons
Increase the number of epochs
Consider splitting this into separate models to better focus models on predicting employee attrition and departments better suited for them.