## Part 1: Preprocessing

In [12]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from keras.models import Model
from keras import layers
from sklearn.preprocessing import OneHotEncoder


#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [13]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [14]:
# attrition_copy = attrition_df.copy()

# attrition_copy['Attrition'] = attrition_copy['Attrition'].map({'No': 0, 'Yes': 1})


# correlation = attrition_copy.corr()
# correlation['Attrition'].sort_values(ascending=False)

In [15]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]


In [16]:
# Create a list of at least 10 column names to use as X data
selected_columns = ['Age', 'BusinessTravel', 'HourlyRate', 'DistanceFromHome', 'Education',
                     'MaritalStatus', 'JobSatisfaction', 'EnvironmentSatisfaction', 'YearsSinceLastPromotion', 'OverTime']
                    


# Create X_df using your selected columns
X_df = attrition_df[selected_columns]

# Show the data types for X_df
X_df.dtypes


Age                         int64
BusinessTravel             object
HourlyRate                  int64
DistanceFromHome            int64
Education                   int64
MaritalStatus              object
JobSatisfaction             int64
EnvironmentSatisfaction     int64
YearsSinceLastPromotion     int64
OverTime                   object
dtype: object

In [17]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=78)


In [18]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [19]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
print(X_train_scaled)

[[-0.40942386  0.12842688  0.87475435 ...  1.44758613  0.61581168
  -0.61581168]
 [-1.29889668  0.9190773  -0.50720476 ... -0.69080518  0.61581168
  -0.61581168]
 [-0.07587155  1.46264946  0.49785641 ... -0.69080518 -1.62387305
   1.62387305]
 ...
 [-0.18705565  0.42492079 -0.63283741 ... -0.69080518  0.61581168
  -0.61581168]
 [-1.41008078 -1.4034583   1.87981553 ... -0.69080518  0.61581168
  -0.61581168]
 [-0.63179206 -1.00813309 -0.00467417 ...  1.44758613 -1.62387305
   1.62387305]]


In [21]:
# Create a OneHotEncoder for the Department column
enc = OneHotEncoder(sparse=False)

# Fit the encoder to the training data
enc.fit(y_train[['Department']])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_encoded = enc.transform(y_train[['Department']])
y_test_encoded = enc.transform(y_test[['Department']])
y_train_encoded



array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [22]:
# Create a OneHotEncoder for the Attrition column
enc = OneHotEncoder(sparse=False)

# Fit the encoder to the training data
enc.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_encoded2 = enc.transform(y_train[['Attrition']])
y_test_encoded2 = enc.transform(y_test[['Attrition']])
y_train_encoded2




array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]])

## Create, Compile, and Train the Model

In [23]:
# Find the number of columns in the X training data
X_train.shape[1]

15

In [24]:
# Create the input layer
input_layer = layers.Input(shape=(X_train.shape[1],))

# Create at least two shared layers
shared_layer1 = layers.Dense(units=8, activation='relu', name='shared_layer1')(input_layer)
shared_layer2 = layers.Dense(units=8, activation='relu', name = 'shared_layer2')(shared_layer1)

In [25]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_branch = layers.Dense(units=8, activation='relu', name='department_hidden')(shared_layer2)


# Create the output layer
department_output = layers.Dense(units=3, activation='sigmoid', name='department_output')(department_branch)

In [26]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_hidden = layers.Dense(units=8, activation='relu', name='attrition_hidden')(shared_layer2)

# Create the output layer
attrition_output = layers.Dense(units=2, activation='sigmoid', name='attrition_output')(attrition_hidden)

In [27]:
# Create the model
model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'binary_crossentropy', 'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'accuracy'})

# Summarize the model
model.summary()

In [28]:
# Train the model
model.fit(X_train_scaled,
          {'department_output': y_train_encoded, 'attrition_output': y_train_encoded2},
          epochs=100,
          shuffle=True,
          verbose=2)

Epoch 1/100


35/35 - 2s - 54ms/step - attrition_output_accuracy: 0.4891 - department_output_accuracy: 0.4102 - loss: 1.4135
Epoch 2/100
35/35 - 0s - 3ms/step - attrition_output_accuracy: 0.7831 - department_output_accuracy: 0.6216 - loss: 1.2742
Epoch 3/100
35/35 - 0s - 2ms/step - attrition_output_accuracy: 0.8285 - department_output_accuracy: 0.6633 - loss: 1.1671
Epoch 4/100
35/35 - 0s - 2ms/step - attrition_output_accuracy: 0.8276 - department_output_accuracy: 0.6661 - loss: 1.0778
Epoch 5/100
35/35 - 0s - 2ms/step - attrition_output_accuracy: 0.8276 - department_output_accuracy: 0.6661 - loss: 1.0099
Epoch 6/100
35/35 - 0s - 2ms/step - attrition_output_accuracy: 0.8276 - department_output_accuracy: 0.6661 - loss: 0.9666
Epoch 7/100
35/35 - 0s - 2ms/step - attrition_output_accuracy: 0.8276 - department_output_accuracy: 0.6661 - loss: 0.9377
Epoch 8/100
35/35 - 0s - 2ms/step - attrition_output_accuracy: 0.8276 - department_output_accuracy: 0.6661 - loss: 0.9180
Epoch 9/100
35/35 - 0s - 2ms/step -

<keras.src.callbacks.history.History at 0x1d974c9d310>

In [29]:
# Evaluate the model with the testing data
attrition_accuracy, department_accuracy, total_loss = model.evaluate(
    X_test_scaled,
    {'department_output': y_test_encoded, 'attrition_output': y_test_encoded2},
    verbose=2
)

12/12 - 0s - 15ms/step - attrition_output_accuracy: 0.8886 - department_output_accuracy: 0.6168 - loss: 0.8321


In [30]:
# Print the accuracy for both department and attrition
print(f"Department Accuracy: {department_accuracy}")
print(f"Attrition Accuracy: {attrition_accuracy}")

Department Accuracy: 0.8885869383811951
Attrition Accuracy: 0.8321433067321777


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

### Answer #1 

##### My original thoughts where the following:

> From my limited understanding, I would guess that Recall would be a better metric to use for attrition if you are trying to predict who is going to leave the company. This is because you want to minimize the number of false negatives. However, I think accuracy is a good metric to use because it gives you a general idea of how well the model is performing.

##### However, Github Copilot insisted the following:
> Accuracy is not the best metric to use on this data because the data is imbalanced. The data has more negative samples than positive samples. Therefore, accuracy would not be a good metric to use because it would be biased towards the negative samples. A better metric to use would be the F1 score because it takes into account both the precision and recall of the model.

##### I was curious to see if this was true, so I decided to test it out.

``` print(attrition_df['Attrition'].value_counts()) ```

The output was the following:

``` 
Attrition
No     1233
Yes     237
```
##### I showed ChatGPT the output and was taught the following:
> Based on the counts you provided, it's clear that your dataset is indeed imbalanced. There are 1233 instances labeled as 'No' (employees who stayed) and only 237 labeled as 'Yes' (employees who left). This significant difference shows that models trained on this data might be biased towards predicting 'No' since it represents the majority class.

> Given this imbalance, using metrics like F1 score, precision, recall, or even the ROC-AUC score will provide a more accurate assessment of your model's performance, especially in terms of its ability to correctly identify the less represented class, which is 'Yes' in this case. These metrics can help you fine-tune the model to better detect the employees at risk of leaving, which is likely your goal with this analysis.

##### For me, this was a great learning experience. I learned that accuracy is not always the best metric to use, especially when the data is imbalanced. I will keep this in mind for future projects.                     

### Answer #2

For the inputs and layers, I used Relu because i thought it was a good starting point. For the output layer, I used sigmoid because it is a binary classification problem.

### Answer #3

More data is always a good way to improve the model. If i had more time, I would have tried creating a function that would try different hyperparameters to see if I could get a better model.

In [31]:
print(attrition_df['Attrition'].value_counts())

Attrition
No     1233
Yes     237
Name: count, dtype: int64
