## Part 1: Preprocessing

In [105]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import random
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [106]:
# Determine the number of unique values in each column.
attrition_df.nunique()


Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [107]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]

print(y_df)


     Attrition              Department
0          Yes                   Sales
1           No  Research & Development
2          Yes  Research & Development
3           No  Research & Development
4           No  Research & Development
...        ...                     ...
1465        No  Research & Development
1466        No  Research & Development
1467        No  Research & Development
1468        No                   Sales
1469        No  Research & Development

[1470 rows x 2 columns]


In [108]:
y_df.dtypes


Unnamed: 0,0
Attrition,object
Department,object


In [109]:
# Create a list of at least 10 column names to use as X data
# Columns to exclude
exclude_columns = ['Attrition', 'Department']

# Get the list of columns excluding the ones we don't want
available_columns = [col for col in attrition_df.columns if col not in exclude_columns]

# Select 10 random columns
random_columns = random.sample(available_columns, 10)

# Create X_df using your selected columns

X_df=attrition_df[random_columns]

# Show the data types for X_df

X_df.dtypes

Unnamed: 0,0
PerformanceRating,int64
JobSatisfaction,int64
Education,int64
JobRole,object
Age,int64
HourlyRate,int64
BusinessTravel,object
EnvironmentSatisfaction,int64
WorkLifeBalance,int64
DistanceFromHome,int64


In [110]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=78)



In [111]:
# Convert your X data to numeric data types however you see fit
# Convert columns to numeric data types
X_train_scaled = X_train.apply(pd.to_numeric, errors='coerce')

X_test_scaled = X_test.apply(pd.to_numeric, errors='coerce')




In [112]:
X_train_scaled.dtypes


Unnamed: 0,0
PerformanceRating,int64
JobSatisfaction,int64
Education,int64
JobRole,float64
Age,int64
HourlyRate,int64
BusinessTravel,float64
EnvironmentSatisfaction,int64
WorkLifeBalance,int64
DistanceFromHome,int64


In [113]:
X_test_scaled.dtypes


Unnamed: 0,0
PerformanceRating,int64
JobSatisfaction,int64
Education,int64
JobRole,float64
Age,int64
HourlyRate,int64
BusinessTravel,float64
EnvironmentSatisfaction,int64
WorkLifeBalance,int64
DistanceFromHome,int64


In [114]:

#Create a StandardScaler
# Fit the StandardScaler to the training data
# Scale the training and testing data
# Now you can proceed with scaling
X_scaler = StandardScaler()
X_scaler.fit(X_train_scaled)
X_train_scaled = X_scaler.transform(X_train_scaled)
X_test_scaled = X_scaler.transform(X_test_scaled)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [115]:
## Create a OneHotEncoder for the Department column
enc = OneHotEncoder(sparse_output=False)  # or sparse=False for older versions

# Fit the encoder to the training data
encode_ytrain_df = pd.DataFrame(enc.fit_transform(y_train[['Department']]))

#Create two new variables by applying the encoder
# to the training and testing data

encode_ytest_df = pd.DataFrame(enc.transform(y_test[['Department']]))
encode_ytest_df


Unnamed: 0,0,1,2
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
363,0.0,1.0,0.0
364,0.0,1.0,0.0
365,0.0,1.0,0.0
366,0.0,1.0,0.0


In [116]:
## Create a OneHotEncoder for the Attrition column
enc2 = OneHotEncoder(sparse_output=False)  # or sparse=False for older versions

# Fit the encoder to the training data
encode_ytrain2_df = pd.DataFrame(enc2.fit_transform(y_train[['Attrition']]))

#Create two new variables by applying the encoder
# to the training and testing data

encode_ytest2_df = pd.DataFrame(enc2.transform(y_test[['Attrition']]))
encode_ytest2_df

Unnamed: 0,0,1
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,0.0,1.0
...,...,...
363,1.0,0.0
364,1.0,0.0
365,1.0,0.0
366,1.0,0.0


## Create, Compile, and Train the Model

In [99]:

# Find the number of columns in the X training data
X_train.shape[1]

# Create the input layer
number_input_features = X_train.shape[1]


# Create at least two shared layers
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="softmax"))

# Check the structure of the model
nn.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [89]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
# Department branch
department_hidden = tf.keras.layers.Dense(units=6, activation='relu', name='department_hidden')(shared_layer2)  # Hidden layer for Department


# Create the output layer
department_output = tf.keras.layers.Dense(units=3, activation='softmax', name='department_output')(department_hidden) # Output layer for Department (assuming 3 departments)

# Update the model to include the new branch
model = tf.keras.models.Model(inputs=input_layer, outputs=[branch1_output, branch2_output, department_output])


In [90]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_hidden = tf.keras.layers.Dense(units=6, activation='relu', name='attrition_hidden')(shared_layer2)

# Create the output layer
attrition_output = tf.keras.layers.Dense(units=3, activation='softmax', name='attrition_output')(department_hidden)


# Update the model to include the new branch
model = tf.keras.models.Model(inputs=input_layer, outputs=[branch1_output, branch2_output, attrition_output])


In [132]:
# Create the model
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=10, activation="tanh", input_dim=len(X_train.columns)))
nn_model.add(tf.keras.layers.Dense(units=64, activation="tanh"))
nn_model.add(tf.keras.layers.Dense(units=128, activation="tanh"))
nn_model.add(tf.keras.layers.Dense(units=32, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=32, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=3, activation="sigmoid"))
nn_model.add(tf.keras.layers.Dense(units=2, activation="sigmoid"))
# Compile the model
nn_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# Summarize the model
nn_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [133]:
#train the model
#fit_model = nn_model.fit(X_train_scaled,encode_ytrain2_df,epochs=100)

nn_model.fit(X_train_scaled,encode_ytrain2_df,epochs=100, verbose=1)

Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8486 - loss: nan
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8285 - loss: nan
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8138 - loss: nan
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8318 - loss: nan
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8216 - loss: nan
Epoch 6/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8070 - loss: nan
Epoch 7/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8266 - loss: nan
Epoch 8/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8311 - loss: nan
Epoch 9/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

<keras.src.callbacks.history.History at 0x7b1b8effa560>

In [126]:
# Evaluate the model with the testing data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,encode_ytest2_df,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

12/12 - 0s - 16ms/step - accuracy: 0.8723 - loss: nan
Loss: nan, Accuracy: 0.8722826242446899


In [131]:
# Print the accuracy for both department and attrition
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,encode_ytest_df,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

12/12 - 0s - 28ms/step - accuracy: 0.0516 - loss: nan
Loss: nan, Accuracy: 0.05163043364882469


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1.No, because of Class Imbalance: If your target variable (e.g., attrition) has a significantly uneven distribution of classes (e.g., many more employees staying than leaving), accuracy can be misleading. A model might achieve high accuracy by simply predicting the majority class most of the time, without effectively identifying the minority class (which is often the class of interest).

Why: In some cases, misclassifying one class might be more costly than misclassifying another. For example, failing to identify an employee likely to leave (false negative) might be more detrimental than incorrectly predicting an employee will leave when they actually stay (false positive).

2. I chose tanh, relu and sigmoid. I chose these at random from the possible choices to make it more realistic.

3. 1. Feature Engineering and Selection, 2. Hyperparameter Tuning: ie. Try different values for hyperparameters like the number of layers, number of units per layer, activation functions, learning rate, batch size, and regularization techniques, 3. Data Augmentation and Resampling, or 4. Data Preprocessing:ie. Scaling and Normalization or Handling Missing Values.