## Part 1: Preprocessing

In [20]:
# Import our dependencies
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')

# Check for missing values and drop if any
attrition_df = attrition_df.dropna()
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [3]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]


In [4]:
# Create a list of at least 10 column names to use as X data
x_columns = ['Age', 'BusinessTravel', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction',
             'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'MaritalStatus', 'NumCompaniesWorked',
             'OverTime']

# Create x_df using your selected columns
x_df = attrition_df[x_columns]

# Show the data types for x_df
print(x_df.dtypes)


Age                         int64
BusinessTravel             object
DistanceFromHome            int64
Education                   int64
EnvironmentSatisfaction     int64
HourlyRate                  int64
JobInvolvement              int64
JobLevel                    int64
JobRole                    object
MaritalStatus              object
NumCompaniesWorked          int64
OverTime                   object
dtype: object


In [5]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


x_train shape: (1176, 12)
x_test shape: (294, 12)
y_train shape: (1176, 2)
y_test shape: (294, 2)


In [6]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
# Identify non-numeric columns
non_numeric_columns = x_df.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

# Convert categorical columns to numeric using one-hot encoding
x_df_numeric = pd.get_dummies(x_df, columns=non_numeric_columns)

# Ensure all columns are now numeric
print("Data types of x_df_numeric:\n", x_df_numeric.dtypes)
print("First few rows of x_df_numeric:\n", x_df_numeric.head())


Non-numeric columns: Index(['BusinessTravel', 'JobRole', 'MaritalStatus', 'OverTime'], dtype='object')
Data types of x_df_numeric:
 Age                                  int64
DistanceFromHome                     int64
Education                            int64
EnvironmentSatisfaction              int64
HourlyRate                           int64
JobInvolvement                       int64
JobLevel                             int64
NumCompaniesWorked                   int64
BusinessTravel_Non-Travel             bool
BusinessTravel_Travel_Frequently      bool
BusinessTravel_Travel_Rarely          bool
JobRole_Healthcare Representative     bool
JobRole_Human Resources               bool
JobRole_Laboratory Technician         bool
JobRole_Manager                       bool
JobRole_Manufacturing Director        bool
JobRole_Research Director             bool
JobRole_Research Scientist            bool
JobRole_Sales Executive               bool
JobRole_Sales Representative          bool
MaritalS

In [7]:
# Convert categorical columns to numeric using one-hot encoding
x_df_numeric = pd.get_dummies(x_df, columns=non_numeric_columns)

# Split the data into training and testing sets again with the numeric data
x_train, x_test, y_train, y_test = train_test_split(x_df_numeric, y_df, test_size=0.2, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(x_train)

# Scale the training and testing data
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Optionally, you can convert the scaled data back to DataFrames
x_train_scaled_df = pd.DataFrame(x_train_scaled, columns=x_train.columns)
x_test_scaled_df = pd.DataFrame(x_test_scaled, columns=x_test.columns)

# Display the first few rows of the scaled training data
print("First few rows of the scaled training data:\n", x_train_scaled_df.head())



First few rows of the scaled training data:
         Age  DistanceFromHome  Education  EnvironmentSatisfaction  HourlyRate  \
0 -1.388559          1.440396  -0.863356                 0.279706   -0.472832   
1 -2.040738         -0.522699  -0.863356                -0.639104    0.309374   
2 -0.845077          1.317703  -0.863356                 1.198515   -1.059487   
3  0.241886          0.336155   0.099933                 1.198515   -0.032841   
4 -0.627685          1.317703   0.099933                -0.639104    1.091580   

   JobInvolvement  JobLevel  NumCompaniesWorked  BusinessTravel_Non-Travel  \
0       -1.012340 -0.932274           -1.059168                  -0.326041   
1        0.389912 -0.932274           -0.659431                   3.067096   
2        0.389912 -0.025447           -0.259693                  -0.326041   
3        0.389912 -0.025447            0.539781                  -0.326041   
4        0.389912 -0.025447           -0.659431                  -0.326041   


In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Create a OneHotEncoder for the Department column
department_encoder = OneHotEncoder(drop='first', sparse=False)

# Fit the encoder to the Department column in the training data
department_encoder.fit(y_train[['Department']])

# Create two new variables by applying the encoder to the training and testing data
y_train_department_encoded = department_encoder.transform(y_train[['Department']])
y_test_department_encoded = department_encoder.transform(y_test[['Department']])

# Convert the encoded columns to DataFrames and add them back to y_train and y_test
y_train_encoded_df = pd.DataFrame(y_train_department_encoded, columns=department_encoder.get_feature_names_out(['Department']))
y_test_encoded_df = pd.DataFrame(y_test_department_encoded, columns=department_encoder.get_feature_names_out(['Department']))

# Drop the original Department column from y_train and y_test and concatenate the encoded DataFrames
y_train = pd.concat([y_train.drop(columns=['Department']), y_train_encoded_df], axis=1)
y_test = pd.concat([y_test.drop(columns=['Department']), y_test_encoded_df], axis=1)

# Display the first few rows of y_train and y_test
print("First few rows of y_train:\n", y_train.head())
print("First few rows of y_test:\n", y_test.head())


First few rows of y_train:
      Attrition  Department_Research & Development  Department_Sales
1097        No                                1.0               0.0
727         No                                0.0               1.0
254         No                                1.0               0.0
1175        No                                0.0               1.0
1341        No                                NaN               NaN
First few rows of y_test:
      Attrition  Department_Research & Development  Department_Sales
1041        No                                NaN               NaN
184         No                                0.0               1.0
1222       Yes                                NaN               NaN
67          No                                1.0               0.0
220         No                                1.0               0.0




In [9]:
# Create a OneHotEncoder for the Attrition column
attrition_encoder = OneHotEncoder(drop='first', sparse=False)

# Fit the encoder to the Attrition column in the training data
attrition_encoder.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder to the training and testing data
y_train_attrition_encoded = attrition_encoder.transform(y_train[['Attrition']])
y_test_attrition_encoded = attrition_encoder.transform(y_test[['Attrition']])

# Convert the encoded columns to DataFrames and add them back to y_train and y_test
y_train_encoded_df = pd.DataFrame(y_train_attrition_encoded, columns=attrition_encoder.get_feature_names_out(['Attrition']))
y_test_encoded_df = pd.DataFrame(y_test_attrition_encoded, columns=attrition_encoder.get_feature_names_out(['Attrition']))

# Drop the original Attrition column from y_train and y_test and concatenate the encoded DataFrames
y_train = pd.concat([y_train.drop(columns=['Attrition']), y_train_encoded_df], axis=1)
y_test = pd.concat([y_test.drop(columns=['Attrition']), y_test_encoded_df], axis=1)

# Display the first few rows of y_train and y_test
print("First few rows of y_train:\n", y_train.head())
print("First few rows of y_test:\n", y_test.head())

First few rows of y_train:
       Department_Research & Development  Department_Sales  Attrition_Yes  \
1097                                1.0               0.0            0.0   
727                                 0.0               1.0            0.0   
254                                 1.0               0.0            0.0   
1175                                0.0               1.0            0.0   
1341                                NaN               NaN            0.0   

      Attrition_nan  
1097            0.0  
727             0.0  
254             0.0  
1175            0.0  
1341            1.0  
First few rows of y_test:
       Department_Research & Development  Department_Sales  Attrition_Yes  \
1041                                NaN               NaN            NaN   
184                                 0.0               1.0            0.0   
1222                                NaN               NaN            NaN   
67                                  1.0             



## Create, Compile, and Train the Model

In [10]:
from tensorflow.keras import layers, Input, Model
# Find the number of columns in the X training data
num_features = x_train.shape[1]
print("Number of features in X training data:", num_features)

# Create the input layer
input_layer = Input(shape=(num_features,))

# Create at least two shared layers
shared_layer_1 = layers.Dense(64, activation='relu')(input_layer)
shared_layer_2 = layers.Dense(32, activation='relu')(shared_layer_1)

# Output
output_layer = layers.Dense(1, activation='sigmoid')(shared_layer_2)

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Display the model summary
model.summary()


Number of features in X training data: 25
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 25)]              0         
                                                                 
 dense (Dense)               (None, 64)                1664      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 3777 (14.75 KB)
Trainable params: 3777 (14.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# Create a branch for Department with a hidden layer and an output layer

# Create the hidden layer for Department branch
department_hidden_layer = layers.Dense(16, activation='relu')(shared_layer_2)

# Create the output layer for Department branch
department_output_layer = layers.Dense(3, activation='softmax', name='department_output')(department_hidden_layer)

# Update the model to include the new branch
model = Model(inputs=input_layer, outputs=[output_layer, department_output_layer])

# Display the model summary
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 25)]                 0         []                            
                                                                                                  
 dense (Dense)               (None, 64)                   1664      ['input_1[0][0]']             
                                                                                                  
 dense_1 (Dense)             (None, 32)                   2080      ['dense[0][0]']               
                                                                                                  
 dense_3 (Dense)             (None, 16)                   528       ['dense_1[0][0]']             
                                                                                            

In [12]:
# Create a branch for Attrition with a hidden layer and an output layer

# Create the hidden layer for Attrition branch
attrition_hidden_layer = layers.Dense(16, activation='relu')(shared_layer_2)

# Create the output layer for Attrition branch
attrition_output_layer = layers.Dense(1, activation='sigmoid', name='attrition_output')(attrition_hidden_layer)

# Update the model to include the new branch
model = Model(inputs=input_layer, outputs=[department_output_layer, attrition_output_layer])

# Display the model summary
model.summary()



Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 25)]                 0         []                            
                                                                                                  
 dense (Dense)               (None, 64)                   1664      ['input_1[0][0]']             
                                                                                                  
 dense_1 (Dense)             (None, 32)                   2080      ['dense[0][0]']               
                                                                                                  
 dense_3 (Dense)             (None, 16)                   528       ['dense_1[0][0]']             
                                                                                            

In [13]:
from tensorflow.keras import Model

# Create the model
model = Model(inputs=input_layer, outputs=[department_output_layer, attrition_output_layer])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy', 'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'accuracy'})

# Summarize the model
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 25)]                 0         []                            
                                                                                                  
 dense (Dense)               (None, 64)                   1664      ['input_1[0][0]']             
                                                                                                  
 dense_1 (Dense)             (None, 32)                   2080      ['dense[0][0]']               
                                                                                                  
 dense_3 (Dense)             (None, 16)                   528       ['dense_1[0][0]']             
                                                                                            

In [19]:
# Check to make sure 'Attrition' column is now binary encoded correctly
print("First few rows of y_train:\n", y_train.head())
print("First few rows of y_test:\n", y_test.head())
# Train the model
history = model.fit(
    x_train_scaled,
    {
        'department_output': y_train_encoded_df.values,
        'attrition_output': y_train['Attrition']
    },
    validation_data=(
        x_test_scaled,
        {
            'department_output': y_test_encoded_df.values,
            'attrition_output': y_test['Attrition']
        }
    ),
    epochs=100,
    batch_size=32
)

# Display the training history keys
print(history.history.keys())


First few rows of y_train:
       Department_Research & Development  Department_Sales  Attrition_Yes  \
1097                                1.0               0.0            0.0   
727                                 0.0               1.0            0.0   
254                                 1.0               0.0            0.0   
1175                                0.0               1.0            0.0   
1341                                NaN               NaN            0.0   

      Attrition_nan  
1097            0.0  
727             0.0  
254             0.0  
1175            0.0  
1341            1.0  
First few rows of y_test:
       Department_Research & Development  Department_Sales  Attrition_Yes  \
1041                                NaN               NaN            NaN   
184                                 0.0               1.0            0.0   
1222                                NaN               NaN            NaN   
67                                  1.0             

KeyError: 'Attrition'

In [15]:
# Evaluate the model with the testing data


In [16]:
# Print the accuracy for both department and attrition


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1.
2.
3.