In [None]:
# Import TensorFlow's deprecation-wrapper module under a single alias. The
# module is looked up as `module_wrapper` first and, if that import fails,
# as `deprecation_wrapper` (presumably the name used by other TensorFlow
# releases -- TODO confirm against the installed version).
try:
    from tensorflow.python.util import module_wrapper as deprecation
except ImportError:
    from tensorflow.python.util import deprecation_wrapper as deprecation
# NOTE(review): `_PER_MODULE_WARNING_LIMIT` is a private TensorFlow attribute;
# setting it to 0 is presumably meant to suppress per-module deprecation
# warnings. Confirm it still exists and has this effect in the version in use.
deprecation._PER_MODULE_WARNING_LIMIT = 0

import tensorflow as tf
import pandas as pd
import numpy as np
from keras import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Only emit TensorFlow log records at ERROR severity or above.
# The verbosity must be a logging level: `False` is just the integer 0
# (NOTSET), which defers to the parent logger instead of silencing output.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# load data into a dataframe
data = pd.read_csv("./data/hr-formula-analytics-in-action.csv")
# A bare `data.head()` is only rendered when it is the last expression of a
# cell; in the middle of a cell its result is discarded, so print the preview.
print(data.head())
# Example data used to create test and training sets:
# Employee_ID	Employees_Left	Avg_Employees	Engaged_Employees	Recruitment_Costs	Hires	Employees_Lacking_Skills	Gender	City	Job_Title	Department	Store_Location	Business_Unit	Division	Age	Length_of_Service	Absent_Hours	Performance_Rating	Education_Level	Training_Hours	Satisfaction_Score	Retention_Rate	Average_Employee_Tenure	Absenteeism_Rate	Diversity_Index	Turnover_Rate	Engagement_Score	Cost_Per_Hire	Skills_Gap_Percentage	Representation_Rate
# 5	100	80	5000	10	20	M	NY	Manager	HR	NY	BU1	D1	45	5	10	4	Bachelor's	30	75	0.95	4.8	0.00125	0.3	0.05	0.8	500	0.2	0.003157895
# 7	120	90	6000	12	30	F	LA	Analyst	Finance	LA	BU2	D2	30	3	5	3	Master's	20	80	0.941666667	4.793103448	0.000462963	0.241666667	0.058333333	0.75	500	0.25	0.002138643
# 6	110	85	5500	11	25	M	SF	Engineer	IT	SF	BU3	D3	35	4	8	5	High School	25	90	0.945454545	4.857142857	0.000855615	0.254545455	0.054545455	0.772727273	500	0.227272727	0.002447552
# 3	105	75	4800	9	15	F	NY	Specialist	Marketing	NY	BU1	D1	40	6	12	4	Associate's	15	70	0.971428571	4.888888889	0.00152381	0.257142857	0.028571429	0.714285714	533.3333333	0.142857143	0.002521008
# 4	95	70	4500	8	18	M	LA	Developer	IT	LA	BU3	D3	28	2	6	3	PhD	40	85	0.957894737	4.846153846	0.000902256	0.273684211	0.042105263	0.736842105	562.5	0.189473684	0.003007519

# Target columns: the derived HR metrics the models are trained to predict.
feats = ['Retention_Rate', 'Average_Employee_Tenure', 'Absenteeism_Rate',
         'Diversity_Index', 'Turnover_Rate', 'Engagement_Score', 'Cost_Per_Hire',
         'Skills_Gap_Percentage', 'Representation_Rate']

# Categorical input columns, integer-encoded with LabelEncoder below.
categorical_features = ['Gender', 'City', 'Job_Title', 'Department', 'Store_Location',
                        'Business_Unit', 'Division', 'Education_Level']

# Fit one encoder per column and keep each of them, so that integer codes can
# later be mapped back to the original labels with `inverse_transform`.
label_encoders = {}
for feature in categorical_features:
    label_encoder = LabelEncoder()
    data[feature] = label_encoder.fit_transform(data[feature])
    label_encoders[feature] = label_encoder

# Numerical inputs are the numeric columns that are neither targets nor encoded
# categoricals. The exclusion matters: after label encoding the categorical
# columns are numeric too (they would otherwise be included twice), and leaving
# the targets in the inputs would let the models read the answer directly.
# NOTE(review): an identifier column (e.g. Employee_ID), if present in the CSV,
# is still treated as a numerical input here -- confirm whether to drop it.
excluded_columns = set(feats) | set(categorical_features)
numerical_features = [
    column
    for column in data.select_dtypes(exclude=["object", "category"]).columns
    if column not in excluded_columns
]

# Split the rows first so the scalers below are fitted on training rows only
# and no test-set statistics leak into the preprocessing.
train_frame, test_frame = train_test_split(data, test_size=0.3, random_state=42)

# Min-max scale the numerical inputs using the training range.
scaler = MinMaxScaler()
scaler.fit(train_frame[numerical_features])

# The targets span very different magnitudes, so they get their own min-max
# scaler; this keeps every output on a comparable, bounded scale. Use
# `target_scaler.inverse_transform(...)` to convert predictions back.
target_scaler = MinMaxScaler()
target_scaler.fit(train_frame[feats])


def _build_inputs(frame):
    """Return the model input matrix for `frame`.

    Columns are the integer-encoded categorical features followed by the
    min-max scaled numerical features, in that order.
    """
    return np.concatenate(
        [
            frame[categorical_features].to_numpy(dtype=float),
            scaler.transform(frame[numerical_features]),
        ],
        axis=1,
    )


# Whole-dataset views, kept for inspection and for any later cell that wants them.
scaled_numerical_features = scaler.transform(data[numerical_features])
encoded_data = _build_inputs(data)

# Training and test sets (70 % / 30 %).
x_train = _build_inputs(train_frame)
x_test = _build_inputs(test_frame)
y_train = target_scaler.transform(train_frame[feats])
y_test = target_scaler.transform(test_frame[feats])

# Define three different regression models to evaluate. Each one maps the
# input feature vector to one continuous value per target column in `feats`,
# so every output layer is linear (a sigmoid would confine predictions to
# the open interval (0, 1)).
n_inputs = x_train.shape[1]
n_outputs = len(feats)

models = [
    # model 1: simple multilayer perceptron with two hidden layers
    Sequential([
        Dense(128, kernel_initializer='uniform', activation='relu', input_dim=n_inputs),
        Dense(64, kernel_initializer='uniform', activation='relu'),
        Dense(n_outputs, kernel_initializer='uniform', activation='linear')
    ], name='MLP'),

    # model 2: deeper multilayer perceptron with three hidden layers.
    # It contains no convolutional layers, so it is named for what it is
    # rather than "CNN".
    Sequential([
        Dense(128, kernel_initializer='uniform', activation='relu', input_dim=n_inputs),
        Dense(64, kernel_initializer='uniform', activation='relu'),
        Dense(32, kernel_initializer='uniform', activation='relu'),
        Dense(n_outputs, kernel_initializer='uniform', activation='linear')
    ], name='DeepMLP'),

    # model 3: recurrent neural network (rnn) - long short-term memory (lstm).
    # Each feature is presented as one timestep with a single channel, so this
    # model expects input of shape (samples, n_inputs, 1).
    Sequential([
        LSTM(128, return_sequences=True, input_shape=(n_inputs, 1)),
        LSTM(64),
        Dense(n_outputs, activation='linear')
    ], name='LSTM')
]

# Evaluate and compare the performance of each model.
# Cast inputs and targets to float32 NumPy arrays once, up front, so every
# model is trained and evaluated on exactly the same data.
x_train_array = np.asarray(x_train, dtype=np.float32)
x_test_array = np.asarray(x_test, dtype=np.float32)
y_train_array = np.asarray(y_train, dtype=np.float32)
y_test_array = np.asarray(y_test, dtype=np.float32)

for model in models:
    # The targets are continuous values, so this is a regression problem:
    # use mean squared error as the loss and mean absolute error as the
    # reported metric. (Categorical cross-entropy and accuracy assume one-hot
    # class labels and are meaningless here.)
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    if model.name == 'LSTM':
        # The LSTM is declared with input_shape=(n_features, 1); add the
        # trailing channel axis so the data is (samples, n_features, 1).
        fit_inputs = x_train_array[..., np.newaxis]
        eval_inputs = x_test_array[..., np.newaxis]
    else:
        fit_inputs = x_train_array
        eval_inputs = x_test_array

    model.fit(fit_inputs, y_train_array, batch_size=32, epochs=10, verbose=1)

    # evaluate the model on the held-out test set
    test_loss, test_mae = model.evaluate(eval_inputs, y_test_array, verbose=0)

    # print the model's performance
    print(f"model: {model.name}")
    print(f"test loss: {test_loss:.4f}")
    print(f"test mae: {test_mae:.4f}")
    print("-----------------------------------")

print("3 Models Created and Tested")

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd

# Load the data
df = pd.read_csv("./data/hr-formula-analytics-in-action.csv")

# List of categorical columns to convert
categorical_cols = ['Gender', 'City', 'Job_Title', 'Department', 'Store_Location', 'Business_Unit', 'Division', 'Education_Level']

# Convert categorical columns to integer codes
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# List of columns to scale
scale_cols = ['Employees_Left', 'Avg_Employees', 'Engaged_Employees', 'Recruitment_Costs', 'Hires', 'Employees_Lacking_Skills', 'Age', 'Length_of_Service', 'Absent_Hours', 'Performance_Rating', 'Training_Hours', 'Satisfaction_Score', 'Retention_Rate', 'Average_Employee_Tenure', 'Absenteeism_Rate', 'Diversity_Index', 'Turnover_Rate', 'Engagement_Score', 'Cost_Per_Hire', 'Skills_Gap_Percentage', 'Representation_Rate']

# Standardise the numeric columns (zero mean, unit variance)
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])
# NOTE(review): the integer-coded categorical columns (and any column not in
# `scale_cols`) go into the clustering unscaled -- confirm this is intended.

# Specify the number of clusters (k) you want to find
k = 3

# Perform K-Means clustering. The model is fitted exactly once, with a fixed
# random_state: fitting a second time without a seed can assign different
# cluster ids, which would make `labels` and the 'Cluster' column disagree.
kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=100, random_state=42)
labels = kmeans.fit_predict(df)

# Cluster assignment for each data point (same array as `labels`)
clusters = labels

# Add the cluster assignments to the DataFrame
df['Cluster'] = clusters

print(df)

In [None]:
import matplotlib.pyplot as plt

# Both figures plot the first two columns of `df` against each other.
x_name, y_name = df.columns[0], df.columns[1]

# Figure 1: the members of cluster 0 on their own
filtered_label0 = df[labels == 0]

fig, ax = plt.subplots()
ax.scatter(filtered_label0.iloc[:, 0], filtered_label0.iloc[:, 1])
ax.set(title="Cluster 0", xlabel=x_name, ylabel=y_name)
plt.show()

# Figure 2: every cluster that actually occurs in `labels`, one colour each.
# Iterating over the observed ids avoids filtering on a label that does not
# exist (which silently plots nothing) and avoids leaving a cluster out.
fig, ax = plt.subplots()
for cluster_id in sorted(set(labels)):
    members = df[labels == cluster_id]
    ax.scatter(members.iloc[:, 0], members.iloc[:, 1], label=f"cluster {cluster_id}")
ax.set(title="K-Means clusters", xlabel=x_name, ylabel=y_name)
ax.legend()
plt.show()