Import Packages

In [10]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline
import plotly.graph_objects as go
import plotly.express as px

In [2]:
# read data into pandas:
df_joined = pd.read_csv('data/df_joined_1.csv')

In [4]:
print(df_joined.shape)
df_joined.head()

(1470, 59)


Unnamed: 0,EmployeeNumber,Age,Openness,Conscieniousness,Extroversion,Agreeableness,Emotional Balance,DailyRate,DistanceFromHome,Education,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_Yes
0,1,41,25,23,14,22,1,1102,1,2,...,0,0,0,1,0,0,0,1,1,1
1,2,49,8,18,-8,11,-19,279,8,1,...,0,0,1,0,0,0,1,0,1,0
2,4,37,27,25,5,14,-34,1373,2,2,...,0,0,0,0,0,0,0,1,1,1
3,5,33,23,2,-8,13,-31,1392,3,4,...,0,0,1,0,0,0,1,0,1,1
4,7,27,16,10,4,20,-18,591,2,1,...,0,0,0,0,0,0,1,0,1,0


In [5]:
df_joined.columns

Index(['EmployeeNumber', 'Age', 'Openness', 'Conscieniousness', 'Extroversion',
       'Agreeableness', 'Emotional Balance', 'DailyRate', 'DistanceFromHome',
       'Education', 'EmployeeCount', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition_Yes',
       'BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently',
       'BusinessTravel_Travel_Rarely', 'Department_Human Resources',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Human Resources', 'EducationField_Life Sciences',
       'EducationField_Marketing', 'EducationField_Medical',


### Normalization in preparation for Modeling:

In [6]:
# I removed dummies from feature list in order to normalize:

dummies = ['Attrition_Yes',
       'BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently',
       'BusinessTravel_Travel_Rarely', 'Department_Human Resources',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Human Resources', 'EducationField_Life Sciences',
       'EducationField_Marketing', 'EducationField_Medical',
       'EducationField_Other', 'EducationField_Technical Degree',
       'Gender_Female', 'JobRole_Healthcare Representative',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_Sales Representative',
       'MaritalStatus_Divorced', 'MaritalStatus_Married',
       'MaritalStatus_Single', 'Over18_Y', 'OverTime_Yes']
features = [ 'Openness', 'Conscieniousness', 'Extroversion',
       'Agreeableness', 'Emotional Balance', 'Age', 'DailyRate', 'DistanceFromHome',
       'Education', 'EmployeeCount', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager']


In [50]:
# preprocess the data 

from sklearn.preprocessing import StandardScaler
# Separating out the features
x = df_joined.loc[:, features].values

# Standardizing the features
x = StandardScaler().fit_transform(x)

In [51]:
# psychometrics and employees are subsets of x, which is the normalized values of df_joined
psychometrics = x[0:5]
employees = x[5:]

In [96]:
psychometrics[0]

array([ 0.62838029,  1.81536823,  1.50749486,  1.10430056,  2.30081423,
        0.4463504 ,  0.74252653, -1.01090934, -0.89168825,  0.        ,
       -0.66053067,  1.38313827,  0.37967213, -0.05778755,  1.15325359,
       -0.10834951,  0.72601994,  2.12513592, -1.1505541 , -0.42623002,
       -1.58417824,  0.        , -0.93201439, -0.42164246, -2.17198183,
       -2.49382042, -0.16461311, -0.0632959 , -0.67914568,  0.24583399])

Since the psychometrics are a sepearte data set from the employee data, I will model them seperately:

### Principle Component Analysis
Use PCA to reduce dimensions in order to be able to visualize the data.

#### PCA on psychometric data with five principle components

In [12]:
from sklearn.decomposition import PCA

In [94]:
pd.DataFrame(psychometrics)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.62838,1.815368,1.507495,1.104301,2.300814,0.44635,0.742527,-1.010909,-0.891688,0.0,...,-1.584178,0.0,-0.932014,-0.421642,-2.171982,-2.49382,-0.164613,-0.063296,-0.679146,0.245834
1,-1.998895,1.135922,-0.81981,-0.427135,-0.002429,1.322365,-1.297775,-0.14715,-1.868426,0.0,...,1.191438,0.0,0.241988,-0.164511,0.155707,0.338096,0.488508,0.764998,-0.368715,0.806541
2,0.937471,2.087147,0.555416,-0.009471,-1.729861,0.008343,1.414363,-0.887515,-0.891688,0.0,...,-0.658973,0.0,-0.932014,-0.550208,0.155707,0.338096,-1.144294,-1.167687,-0.679146,-1.155935
3,0.319289,-1.038304,-0.81981,-0.148692,-1.384374,-0.429664,1.461466,-0.764121,1.061787,0.0,...,0.266233,0.0,-0.932014,-0.421642,0.155707,0.338096,0.161947,0.764998,0.252146,-1.155935
4,-0.76253,0.048809,0.449629,0.825858,0.112734,-1.086676,-0.524295,-0.887515,-1.868426,0.0,...,1.191438,0.0,0.241988,-0.678774,0.155707,0.338096,-0.817734,-0.615492,-0.058285,-0.595227


In [53]:
# create principal components 
pca = PCA(n_components= 5)
principalComponents = pca.fit_transform(psychometrics)

In [62]:
components_df = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2',
                                                                 'principal component 3','principal component 4', 
                                                                  'principal component 5'])

In [63]:
components_df

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5
0,-4.297461,-2.871651,-0.679225,1.489165,5.605688e-16
1,4.840359,-2.441515,-1.464835,-0.100657,5.605688e-16
2,-1.675824,2.345174,-1.956865,-2.660471,5.605688e-16
3,0.985767,3.541979,0.344064,2.672511,5.605688e-16
4,0.147159,-0.573987,3.756862,-1.400548,5.605688e-16


In [70]:
eig_values = pca.explained_variance_
eig_vectors = pca.components_

In [71]:
print(eig_values)

[1.14247569e+01 8.14557381e+00 5.16719965e+00 4.60242410e+00
 3.92796728e-31]


In [77]:
# print(eig_vectors)

In [78]:
pca.explained_variance_ratio_

array([3.89392455e-01, 2.77627350e-01, 1.76114781e-01, 1.56865414e-01,
       1.33877756e-32])

In [67]:
pca.singular_values_

array([6.76010559e+00, 5.70809033e+00, 4.54629504e+00, 4.29065221e+00,
       1.25346995e-15])

In [80]:
# loadings = eig_vectors * np.sqrt(eig_values)

In [89]:
eig_vectors[0]

array([-0.30075727, -0.14917678, -0.26519884, -0.1492887 , -0.1826856 ,
        0.08501505, -0.22931314,  0.09268369, -0.06446145,  0.        ,
        0.06634692, -0.21727338, -0.09735059,  0.01073582, -0.18429215,
       -0.00487349,  0.17870983, -0.32671584,  0.30818528,  0.29364565,
        0.30891311,  0.        ,  0.12812876,  0.03112147,  0.21889203,
        0.26630877,  0.11004419,  0.1443192 ,  0.05496825,  0.07784753])

In [91]:
#principle components
pc1 = eig_vectors[0]
pc2 = eig_vectors[1]
pc3 = eig_vectors[2]
pc4 = eig_vectors[3]
pc5 = eig_vectors[4]

In [92]:
# get the loadings
structure_loading_1 = pc1* np.sqrt(pc1)
str_loading_1 = pd.Series(structure_loading_1, index=features)
str_loading_1


invalid value encountered in sqrt



Openness                         NaN
Conscieniousness                 NaN
Extroversion                     NaN
Agreeableness                    NaN
Emotional Balance                NaN
Age                         0.024788
DailyRate                        NaN
DistanceFromHome            0.028217
Education                        NaN
EmployeeCount               0.000000
EnvironmentSatisfaction     0.017090
HourlyRate                       NaN
JobInvolvement                   NaN
JobLevel                    0.001112
JobSatisfaction                  NaN
MonthlyIncome                    NaN
MonthlyRate                 0.075548
NumCompaniesWorked               NaN
PercentSalaryHike           0.171087
PerformanceRating           0.159124
RelationshipSatisfaction    0.171694
StandardHours               0.000000
StockOptionLevel            0.045864
TotalWorkingYears           0.005490
TrainingTimesLastYear       0.102411
WorkLifeBalance             0.137429
YearsAtCompany              0.036505
Y

In [98]:
str_loading_1.sort_values(ascending=False)

RelationshipSatisfaction    0.171694
PercentSalaryHike           0.171087
PerformanceRating           0.159124
WorkLifeBalance             0.137429
TrainingTimesLastYear       0.102411
MonthlyRate                 0.075548
YearsInCurrentRole          0.054826
StockOptionLevel            0.045864
YearsAtCompany              0.036505
DistanceFromHome            0.028217
Age                         0.024788
YearsWithCurrManager        0.021720
EnvironmentSatisfaction     0.017090
YearsSinceLastPromotion     0.012887
TotalWorkingYears           0.005490
JobLevel                    0.001112
StandardHours               0.000000
EmployeeCount               0.000000
Openness                         NaN
Conscieniousness                 NaN
Extroversion                     NaN
Agreeableness                    NaN
Emotional Balance                NaN
DailyRate                        NaN
Education                        NaN
HourlyRate                       NaN
JobInvolvement                   NaN
J

In [99]:
index = components_df.index
index

RangeIndex(start=0, stop=5, step=1)

In [111]:
fig = go.Figure()
fig.add_trace(go.Bar(x= index, y= pca.explained_variance_ratio_))
fig.update_layout(
    title="Scree Plot for Principle Component Analysis (PCA)",
    xaxis_title="Principle Components",
    yaxis_title="Proportion of Explained Variance",
    font=dict(
        family="Garamond, monospace",
        size=18,
        color="#7f7f7f"
        )
)
fig.show()

### Clustering of Psychometric Data