In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pre processing
from sklearn.preprocessing   import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree         import DecisionTreeClassifier
from sklearn.neighbors    import KNeighborsClassifier
from sklearn.svm          import SVC
from sklearn.ensemble     import RandomForestClassifier
from sklearn.ensemble     import AdaBoostClassifier

# Metrics
from sklearn import metrics

# Apply seaborn's "whitegrid" theme to every figure created below.
sns.set_theme(style="whitegrid")

In [None]:
# Load the raw data set; the CSV is decoded as ISO-8859-1 rather than UTF-8.
turnover = pd.read_csv('../input/employee-turnover/turnover.csv', encoding = 'ISO-8859-1')

In [None]:
# Summary of the frame: column names, dtypes and non-null counts.
turnover.info()

In [None]:
# Display the frame (rich notebook rendering of the bare expression).
turnover

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the numeric columns.
turnover.describe()

In [None]:
# Number of missing values in each column.
turnover.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
turnover.duplicated().sum()

In [None]:
# Drop the duplicated rows (the first occurrence of each is kept)
turnover = turnover.drop_duplicates()

In [None]:
def hist_box_plots(data, col, bins='auto'):
    """Draw a histogram and a box plot of one column side by side.

    Parameters
    ----------
    data : DataFrame
        Frame holding the column to plot.
    col : str
        Name of the column placed on the x axis of both plots.
    bins : optional
        Forwarded to ``sns.histplot`` as its ``bins`` argument
        (defaults to ``'auto'``).

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    # One row, two panels: histogram on the left, box plot on the right.
    _, (hist_ax, box_ax) = plt.subplots(ncols=2, figsize=(11, 3))

    sns.histplot(data=data, x=col, bins=bins, ax=hist_ax)
    sns.boxplot(data=data, x=col, ax=box_ax)

# Visualização dos Dados

In [None]:
# 3x3 grid with the frequency of every categorical column, bars sorted by
# descending count.
figure, axis = plt.subplots(ncols=3, nrows=3, figsize=(20,10))
plt.subplots_adjust(hspace = 0.4, wspace=0.4)

# (column, panel title, countplot keyword the column is bound to).
# Binding the column to 'y' draws horizontal bars, which is used for the
# columns plotted that way in the original layout.
# Panels are filled row by row, in the order listed here.
count_specs = [
    ('event',       'Event',       'x'),
    ('gender',      'Gender',      'x'),
    ('industry',    'Industry',    'y'),
    ('profession',  'Profession',  'y'),
    ('traffic',     'Traffic',     'y'),
    ('coach',       'Coach',       'x'),
    ('head_gender', 'Head Gender', 'x'),
    ('greywage',    'Greywage',    'x'),
    ('way',         'Way',         'x'),
]

for panel, (column, title, axis_kw) in zip(axis.flat, count_specs):
    g = sns.countplot(data=turnover,
                      ax=panel,
                      order=turnover[column].value_counts().index,
                      **{axis_kw: column})
    # The title already names the variable, so both axis labels are blanked.
    g.set(title=title, xlabel='', ylabel='')

plt.show()

In [None]:
# Histogram + box plot for each of the listed columns, one figure per column.
numeric_cols = ['stag', 'age', 'extraversion', 'independ', 'selfcontrol', 'anxiety', 'novator']

for col in numeric_cols:
    hist_box_plots(data=turnover, col=col)

In [None]:
# 3x2 grid with the frequency of each categorical column split by 'event',
# categories sorted by descending overall count.
figure, axis = plt.subplots(ncols=2, nrows=3, figsize=(20,10))
plt.subplots_adjust(hspace = 0.4, wspace=0.4)

# (column, panel title, countplot keyword the column is bound to).
# Only 'traffic' is bound to 'y' (horizontal bars).
# Panels are filled row by row, in the order listed here.
by_event_specs = [
    ('gender',      'Gender',      'x'),
    ('coach',       'Coach',       'x'),
    ('head_gender', 'Head Gender', 'x'),
    ('greywage',    'Greywage',    'x'),
    ('way',         'Way',         'x'),
    ('traffic',     'Traffic',     'y'),
]

for panel, (column, title, axis_kw) in zip(axis.flat, by_event_specs):
    g = sns.countplot(data=turnover,
                      hue='event',
                      ax=panel,
                      order=turnover[column].value_counts().index,
                      **{axis_kw: column})
    # The title already names the variable, so both axis labels are blanked.
    g.set(title=title, xlabel='', ylabel='')

plt.show()

In [None]:
# Scatter of 'stag' against 'age', coloured by the 'event' label.
sns.scatterplot(data=turnover, x='stag', y='age', hue='event');

In [None]:
# Work on a copy so that later column rewrites do not touch the frame used for plotting.
turnover2 = turnover.copy()

In [None]:
# Replace every categorical column of the working copy with integer codes.
# A single encoder is refitted for each column, so once the loop finishes it
# only remembers the classes of the last column ('way').
encoder = LabelEncoder()

categorical_cols = ['gender', 'industry', 'profession', 'traffic',
                    'coach', 'head_gender', 'greywage', 'way']

for cat_col in categorical_cols:
    turnover2[cat_col] = encoder.fit_transform(turnover2[cat_col])

In [None]:
# Pairwise relationships between the selected columns, coloured by 'event'.
sns.pairplot(data=turnover[['stag', 'age', 'extraversion', 'independ', 'selfcontrol', 'anxiety', 'novator', 'event']], hue='event');

In [None]:
# Annotated correlation matrix of every column of the encoded frame, with the
# colour scale fixed to [-1, 1] and centred on 0.
# NOTE(review): the categorical columns hold LabelEncoder codes, whose numeric
# order is arbitrary, so their correlations should be read with care.
plt.figure(figsize=(18,7))
corr = turnover2.corr()
sns.heatmap(corr, vmin=-1, center=0, vmax=1, annot=True);

# Pré-processamento

In [None]:
# Separate copy for building the model features; turnover2 itself is left untouched.
turnover3 = turnover2.copy()

In [None]:
# One-hot encode the listed categorical columns as 0/1 integer indicators.
# NOTE(review): these columns were integer-coded earlier, so the generated
# column names presumably carry the code (e.g. 'industry_0') rather than the
# original category label; confirm this is intended.
turnover3 = pd.get_dummies(turnover3, columns=['industry','profession','traffic','coach','head_gender','greywage','way', 'gender'], dtype=int)

In [None]:
# Display the one-hot encoded frame.
turnover3

In [None]:
# Features: every column except the target. Target: the 'event' column.
# Both are taken from the same frame so the rows of X and y are guaranteed to
# line up, instead of relying on an earlier frame sharing the same index.
X = turnover3.drop('event', axis=1)
y = turnover3['event']

In [None]:
scaler = StandardScaler()
# scaler = MinMaxScaler()

X = scaler.fit_transform(X)

In [None]:
# Separando o conjunto de dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.10,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)
print('Train shape:', X_train.shape)
print('Test shape:', X_test.shape)

# Análise Preditiva

In [None]:
def evaluate_model(y_pred, y_test):
    """Print classification metrics and plot the confusion matrix.

    Accuracy, F1, recall and precision are printed as percentages rounded to
    two decimals, followed by a heatmap of the confusion matrix.

    Parameters
    ----------
    y_pred : array-like
        Labels predicted by the model.
    y_test : array-like
        Ground-truth labels of the same samples.

    Notes
    -----
    The argument order is predictions first, ground truth second, which is
    the reverse of the ``(y_true, y_pred)`` order used by ``sklearn.metrics``.

    Returns
    -------
    None
    """
    print(f'Acc: {np.round(metrics.accuracy_score(y_test, y_pred)*100, 2)}%')
    print(f'F1: {np.round(metrics.f1_score(y_test, y_pred)*100, 2)}%')
    print(f'Recall: {np.round(metrics.recall_score(y_test, y_pred)*100, 2)}%')
    print(f'Precision: {np.round(metrics.precision_score(y_test, y_pred)*100, 2)}%')
    print('Matriz de Confusão:\n')
    # confusion_matrix takes (y_true, y_pred): rows are the true classes and
    # columns the predicted ones. Passing them swapped transposes the matrix.
    ax = sns.heatmap(metrics.confusion_matrix(y_test, y_pred), annot=True, fmt='g')
    ax.set(xlabel='Predicted', ylabel='True')

In [None]:
# Logistic regression with default settings; fit() returns the estimator, so
# it is built and trained in a single expression.
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and report the metrics
y_pred = model.predict(X_test)
evaluate_model(y_pred, y_test)

In [None]:
# Support vector classifier with a polynomial kernel, built and trained in a
# single expression (fit() returns the estimator).
model = SVC(kernel='poly').fit(X_train, y_train)

# Predict the held-out rows and report the metrics
y_pred = model.predict(X_test)
evaluate_model(y_pred, y_test)

In [None]:
# Entropy-based decision tree; class 1 is weighted three times as much as class 0.
tree_params = dict(criterion='entropy', class_weight={0:1, 1:3}, random_state=42)
model = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)

# Predict the held-out rows and report the metrics
y_pred = model.predict(X_test)
evaluate_model(y_pred, y_test)

In [None]:
# 2-nearest-neighbours classifier with distance-weighted votes, built and
# trained in a single expression (fit() returns the estimator).
model = KNeighborsClassifier(n_neighbors=2, weights='distance').fit(X_train, y_train)

# Predict the held-out rows and report the metrics
y_pred = model.predict(X_test)
evaluate_model(y_pred, y_test)

In [None]:
# Random forest of 100 entropy-based trees; class 1 is weighted twice as much
# as class 0 ('balanced' is the alternative weighting that was tried).
forest_params = dict(
    n_estimators=100,
    criterion='entropy',
    class_weight={0:1, 1:2},
#     class_weight='balanced',
    random_state=42,
)
model = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Predict the held-out rows and report the metrics
y_pred = model.predict(X_test)
evaluate_model(y_pred, y_test)

In [None]:
# AdaBoost with 200 estimators and a learning rate of 0.1, built and trained
# in a single expression (fit() returns the estimator).
boost_params = dict(n_estimators=200, learning_rate=0.1, random_state=42)
model = AdaBoostClassifier(**boost_params).fit(X_train, y_train)

# Predict the held-out rows and report the metrics
y_pred = model.predict(X_test)
evaluate_model(y_pred, y_test)