In [1]:
import numpy as np              # Массивы (матрицы, векторы, линейная алгебра)
import matplotlib.pyplot as plt # Научная графика
%matplotlib inline 
    # Говорим jupyter'у, чтобы весь графический вывод был в браузере, а не в отдельном окне
import pandas as pd             # Таблицы и временные ряды (dataframe, series)
import seaborn as sns           # Еще больше красивой графики для визуализации данных
import sklearn                  # Алгоритмы машинного обучения

# Задача о продвижении сотрудника по службе

## Основная информация
В этом наборе данных представлены сведения о работниках некоторой компании. Для каждого сотрудника указано:

* `employee_id` - идентификатор сотрудника
* `department` - отдел
* `region` - регион
* `education` - ступень образования
* `gender` - пол
* `recruitment_channel` - канал найма
* `no_of_trainings` - количество пройденных тренингов
* `age` - возраст
* `previous_year_rating` - рейтинг сотрудника за прошлый год
* `length_of_service` - стаж работы
* `KPIs_met >80%` - выполнены ли ключевые показатели эффективности более чем на 80 %
* `awards_won?` - получал ли сотрудник награды
* `avg_training_score` - средний балл на тренингах
* `is_promoted` - был ли сотрудник повышен в должности


Некоторое количество сотрудников каждый год получает продвижение по службе (примерно 5%). 
Задача заключается в том, чтобы на основе доступных данных определить, достоин ли сотрудник повышения.

## Загружаем данные

In [2]:
# Location of the training CSV in the author's GitHub repository.
url = "https://raw.githubusercontent.com/NikolaySokolov152/MachineLearning/master/dataset/train.csv"

# Download the CSV over HTTP and parse it into a DataFrame.
data_raw = pd.read_csv(url)

## Обработка пропущенных значений (или проверка, что их нет)

In [3]:
# Count the missing (NaN) values in every column.
data_raw.isna().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [4]:
# Fill gaps in the numeric columns with the per-column median.
# numeric_only=True is required: the frame still contains string columns
# (department, region, education, ...) and pandas >= 2.0 raises TypeError
# when asked for their median instead of silently skipping them.
data_raw = data_raw.fillna(data_raw.median(numeric_only=True))

# Fill the categorical 'education' column with its most frequent value.
# The result is assigned back to the column: calling fillna(inplace=True) on
# data_raw['education'] is chained assignment, which pandas 2.2 warns about
# and which no longer updates the parent frame under Copy-on-Write.
data_raw['education'] = data_raw['education'].fillna(data_raw['education'].mode().iloc[0])

# Re-count the missing values to confirm that none are left.
data_raw.isna().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data_raw.describe()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,39195.830627,1.253011,34.803915,3.304481,5.865512,0.351974,0.023172,63.38675,0.08517
std,22586.581449,0.609264,7.660169,1.21477,4.265094,0.47759,0.15045,13.371559,0.279137
min,1.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0


## Обработка категориальных признаков

In [7]:
# List the column names of the raw frame.
data_raw.columns

Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'is_promoted'],
      dtype='object')

In [8]:
# Summary (count, unique, top, freq) of the string-typed (object) columns only.
data_raw.describe(include=['object'])

Unnamed: 0,department,region,education,gender,recruitment_channel
count,54808,54808,54808,54808,54808
unique,9,34,3,2,3
top,Sales & Marketing,region_2,Bachelor's,m,other
freq,16840,12343,39078,38496,30446


In [9]:
# Convert the five string-valued columns to pandas' categorical dtype in a
# single astype call; every other column keeps its current dtype.
data_raw = data_raw.astype(
    dict.fromkeys(
        ['department', 'region', 'education', 'gender', 'recruitment_channel'],
        'category',
    )
)

In [10]:
# Binary feature: replace the two gender categories with their integer codes.
# Requires 'gender' to already have the categorical dtype (set in the previous cell).
data_raw['gender'] = data_raw['gender'].cat.codes

# One-hot encode the multi-valued categorical columns.
# dtype=np.uint8 is pinned explicitly: since pandas 2.0 get_dummies returns
# bool columns by default, and the later min-max scaling subtracts columns,
# which numpy rejects for booleans ("numpy boolean subtract ... is not
# supported"). uint8 is the pre-2.0 default, so the 0/1 values are unchanged.
Department_dummies = pd.get_dummies(data_raw['department'], prefix='department', dtype=np.uint8)
Education_dummies = pd.get_dummies(data_raw['education'], prefix='education', dtype=np.uint8)
Recruitment_channel_dummies = pd.get_dummies(
    data_raw['recruitment_channel'], prefix='recruitment_channel', dtype=np.uint8
)
# No prefix here: the region values themselves already read 'region_<n>'.
Region_dummies = pd.get_dummies(data_raw['region'], dtype=np.uint8)

In [11]:
# Replace the four original categorical columns with their one-hot indicator
# columns: drop the source columns once, then append the indicator frames on
# the right in the order department, education, recruitment channel, region.
data_raw = pd.concat(
    [
        data_raw.drop(columns=['department', 'education', 'recruitment_channel', 'region']),
        Department_dummies,
        Education_dummies,
        Recruitment_channel_dummies,
        Region_dummies,
    ],
    axis=1,
)

In [12]:
# Preview the first five rows after the categorical columns were encoded.
data_raw.head()

Unnamed: 0,employee_id,gender,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,...,region_31,region_32,region_33,region_34,region_4,region_5,region_6,region_7,region_8,region_9
0,65438,0,1,35,5.0,8,1,0,49,0,...,0,0,0,0,0,0,0,1,0,0
1,65141,1,1,30,5.0,4,0,0,60,0,...,0,0,0,0,0,0,0,0,0,0
2,7513,1,1,34,3.0,7,0,0,50,0,...,0,0,0,0,0,0,0,0,0,0
3,2542,1,2,39,1.0,10,0,0,50,0,...,0,0,0,0,0,0,0,0,0,0
4,48945,1,1,45,3.0,2,0,0,73,0,...,0,0,0,0,0,0,0,0,0,0


## Нормализация

In [13]:
# Min-max scaling of every column: (value - column minimum) / (column maximum - column minimum).
# NOTE: the minima and maxima come from the whole data set (the train/test
# split happens later), and the id and target columns are scaled as well.
feature_min = data_raw.min(axis=0)
feature_range = data_raw.max(axis=0) - feature_min
data_stand = (data_raw - feature_min) / feature_range

# Summary statistics of the scaled frame.
data_stand.describe()

Unnamed: 0,employee_id,gender,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,...,region_31,region_32,region_33,region_34,region_4,region_5,region_6,region_7,region_8,region_9
count,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,...,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,0.500592,0.702379,0.028112,0.370098,0.57612,0.135153,0.351974,0.023172,0.406446,0.08517,...,0.035305,0.017242,0.004908,0.005328,0.031072,0.013976,0.012589,0.088363,0.011951,0.007663
std,0.288473,0.457216,0.067696,0.191504,0.303692,0.118475,0.47759,0.15045,0.222859,0.279137,...,0.184551,0.130173,0.069886,0.072797,0.173514,0.117392,0.111495,0.283825,0.108666,0.087204
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.251207,0.0,0.0,0.225,0.5,0.055556,0.0,0.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.500971,1.0,0.0,0.325,0.5,0.111111,0.0,0.0,0.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.750086,1.0,0.0,0.475,0.75,0.166667,1.0,0.0,0.616667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Preview the first five rows of the scaled frame.
data_stand.head()

Unnamed: 0,employee_id,gender,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,...,region_31,region_32,region_33,region_34,region_4,region_5,region_6,region_7,region_8,region_9
0,0.835754,0.0,0.0,0.375,1.0,0.194444,1.0,0.0,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.83196,1.0,0.0,0.25,1.0,0.083333,0.0,0.0,0.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.095942,1.0,0.0,0.35,0.5,0.166667,0.0,0.0,0.183333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.032453,1.0,0.111111,0.475,0.0,0.25,0.0,0.0,0.183333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.625107,1.0,0.0,0.625,0.5,0.027778,0.0,0.0,0.566667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Подготовка тестовой и тренировочной выборок

In [15]:
# List the columns of the scaled frame (original features plus indicator columns).
data_stand.columns

Index(['employee_id', 'gender', 'no_of_trainings', 'age',
       'previous_year_rating', 'length_of_service', 'KPIs_met >80%',
       'awards_won?', 'avg_training_score', 'is_promoted',
       'department_Analytics', 'department_Finance', 'department_HR',
       'department_Legal', 'department_Operations', 'department_Procurement',
       'department_R&D', 'department_Sales & Marketing',
       'department_Technology', 'education_Bachelor's',
       'education_Below Secondary', 'education_Master's & above',
       'recruitment_channel_other', 'recruitment_channel_referred',
       'recruitment_channel_sourcing', 'region_1', 'region_10', 'region_11',
       'region_12', 'region_13', 'region_14', 'region_15', 'region_16',
       'region_17', 'region_18', 'region_19', 'region_2', 'region_20',
       'region_21', 'region_22', 'region_23', 'region_24', 'region_25',
       'region_26', 'region_27', 'region_28', 'region_29', 'region_3',
       'region_30', 'region_31', 'region_32', 'region_33

In [16]:
# Count NaN values per column in the scaled frame.
data_stand.isna().sum()

employee_id                     0
gender                          0
no_of_trainings                 0
age                             0
previous_year_rating            0
length_of_service               0
KPIs_met >80%                   0
awards_won?                     0
avg_training_score              0
is_promoted                     0
department_Analytics            0
department_Finance              0
department_HR                   0
department_Legal                0
department_Operations           0
department_Procurement          0
department_R&D                  0
department_Sales & Marketing    0
department_Technology           0
education_Bachelor's            0
education_Below Secondary       0
education_Master's & above      0
recruitment_channel_other       0
recruitment_channel_referred    0
recruitment_channel_sourcing    0
region_1                        0
region_10                       0
region_11                       0
region_12                       0
region_13     

In [17]:
# Preview the first five rows of the scaled frame again
# (data_stand has not been modified since the previous preview).
data_stand.head()

Unnamed: 0,employee_id,gender,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,...,region_31,region_32,region_33,region_34,region_4,region_5,region_6,region_7,region_8,region_9
0,0.835754,0.0,0.0,0.375,1.0,0.194444,1.0,0.0,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.83196,1.0,0.0,0.25,1.0,0.083333,0.0,0.0,0.35,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.095942,1.0,0.0,0.35,0.5,0.166667,0.0,0.0,0.183333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.032453,1.0,0.111111,0.475,0.0,0.25,0.0,0.0,0.183333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.625107,1.0,0.0,0.625,0.5,0.027778,0.0,0.0,0.566667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Уберём неинформативный признак `employee_id`.

In [18]:
# Set the identifier column aside, then remove it from the feature frame.
# NOTE(review): emp_id is read from the scaled frame, so it holds min-max
# scaled values rather than the original employee ids - confirm this is intended.
emp_id = data_stand['employee_id']
data_stand = data_stand.drop(columns=['employee_id'])

In [19]:
# List the remaining columns after employee_id was dropped.
data_stand.columns

Index(['gender', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'is_promoted', 'department_Analytics',
       'department_Finance', 'department_HR', 'department_Legal',
       'department_Operations', 'department_Procurement', 'department_R&D',
       'department_Sales & Marketing', 'department_Technology',
       'education_Bachelor's', 'education_Below Secondary',
       'education_Master's & above', 'recruitment_channel_other',
       'recruitment_channel_referred', 'recruitment_channel_sourcing',
       'region_1', 'region_10', 'region_11', 'region_12', 'region_13',
       'region_14', 'region_15', 'region_16', 'region_17', 'region_18',
       'region_19', 'region_2', 'region_20', 'region_21', 'region_22',
       'region_23', 'region_24', 'region_25', 'region_26', 'region_27',
       'region_28', 'region_29', 'region_3', 'region_30', 'region_31',
       'region_32', 'region_33', 'region_34',

Разделение выборки на X (признаки) и Y (целевая переменная)

In [20]:
# X: every column except the target. Y: the target kept as a one-column
# DataFrame (double brackets), so its shape is (n_rows, 1).
X = data_stand.drop(columns=['is_promoted'])
Y = data_stand[['is_promoted']]

# Report both shapes.
for _label, _part in (("x:", X), ("y:", Y)):
    print(_label, _part.shape)

x: (54808, 57)
y: (54808, 1)


### Разделение выборки на тренировочную и тестовую

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=21,
)

# Report the shape of each of the four resulting parts.
for _label, _part in (
    ("x_train: ", x_train),
    ("x_test ", x_test),
    ("y_train: ", y_train),
    ("y_test: ", y_test),
):
    print(_label, _part.shape)

x_train:  (43846, 57)
x_test  (10962, 57)
y_train:  (43846, 1)
y_test:  (10962, 1)


## Запуск нейросети на 2 внутренних узлах 