<a href="https://colab.research.google.com/github/RaoulDuke337/data_analysis_portfoio/blob/main/hr_promoted_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Разведочный анализ HR-данных (EDA)

Данные для анализа были взяты с Kaggle

Ссылка на датасет: https://www.kaggle.com/datasets/shivan118/hranalysis

**Описание данных**

В наборе данных около 55 000 строк (54 808) и 14 столбцов; целевой показатель — продвижение сотрудников по карьерной лестнице.

**Описание столбцов**

* employee_id – идентификатор сотрудника
* department – название департамента
* region – регион
* education – образование
* gender – пол
* recruitment_channel – канал поиска
* no_of_trainings – кол-во пройденных тренингов
* age - возраст
* previous_year_rating – рейтинг сотрудника в прошлом году
* length_of_service - продолжительность работы
* KPIs_met >80% - флаг выполнения KPI > 80 %
* awards_won? - флаг наличия наград
* avg_training_score - среднее кол-во баллов в тестах
* is_promoted - флаг повышения


**Цель анализа**

Найти корреляции тех или иных переменных с продвижением сотрудника, предположить зависимости. 

**Импорт библиотек**

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import phik 
from phik.report import plot_correlation_matrix
from phik import report

**Загрузка данных**

In [2]:
! gdown --id 1fMR4-AD4bs-0m-HHyRSM9zmSJUUCwtq2

Downloading...
From: https://drive.google.com/uc?id=1fMR4-AD4bs-0m-HHyRSM9zmSJUUCwtq2
To: /content/hr_employee_promotion.csv
100% 3.76M/3.76M [00:00<00:00, 181MB/s]


In [4]:
# Load the HR promotion dataset downloaded by the gdown cell.
# gdown is invoked without an output path, so the file lands in the working
# directory; a relative path is used instead of the Colab-only '/content/...'
# prefix so the notebook also runs outside Colab.
df = pd.read_csv('hr_employee_promotion.csv')
df

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54803,3030,Technology,region_14,Bachelor's,m,sourcing,1,48,3.0,17,0,0,78,0
54804,74592,Operations,region_27,Master's & above,f,other,1,37,2.0,6,0,0,56,0
54805,13918,Analytics,region_1,Bachelor's,m,other,1,27,5.0,3,1,0,79,0
54806,13614,Sales & Marketing,region_9,,m,sourcing,1,29,1.0,2,0,0,45,0


**Общая информация о данных и проценте пропущенных значений**

In [7]:
# Column dtypes and non-null counts for every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  KPIs_met >80%         54808 non-null  int64  
 11  awards_won?           54808 non-null  int64  
 12  avg_training_score    54808 non-null  int64  
 13  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB


In [8]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,50684.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,39195.830627,1.253011,34.803915,3.329256,5.865512,0.351974,0.023172,63.38675,0.08517
std,22586.581449,0.609264,7.660169,1.259993,4.265094,0.47759,0.15045,13.371559,0.279137
min,1.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0


In [9]:
# Fraction (0-1) of missing values per column, largest first
na_share = df.isna().mean()
na_share.sort_values(ascending=False)

previous_year_rating    0.075244
education               0.043953
employee_id             0.000000
department              0.000000
region                  0.000000
gender                  0.000000
recruitment_channel     0.000000
no_of_trainings         0.000000
age                     0.000000
length_of_service       0.000000
KPIs_met >80%           0.000000
awards_won?             0.000000
avg_training_score      0.000000
is_promoted             0.000000
dtype: float64