#### Importando as bibliotecas

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

#### Carregando a base de dados.

In [73]:
path_data = "hr_employee_churn_data.csv"

In [74]:
df = pd.read_csv(path_data)

In [52]:
df.head()

Unnamed: 0,empid,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,1,0.38,0.53,2,157,3,0,0,low,1
1,2,0.8,0.86,5,262,6,0,0,medium,1
2,3,0.11,0.88,7,272,4,0,0,medium,1
3,4,0.72,0.87,5,223,5,0,0,low,1
4,5,0.37,0.52,2,159,3,0,0,low,1


#### Verificando a forma dos dados.

In [4]:
df.shape

(14999, 10)

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   empid                  14999 non-null  int64  
 1   satisfaction_level     14997 non-null  float64
 2   last_evaluation        14999 non-null  float64
 3   number_project         14999 non-null  int64  
 4   average_montly_hours   14999 non-null  int64  
 5   time_spend_company     14999 non-null  int64  
 6   Work_accident          14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   salary                 14999 non-null  object 
 9   left                   14999 non-null  int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 1.1+ MB


#### Drop features irrelevantes.

In [54]:
df.drop(['empid'], axis=1, inplace=True)

In [55]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,0.38,0.53,2,157,3,0,0,low,1
1,0.8,0.86,5,262,6,0,0,medium,1
2,0.11,0.88,7,272,4,0,0,medium,1
3,0.72,0.87,5,223,5,0,0,low,1
4,0.37,0.52,2,159,3,0,0,low,1


#### Manipulando missing values.

In [56]:
df.isnull().sum()

satisfaction_level       2
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
salary                   0
left                     0
dtype: int64

#### Preechendo registros com o valor da média.

In [57]:
df['satisfaction_level'].fillna(df['satisfaction_level'].mean(), inplace=True)

In [58]:
df.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
salary                   0
left                     0
dtype: int64

#### Manipulando variáveis categóricas.

In [60]:
df['salary'].unique()

array(['low', 'medium', 'high'], dtype=object)

In [61]:
salary_dummies = pd.get_dummies(df['salary'],drop_first=True)

In [62]:
salary_dummies

Unnamed: 0,low,medium
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0
...,...,...
14994,1,0
14995,1,0
14996,1,0
14997,1,0


In [63]:
df = pd.concat([df, salary_dummies],axis=1)

In [65]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left,low,medium
0,0.38,0.53,2,157,3,0,0,low,1,1,0
1,0.8,0.86,5,262,6,0,0,medium,1,0,1
2,0.11,0.88,7,272,4,0,0,medium,1,0,1
3,0.72,0.87,5,223,5,0,0,low,1,1,0
4,0.37,0.52,2,159,3,0,0,low,1,1,0


#### Removendo a feature salary.

In [66]:
df.drop(['salary'], axis=1, inplace=True)

In [67]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left,low,medium
0,0.38,0.53,2,157,3,0,0,1,1,0
1,0.8,0.86,5,262,6,0,0,1,0,1
2,0.11,0.88,7,272,4,0,0,1,0,1
3,0.72,0.87,5,223,5,0,0,1,1,0
4,0.37,0.52,2,159,3,0,0,1,1,0


#### Separando os conjuntos de dados em treino e teste.

In [69]:
X = df.drop(labels='left',axis=1)
y = df['left']

In [70]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [71]:
len(X_train)

11999

In [23]:
len(X_test)

3000

#### Selecionando o modelo.

In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [25]:
#create param
model_param = {
    'RandomForestClassifier':{
        'model':RandomForestClassifier(),
        'param':{
            'n_estimators': [10, 50, 100, 130], 
            'criterion': ['gini', 'entropy'],
            'max_depth': range(2, 4, 1),
        }
    },
    'SVMClassifier':{
        'model': SVC(),
        'param':{
            'C': [1.0, 1.5, 2],
            'gamma': ['scale', 'auto']
        }
    }
}

In [26]:
scores =[]
for model_name, mp in model_param.items():
    model_selection = GridSearchCV(estimator=mp['model'],param_grid=mp['param'],cv=5,return_train_score=False)
    model_selection.fit(X,y)
    scores.append({
        'model': model_name,
        'best_score': model_selection.best_score_,
        'best_params': model_selection.best_params_
    })
    

### Verificando os scores e o melhor estimator

In [27]:
scores

[{'model': 'RandomForestClassifier',
  'best_score': 0.925530799155274,
  'best_params': {'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 10}},
 {'model': 'SVMClassifier',
  'best_score': 0.9598643992441925,
  'best_params': {'C': 2, 'gamma': 'auto'}}]

In [34]:
model_selection.best_estimator_

SVC(C=2, gamma='auto')

#### Construindo o modelo com o melhor estimator

In [41]:
model = model_selection.best_estimator_

In [43]:
model.fit(X,y)

SVC(C=2, gamma='auto')

#### Persistindo os artefatos

In [45]:
import pickle,json
pickle.dump(model,open('model.pkl','wb'))

In [47]:
scores[0]

{'model': 'RandomForestClassifier',
 'best_score': 0.925530799155274,
 'best_params': {'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 10}}

In [49]:
with open('scores.json', 'w') as f:
    json.dump(scores[0], f)

In [50]:
!ls

50_Startups.csv
End to End ML Project 1 - P1 - Problem Statement and Solution Design.pdf
End to End ML Project 1 - P2 - Exploratory Data Analysis .ipynb
End to End ML Project 1 - P3 - Model Selection and Building.ipynb
End to End ML Project 1 - P6 - Deployment in AWS EC2 with NGINX, Guinicorn, Supervisor.pdf
End to End ML Project 2 - P1 - Exploratory Data Analysis.ipynb
End to End ML Project 2 - P1 - Problem Statement, Solution Design.pdf
End to End ML Project 2 - P2 - Feature Engineering and Model Building.ipynb
[34mEndtoEndML_v11[m[m
[34mEndtoEnd_Project2_v1[m[m
README.md
columns.json
hr_employee_churn_data.csv
model.pkl
profit_prediction_model.pkl
scores.json
