# Airbender Tutorial 2: Employee Attrition Analysis

This overview will guide you through how to orchestrate your own machine learning experiments with the airbender tool.

Author: Sam Showalter

First, let's bring in the general imports.

## Notes

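A few quick notes on the format of the configuration:

- Each `DagLayer` below wraps a dictionary.
- The keys of that dictionary are labels: a file path, a task name, a column name, or a tuple of column names.
- Each value maps one or more callables to a dictionary of parameters, or to `None` when no parameters are given.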


In [1]:
import pandas as pd
import os
import sys
# Show the working directory so the relative paths used below are easy to verify.
print(os.getcwd())
# Add the directory two levels up to the import path; presumably this is the
# repository root, so that the `airbender` package imported below resolves.
sys.path.append("../../")

#Airflow DAG specific imports
from airbender.dag.layers import DagLayer

/home/sshowalter/repos/airbender/tutorials/attrition


## Data Sources

In [2]:
import pandas as pd

# One data source: the demo CSV, paired with pd.read_csv and no extra
# parameters (None).
data_sources = [
    DagLayer(
        {
            "./airbender_attrition_demo.csv": {pd.read_csv: None},
        }
    )
]


## Preprocessing

In [3]:
from sklearn.preprocessing import *
from airbender.static.preprocessing import *

# Preprocessing stage with one sub-layer, 'cleansing', holding a single task:
# `impute` configured with method="median".
preprocessing = {
    "cleansing": DagLayer(
        {
            "median_impute": {impute: {"method": "median"}},
        }
    ),
}

## Feature Engineering

In [4]:
from sklearn.metrics import *
from airbender.static.feature_engineering import *
from scipy.stats import *

# Feature-engineering stage with one sub-layer, 'transformations'.
# Each key is a column name or a tuple of column names; each value maps the
# callables listed for those columns to their parameters (None when none are
# given).
feature_engineering = {
    "transformations": DagLayer(
        {
            # Categorical columns paired with pd.get_dummies.
            ("Department", "EducationField", "JobRole", "MaritalStatus"): {
                pd.get_dummies: None,
            },
            # Travel frequency paired with an explicit label -> integer mapping.
            "BusinessTravel": {
                create_ordinal_df: {
                    "ordinal_dict": {
                        "Non-Travel": 0,
                        "Travel_Rarely": 1,
                        "Travel_Frequently": 2,
                    },
                },
            },
            # Two-valued columns, each listed with its pair of values.
            ("Attrition", "Gender", "OverTime"): {
                create_boolean_df: {
                    "boolean_names_and_values": {
                        "Attrition": ["Yes", "No"],
                        "Gender": ["Male", "Female"],
                        "OverTime": ["Yes", "No"],
                    },
                },
            },
            # Numeric columns that are only normalized.
            (
                "Age",
                "DistanceFromHome",
                "EnvironmentSatisfaction",
                "JobInvolvement",
                "JobSatisfaction",
                "MonthlyIncome",
                "PerformanceRating",
                "RelationshipSatisfaction",
                "TotalWorkingYears",
                "TrainingTimesLastYear",
                "WorkLifeBalance",
                "YearsAtCompany",
                "YearsInCurrentRole",
                "YearsSinceLastPromotion",
                "YearsWithCurrManager",
            ): {normalize_values: None},
            # Numeric columns listed with winsorize (5% limits on each side)
            # followed by normalize_values.
            ("NumCompaniesWorked", "PercentSalaryHike"): {
                winsorize: {"limits": [0.05, 0.05]},
                normalize_values: None,
            },
            # MonthlyRate additionally gets linear_transformation with boxcox
            # between the winsorize and normalize_values entries.
            "MonthlyRate": {
                winsorize: {"limits": [0.05, 0.05]},
                linear_transformation: {"method": boxcox},
                normalize_values: None,
            },
        }
    ),
}

## Splitting

In [5]:
from airbender.static.splitting import *

# Splitting stage: a single 'split' task that pairs `train_test_split` with
# "Attrition" as the target, a 0.2 test ratio, and a fixed random_state.
splitting = DagLayer(
    {
        "split": {
            train_test_split: {
                "target": "Attrition",
                "test_ratio": 0.2,
                "random_state": 42,
            },
        },
    }
)

## Modeling

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Candidate classifiers, keyed by a short label. Each value maps the estimator
# class to its parameters (None when none are given); the parameter names match
# the estimators' constructor arguments.
# RandomForestClassifier and MLPClassifier are stochastic, so both get a fixed
# random_state (the same seed the splitting step uses) to keep results
# repeatable from run to run.
modeling = {
    "modeling": DagLayer(
        {
            "LOG": {LogisticRegression: None},
            "GNB": {GaussianNB: None},
            "RF": {RandomForestClassifier: {"n_estimators": 10, "random_state": 42}},
            "MLP": {MLPClassifier: {"activation": "tanh", "random_state": 42}},
            "SVM": {SVC: {"kernel": "linear"}},
            "KNN": {KNeighborsClassifier: {"n_neighbors": 3}},
        }
    )
}

## Evaluation

This piece is not finished yet. It works, but it is messy and breaks some design patterns. More to come on this.

In [7]:
# Evaluation stage: expects predictions.
# NOTE(review): the metric functions are not imported in this cell; presumably
# they come from the `from sklearn.metrics import *` in the Feature Engineering
# cell, so that cell must run first.
evaluation = {
    "metrics": DagLayer(
        {
            "acc": {accuracy_score: None},
            "f1": {f1_score: None},
            "precision": {precision_score: None},
            "conf_matrix": {confusion_matrix: None},
            "class_report": {classification_report: None},
            "recall": {recall_score: None},
        }
    )
}

## Consolidate all Configurations

In [8]:
# Gather every stage defined above into one experiment configuration.
# The keys are listed in the same order as before; the generator output further
# down shows the layers being displayed in a different, pipeline order.
attrition_config = {
    "modeling": modeling,
    "data_sources": data_sources,
    "splitting": splitting,
    "preprocessing": preprocessing,
    "feature_engineering": feature_engineering,
    "evaluation": evaluation,
}

# Generate as Code

In [9]:
from airbender.dag.generator import DagGenerator
from datetime import datetime

In [10]:
# Top-level configuration handed to the DAG generator.
airbender_config = {
    # Name of the DAG; the generated file name shown in the output below
    # starts with this value.
    "dag_name": "Airbender_Attrition_Tutorial",
    # DAG-level settings. Only the owner is set; the remaining options are
    # left commented out.
    "dag": {
        "owner": "airbender",
        # 'email': [<EMAIL>, <EMAIL>, ...],
        # 'op_args':{},
        # 'op_kwargs': {},
    },
    # The experiment definition assembled in the previous section.
    "config": attrition_config,
}

## Run the DAG Generator with your arguments

In [11]:
# Build the generator from the full configuration, then generate the file;
# the output below reports the ordered layers and the generated file's name.
dg = DagGenerator(airbender_config)
dg.generate_file()


Displaying Ordered Dag Layers with Tags:

0 ['data_sources']
1 ['splitting']
2 ['preprocessing', 'cleansing']
3 ['feature_engineering', 'transformations']
4 ['modeling', 'modeling']
5 ['evaluation', 'metrics']

Generated airflow file with name: Airbender_Attrition_Tutorial_airbender_10-28-2019--13.38.03.py
