# Airflow Machine Learning Automation Tool Tutorial

This overview will guide you through how to orchestrate your own machine learning experiments with the airflow_ml tool.

Author: Sam Showalter

First, let's bring in the general imports.

## Notes

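A few quick notes on the format of the configuration: each stage below is expressed as a `DagLayer` (or a list or dictionary of them) wrapping a dictionary. Its keys are a task name or the column name(s) to operate on, and its values have the form `{callable: keyword_arguments}`, with `None` where no keyword arguments are supplied.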


In [1]:
# General-purpose imports used by the cells below
import pandas as pd
import os
import sys
# Show the working directory so the relative path appended below can be sanity-checked
print(os.getcwd())
# Put the parent directory on the module search path before importing wmp_ml;
# presumably this is what makes the local wmp_ml package importable — TODO confirm
sys.path.append("../")

#Airflow DAG specific imports
from wmp_ml.dag.layers import DagLayer

/home/sshowalter/repos/ML-Airflow/notebooks


## Data Sources

In [2]:
import pandas as pd

# A single data source layer: the IBM employee-attrition CSV, keyed by its URL
# and paired with the reader callable (pd.read_csv, no keyword arguments given).
data_sources = [
    DagLayer(
        {
            'https://raw.githubusercontent.com/SamShowalter/WMP_training/01_EDA/data/IBM_Employee_Attrition.csv': {
                pd.read_csv: None,
            },
        }
    ),
]


## Preprocessing

In [3]:
from sklearn.preprocessing import *
from wmp_ml.static.preprocessing import *

# Two cleansing layers, listed in this order: winsorize three columns with
# 0.05 limits on each side, then a median-method impute.
preprocessing = {
    'cleansing': [
        DagLayer({
            'winsorize': {
                wmp_winsorize: {
                    'limits': [0.05, 0.05],
                    'col_names': [
                        'MonthlyRate',
                        'NumCompaniesWorked',
                        'PercentSalaryHike',
                    ],
                },
            },
        }),
        DagLayer({
            'median_impute': {
                wmp_impute: {'method': "median"},
            },
        }),
    ],
}

## Feature Engineering

In [4]:
from sklearn.metrics import *
from wmp_ml.static.splitting import *
from wmp_ml.static.feature_engineering import *
from scipy.stats import *

# One 'transformations' layer. Each key is a column name (or a tuple of column
# names); each value maps transformer callables to their keyword arguments.
feature_engineering = {
    'transformations': DagLayer({
        # Nominal columns handed to pd.get_dummies
        ('Department', 'EducationField', 'JobRole', "MaritalStatus"): {
            pd.get_dummies: None,
        },
        # Travel frequency mapped onto an explicit 0/1/2 ordering
        'BusinessTravel': {
            create_ordinal_df: {
                'ordinal_dict': {
                    "Non-Travel": 0,
                    "Travel_Rarely": 1,
                    "Travel_Frequently": 2,
                },
                'tag': None,
            },
        },
        # Two-valued columns, each with the pair of values it contains
        ('Attrition', 'Gender', 'OverTime'): {
            convert_boolean_df: {
                'boolean_names_and_values': {
                    "Attrition": ["Yes", "No"],
                    "Gender": ["Male", "Female"],
                    "OverTime": ["Yes", "No"],
                },
                'tag': None,
            },
        },
        # Numeric columns passed to normalize_values
        (
            'Age',
            'DistanceFromHome',
            "EnvironmentSatisfaction",
            'JobInvolvement',
            'JobSatisfaction',
            "MonthlyIncome",
            'NumCompaniesWorked',
            'PercentSalaryHike',
            "PerformanceRating",
            'RelationshipSatisfaction',
            'TotalWorkingYears',
            'TrainingTimesLastYear',
            'WorkLifeBalance',
            'YearsAtCompany',
            'YearsInCurrentRole',
            'YearsSinceLastPromotion',
            'YearsWithCurrManager',
        ): {
            normalize_values: {'tag': None},
        },
        # MonthlyRate gets two entries, listed in this order:
        # wmp_linear_transformation (method=boxcox), then normalize_values
        'MonthlyRate': {
            wmp_linear_transformation: {'method': boxcox},
            normalize_values: {'tag': None},
        },
    }),
}

## Splitting

In [5]:
from sklearn.model_selection import *

# Split configuration: "Attrition" as the target, a test_ratio of 0.2,
# and a fixed random_state.
splitting = DagLayer({
    'split': {
        auto_train_test_split: {
            'target': "Attrition",
            'test_ratio': 0.2,
            'random_state': 42,
        },
    },
})

## Modeling

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Candidate classifiers, each keyed by a short label and mapped to
# {estimator class: constructor keyword arguments}.
# NOTE(review): this layer is keyed 'metrics' (the same key the evaluation
# config uses), while the recorded generator output lists a 'models' layer —
# confirm which key DagGenerator expects.
modeling = {
    'metrics': DagLayer({
        'LOG': {LogisticRegression: None},
        'GNB': {GaussianNB: None},
        'RF': {RandomForestClassifier: {'n_estimators': 10}},
        'MLP': {MLPClassifier: {'activation': 'tanh'}},
        'SVM': {SVC: {'kernel': 'linear'}},
        'KNN': {KNeighborsClassifier: {'n_neighbors': 3}},
    }),
}

## Evaluation (STILL IN EXPERIMENTAL MODE)

This piece is not finished yet, but it will look something like this:

In [7]:
# Expects predictions.
# Each entry pairs a short label with a metric callable; no keyword
# arguments are supplied for any of them.
evaluation = {
    'metrics': DagLayer({
        'acc': {accuracy_score: None},
        'f1': {f1_score: None},
        'precision': {precision_score: None},
        'conf_matrix': {confusion_matrix: None},
        'class_report': {classification_report: None},
        'recall': {recall_score: None},
    }),
}

## Consolidate all Configurations

In [8]:
from wmp_ml.static.eda import *

# Collect the per-stage configurations into one dictionary; it is passed to
# the generator as the 'config' entry of dag_args.
kwargs = {
    'modeling': modeling,
    'data_sources': data_sources,
    'splitting': splitting,
    'preprocessing': preprocessing,
    # Fixed: this previously referenced the undefined name `feature_engineering1`,
    # which raises NameError on a fresh kernel; the configuration defined in the
    # Feature Engineering cell is named `feature_engineering`.
    'feature_engineering': feature_engineering,
    # Evaluation is still experimental, so it stays disabled for now.
    #'evaluation' : evaluation
}

# Generate as Code

In [9]:
from wmp_ml.dag.generator import DagGenerator
from datetime import datetime

In [10]:
# Arguments handed to DagGenerator: the DAG's name, the owner/email plus empty
# op_args/op_kwargs/params placeholders under 'dag', and the consolidated
# stage configuration under 'config'.
dag_args = {
    'dag_name': "WMP_ML_test_creation_full",
    'dag': {
        'owner': 'Sam Showalter',
        'email': ['sshowalter@wmp.com'],
        'op_args': {},
        'op_kwargs': {},
        'params': {},
    },
    'config': kwargs,
}

## Run the DAG Generator with your arguments

In [11]:
# Build the generator from the assembled arguments.
dg = DagGenerator(dag_args)
# Presumably emits the generated DAG as a source file — TODO confirm against DagGenerator.
# NOTE(review): the output saved with this cell ends in
# "IndexError: list index out of range", so the last recorded run did not
# complete; re-run on a fresh kernel to verify.
dg.generate_file()

{'models': <wmp_ml.dag.layers.DagLayer object at 0x7fa60c45b278>, 'data_sources': <wmp_ml.dag.layers.DagLayer object at 0x7fa6497950f0>, 'splitting': <wmp_ml.dag.layers.DagLayer object at 0x7fa60c42d940>, 'cleansing_10': <wmp_ml.dag.layers.DagLayer object at 0x7fa616b549b0>, 'cleansing_11': <wmp_ml.dag.layers.DagLayer object at 0x7fa616b43518>, 'transformations': <wmp_ml.dag.layers.DagLayer object at 0x7fa60c45b7f0>}


IndexError: list index out of range