In [1]:
'''
Conventions used by the configuration in this cell:

dictionary = entries belong to their key, are kwargs, or have a certain order
list       = collection of several parallel operations

blocks -> first-level keys (THESE HAVE A SET ORDER)
Below the first-level keys -> whether the order you provide matters depends on the container:
    - dict = order matters
    - list = order does not matter

'''
import sys
import os
#print(os.getcwd())
sys.path.append("../")
from sklearn.preprocessing import *
from sklearn.model_selection import *
from scipy.stats import *
import pandas as pd
from sklearn.metrics import *
from ml_airflow.static.preprocessing import *
from ml_airflow.static.splitting import *
from ml_airflow.static.feature_engineering import *
from ml_airflow.static.eda import *
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from ml_airflow.dag.layers import DagLayer

# Pipeline configuration; it is later placed under the 'config' key of `dag_args`.
# Each first-level key is one pipeline block. See the conventions noted at the top of
# this cell for how nested dicts and lists are meant to be read.
# Every DagLayer spec below has the shape {label: {callable: kwargs-dict or None}}.
# NOTE(review): None presumably means "call with default arguments" — confirm against DagLayer.
kwargs = \
{   
     # Candidate classifiers, keyed by a short label; the inner dict maps the estimator
     # class to the keyword arguments listed for it.
     'modeling': { 
                    'models': DagLayer({'LOG': {LogisticRegression: None}, 
                                       'GNB': {GaussianNB: None},
                                       'RF':{RandomForestClassifier: {'n_estimators': 10}},
                                       'MLP': {MLPClassifier: {'activation': 'tanh'}},
                                       'SVM': {SVC: {'kernel': 'linear'}},
                                       'KNN': {KNeighborsClassifier: {'n_neighbors':3 }}}),              
                 },
 
     # Single data source: the attrition CSV URL, associated with pd.read_csv.
     'data_sources': [DagLayer({'https://raw.githubusercontent.com/SamShowalter/WMP_training/01_EDA/data/IBM_Employee_Attrition.csv': {pd.read_csv:None}})],

# Expects a flat dataset.
# One split layer: target column "Attrition", test ratio 0.2, random_state fixed at 42.
    'splitting': [
                        DagLayer({'split': {mla_train_test_split  : {'target': "Attrition",
                                                                        'test_ratio': 0.2,
                                                                         'random_state': 42}}}),
                 ],
 
     
     # Expects a flat dataset.
        'preprocessing': { 
                        
                        # Two cleansing layers: winsorize three named columns, then median imputation.
                        # NOTE(review): 'limits' presumably gives the lower/upper fractions to clip —
                        # confirm against mla_winsorize.
                        'cleansing': [   
                                              DagLayer({'winsorize': {mla_winsorize: {'limits' : [0.05, 0.05],
                                                                                      'col_names': ['MonthlyRate',
                                                                                                    'NumCompaniesWorked',
                                                                                                   'PercentSalaryHike']}}}),
                                              DagLayer({'median_impute': {mla_impute : {'method': "median"}}})
                                     ]
                        
                     },
    # Expects a flat dataset.
    # Keys are a single column name or a tuple of column names; the value names the
    # transformation callable(s) configured for those columns.
    'feature_engineering': {
                            'transformations': DagLayer({
                                    # Columns associated with pd.get_dummies.
                                    ('Department', 'EducationField', 'JobRole', "MaritalStatus"): {pd.get_dummies: None},
                                    # Explicit category -> integer mapping for BusinessTravel.
                                    'BusinessTravel': {create_ordinal_df: {'ordinal_dict':{"Non-Travel" :0, 
                                                                                            "Travel_Rarely":1, 
                                                                                            "Travel_Frequently": 2},
                                                                          'tag': None}},
                                    # Two-valued columns, each listed with its pair of values.
                                    # NOTE(review): which value of each pair maps to True is decided inside
                                    # create_boolean_df — confirm.
                                    ('Attrition', 'Gender', 'OverTime') : {create_boolean_df: 
                                                                           {'boolean_names_and_values':
                                                                            {"Attrition": ["Yes", "No"],
                                                                            "Gender": ["Male", "Female"],
                                                                            "OverTime": ["Yes", "No"]}, 
                                                                            'tag' : None}},
                                    # Columns associated with normalize_values.
                                    ('Age', 'DistanceFromHome', "EnvironmentSatisfaction",
                                    'JobInvolvement', 'JobSatisfaction', "MonthlyIncome",
                                    'NumCompaniesWorked', 'PercentSalaryHike', "PerformanceRating",
                                    'RelationshipSatisfaction', 'TotalWorkingYears',
                                    'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
                                    'YearsInCurrentRole', 'YearsSinceLastPromotion',
                                    'YearsWithCurrManager') : {normalize_values: {'tag': None}},
                                    # MonthlyRate lists two callables in one dict: the box-cox transformation
                                    # first, normalize_values second.
                                    # NOTE(review): presumably applied in this order, per the dict-order
                                    # convention noted at the top of the cell — confirm.
                                    'MonthlyRate': {mla_linear_transformation: {'method': boxcox},
                                                    normalize_values: {'tag': None}},
                               })
                            
                           },
 
     # Expects predictions.
     # Metric callables keyed by a short label; none has extra keyword arguments configured.
     'evaluation':{'metrics' : DagLayer({'acc': {accuracy_score: None}, 
                                        'f1': {f1_score: None}, 
                                        'precision': {precision_score: None},
                                        'conf_matrix': {confusion_matrix: None}, 
                                        'class_report': {classification_report: None},
                                        'recall': {recall_score :None}})
                  }
   

}

In [2]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/SamShowalter/WMP_training/01_EDA/data/IBM_Employee_Attrition.csv')

In [3]:
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   1
Department                  0
DistanceFromHome            2
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              1
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              1
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               2
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [41]:
from sklearn.datasets import load_iris

In [42]:
# Load the iris dataset once and reuse the same object for both the feature
# matrix and the target vector (the original called load_iris() twice).
iris = load_iris()
df = pd.DataFrame(iris.data, columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"])
df['flower_label'] = iris.target

In [44]:
# Translate the integer class codes in `flower_label` into species names.
flower_dict = {
    0: 'setosa',
    1: 'versicolor',
    2: 'virginica',
}
df['flower_label'] = df['flower_label'].replace(flower_dict)
#df.to_csv("airbender_iris_demo.csv", index = False)

In [49]:
df.flower_label

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: flower_label, Length: 150, dtype: object

In [46]:
from sklearn.preprocessing import LabelEncoder

def encode_labels(data, prefit = None):
    """Encode labels with a ``LabelEncoder``.

    Parameters
    ----------
    data : sequence of labels to encode.
    prefit : dict or None
        When truthy, the encoder stored under ``prefit['label_encoder']`` is
        used to transform ``data`` and no new encoder is fitted.

    Returns
    -------
    With ``prefit``: only the transformed labels.
    Without ``prefit``: a tuple ``(encoded_labels, {'label_encoder': encoder})``
    where ``encoder`` is the newly fitted ``LabelEncoder``.
    """
    # Reuse path: an existing encoder was supplied, so only transform.
    if prefit:
        return prefit['label_encoder'].transform(data)

    # Fit path: learn the label mapping from `data` and hand the encoder back
    # so the caller can reuse it later via `prefit`.
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(data)
    return encoded, {'label_encoder': encoder}

In [48]:
encode_labels(df.flower_label)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 {'label_encoder': LabelEncoder()})

In [56]:
# Commented-out scratch: chains winsorize -> box-cox transformation -> normalization
# on the MonthlyRate column, discarding the intermediate artifacts.
# tt, artifacts = mla_winsorize(df.MonthlyRate)
# ss, artifacts = mla_linear_transformation(tt, method = boxcox)
# rr, artifacts = normalize_values(ss)
from datetime import datetime
# NOTE(review): `datetime.t` looks like an unfinished tab-completion. The datetime class
# has no attribute `t`, so this line raises AttributeError when the cell runs —
# confirm the intended attribute or remove the line.
datetime.t

AttributeError: 'datetime.datetime' object has no attribute 'datetime'

In [26]:
# Scratch check of what `sum()` returns for an empty DataFrame.
# NOTE(review): this rebinds the shared name `df` to an empty frame, so in a
# top-to-bottom run any later cell that uses `df` receives the empty frame rather
# than a previously loaded dataset — consider a dedicated name.
df = pd.DataFrame()
df.sum()

Series([], dtype: float64)

AttributeError: module 'pandas' has no attribute 'sum'

In [6]:
def mla_impute(data, method = "median", prefit = None):
    """Fill missing values in ``data`` with its median or mean.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Data whose missing values should be filled.
    method : str
        ``'median'`` (default) or ``'mean'``. Only consulted when ``prefit``
        is not supplied.
    prefit : dict or None
        Artifacts returned by an earlier call. When truthy, the values stored
        under ``prefit['fill_na_vals']`` are used and nothing is recomputed.

    Returns
    -------
    With ``prefit``: only the filled data.
    Without ``prefit``: a tuple ``(filled_data, {'fill_na_vals': values})``
    so the same fill values can be reapplied to other data.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'median'`` nor ``'mean'`` and no ``prefit``
        is given.
    """
    # Reuse path: apply previously computed fill values unchanged.
    if prefit:
        return data.fillna(prefit['fill_na_vals'])

    if method not in ('median', 'mean'):
        # Previously this fell through to fillna(None), which fails with an
        # unrelated-looking message; fail early with the actual cause instead.
        raise ValueError(
            "Unsupported imputation method %r; expected 'median' or 'mean'" % (method,)
        )

    # For a DataFrame, restrict the statistic to numeric columns: pandas >= 2.0
    # raises TypeError on object columns instead of skipping them. A Series is
    # reduced as-is.
    stat_kwargs = {'numeric_only': True} if isinstance(data, pd.DataFrame) else {}
    if method == 'median':
        fill_na_vals = data.median(**stat_kwargs)
    else:
        fill_na_vals = data.mean(**stat_kwargs)

    data = data.fillna(fill_na_vals)
    return data, {'fill_na_vals': fill_na_vals}


In [7]:
mla_impute(df)

(      Age Attrition     BusinessTravel  DailyRate              Department  \
 0      41       Yes      Travel_Rarely     1102.0                   Sales   
 1      49        No  Travel_Frequently      279.0  Research & Development   
 2      37       Yes      Travel_Rarely     1373.0  Research & Development   
 3      33        No  Travel_Frequently     1392.0  Research & Development   
 4      27        No      Travel_Rarely      591.0  Research & Development   
 ...   ...       ...                ...        ...                     ...   
 1465   36        No  Travel_Frequently      884.0  Research & Development   
 1466   39        No      Travel_Rarely      613.0  Research & Development   
 1467   27        No      Travel_Rarely      155.0  Research & Development   
 1468   49        No  Travel_Frequently     1023.0                   Sales   
 1469   34        No      Travel_Rarely      628.0  Research & Development   
 
       DistanceFromHome  Education EducationField  EmployeeCou

In [8]:
import pandas as pd
import os
import sys
sys.path.append("../ml_airflow/static")
sys.path.append("../ml_airflow/dag")

In [9]:
from generator import DagGenerator
from datetime import datetime



In [10]:
# Arguments for DagGenerator: the DAG's name, its owner/notification settings with
# empty operator argument containers, and the pipeline configuration built earlier.
dag_args = dict(
    dag_name="ML_Airflow_test_creation_full",
    dag=dict(
        owner='Sam Showalter',
        email=['sshowalter@wmp.com'],
        op_args={},
        op_kwargs={},
        params={},
    ),
    config=kwargs,
)

In [11]:
# Build the generator from `dag_args` and ask it to generate its file.
# NOTE(review): the recorded output of this cell is
# `NameError: name 'op_name' is not defined`. `op_name` does not appear anywhere in
# this notebook, so the failure presumably originates in the imported generator
# code — confirm and fix there.
dg = DagGenerator(dag_args)
dg.generate_file()

NameError: name 'op_name' is not defined