In [1]:
import inspect

def is_fitted(model):
    """Checks if model object has any attributes ending with an underscore"""
    return 0 < len( [k for k,v in inspect.getmembers(model) if k.endswith('_') and not k.startswith('__')] )

In [2]:
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing.data import StandardScaler
from sklearn.naive_bayes import GaussianNB
import json
from airflow.operators.python_operator import PythonOperator
from collections import Counter
import pandas


In [3]:
def import_dynamically(obj):
    import_statement = None
    try:
        import_statement = "from {} import {}\n".format(
                                                      obj.__module__,
                                                      obj.__name__)
    except Exception as e:
            print("Dynamic import failed. Trying simpler import.")
            import_statement = 'import {}\n'.format(obj.__name__)
    
    return import_statement
    
import_dynamically(GaussianNB)
import_dynamically(LogisticRegression)

'from sklearn.linear_model.logistic import LogisticRegression\n'

In [4]:
# cd /mnt/c/Users/<ubuntu.username>/Pictures
import pandas as pd
pd.get_dummies.__module__
import pandas
from scipy.stats import *
from pandas.core.reshape.reshape import get_dummies
import inspect


In [5]:
# import_dynamically(find_missing_data)
# import_dynamically(normalize_values)

In [6]:
import sys
sys.path.append("../repos/WMP_ML_MVP/")

from wmp_ml.static.missing_data import *
from wmp_ml.static.feature_engineering import *

In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import datasets

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
Y = iris.target

logreg = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')

# Create an instance of Logistic Regression Classifier and fit the data.
logreg.fit(X, Y)

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
h = .02  # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

In [36]:
'''
dictionary = something belongs to something else, is kwargs, or has certain order
list = collection of several parallel operations

blocks -> first level keys (THESE HAVE SET ORDER)
Below this ^^^ -> The order you provide the code will matter
    - dict = matters
    - list = does not matter

'''
from sklearn.preprocessing import *
from sklearn.model_selection import *
from scipy.stats import *
import pandas as pd
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

kwargs = \
{   'meta': {
             'doc': '''# Sam Showalter's Generated DAG
If this ever compiles in airflow, POP THA CHAMPAGNE!

## Objective: Classification
'''},
     'modeling': { 
                    'models': {LogisticRegression: None, GaussianNB: None},              
                 },
     'data_sources': {
                     pd.read_csv:{'https://raw.githubusercontent.com/SamShowalter/WMP_training/01_EDA/data/IBM_Employee_Attrition.csv': None,
                                  } 
                     },
 
     
     #Expects flat dataset
     'splitting': {
                        train_test_split  : {#'target': "Attrition",
                                                  'test_size': 0.2,
                                                  'random_state': 42},
                  },
 
     
     #Expects flat dataset
     'preprocessing': { 
                        'missing_data': {
                                              'default': "impute_median"                                          
                                        },
                        'outliers': {   
                                              'detect': {'threshold': 6},
                                              'default': "winsorize"
                                    }
                        
                     },
 
    #Expects flat dataset
    'feature_engineering': {
                            'transformations': {
                                                    ('Department', 'EducationField', 'JobRole', "MaritalStatus"): {pd.get_dummies: None},
                                                    'BusinessTravel': {create_ordinal_df: {'ordinal_names':{"Non-Travel" :0, "Travel_Rarely":1, "Travel_Frequently": 2}}},
                                                    ('Attrition', 'Gender', 'OverTime') : {convert_boolean_df: {'boolean_names_and_values':[["Attrition", "Yes", "No"],
                                                                                                                  ["Gender", "Male", "Female"],
                                                                                                                   ["OverTime", "Yes", "No"]], 
                                                                                                                'tag' : None}},
                                                    ('Age', 'DistanceFromHome', 'EmployeeCount', "EnvironmentSatisfaction",
                                                    'JobInvolvement', 'JobSatisfaction', "MonthlyIncome",
                                                    'NumCompaniesWorked', 'PercentSalaryHike', "PerformanceRating",
                                                    'RelationshipSatisfaction', 'StandardHours', 'TotalWorkingYears',
                                                    'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
                                                    'YearsInCurrentRole', 'YearsSinceLastPromotion',
                                                    'YearsWithCurrManager') : {normalize_values: {'transformer': StandardScaler, 
                                                                                                  'tag': None}},
                                                    'MonthlyRate': {yeojohnson: None,
                                                                    normalize_values: {'transformer': StandardScaler,
                                                                                       'tag': None}},
                                               },
                            
                           },
 
     #Expects predictions
     'evaluation':{'metrics' : {accuracy_score: None, f1_score: None, precision_score: None, confusion_matrix: None, 
                   classification_report: None, recall_score :None},
                   'visualization' : None}
   
}

In [9]:
def rec_imports(config_section, import_list, import_check):
    
    try:
        for key in config_section.keys():
            #print(key)
            if any([inspect.isclass(key),
                   inspect.isfunction(key),
                   inspect.ismodule(key)]) and key not in import_check:
                import_list.append(import_dynamically(key))
                import_check.add(key)
            value = config_section[key]
            
            if any([inspect.isclass(value),
                   inspect.isfunction(value),
                   inspect.ismodule(value)]) and value not in import_check:
                import_list.append(import_dynamically(value))
                import_check.add(value)
            
            rec_imports(config_section[key], import_list, import_check)
    
    except Exception as e:
        return ""
            

def write_imports(config):
    import_list = []
    import_check = set()
    for key in config:
        if isinstance(config[key], dict):
            rec_imports(config[key], import_list, import_check)
    
    return "".join([item for item in import_list if item != ""])

print(write_imports(kwargs))

from sklearn.linear_model.logistic import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from pandas.io.parsers import read_csv
from sklearn.model_selection._split import train_test_split
from pandas.core.reshape.reshape import get_dummies
from wmp_ml.static.feature_engineering import create_ordinal_df
from wmp_ml.static.feature_engineering import convert_boolean_df
from wmp_ml.static.feature_engineering import normalize_values
from sklearn.preprocessing.data import StandardScaler
from scipy.stats.morestats import yeojohnson
from sklearn.metrics.classification import accuracy_score
from sklearn.metrics.classification import f1_score
from sklearn.metrics.classification import precision_score
from sklearn.metrics.classification import confusion_matrix
from sklearn.metrics.classification import classification_report
from sklearn.metrics.classification import recall_score



In [10]:
d = {}
d['test'] = ['something']
d.setdefault('test', []).append('new_something')
d.setdefault('astest', []).append('new_something')
d

{'test': ['something', 'new_something'], 'astest': ['new_something']}

In [11]:
# kwarg_dict = collections.OrderedDict(kwargs)
# # for item in kwarg_dict.keys():
# #     print(kwarg_dict[item])

# #test = collections.OrderedDict(kwarg_dict['preprocessing'])
# for key in kwarg_dict['preprocessing']:
#     if isinstance(key, tuple):
#         for item in key:
#             print(item)
#     else:
#         print(kwarg_dict['preprocessing'][key])
        


In [12]:
df = pd.read_csv('https://raw.githubusercontent.com/SamShowalter/WMP_training/01_EDA/data/IBM_Employee_Attrition.csv')

In [13]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102.0,Sales,1.0,2,Life Sciences,1,1.0,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279.0,Research & Development,8.0,1,Life Sciences,1,2.0,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373.0,Research & Development,2.0,2,Other,1,4.0,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392.0,Research & Development,3.0,4,Life Sciences,1,5.0,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591.0,Research & Development,2.0,1,Medical,1,7.0,...,4,80,1,6,3,3,2,2,2,2


In [14]:
#Provided function for detecting outliers
def detect_outliers(col_names, data, std_thresh = 6):
    
    for column in col_names:
        
        #Create z_score proxy for each column
        data['z_score'] = np.absolute(zscore(data[column]))
        
        #Determine if there are outliers, as defined by z_score threshold
        outliers = data.loc[data.z_score > std_thresh, [column, 'z_score']]
        
        #If there are no outliers
        if outliers.shape[0] == 0:
            print("No outliers for column {} at threshold of {} stdevs".format(column, std_thresh))
        
        #If there are outliers
        else:
            print("\n {} outlier(s) found for column {} at threshold of {} stdevs. See below".format(outliers.shape[0],
                                                                                        column, std_thresh))
            print("\n")
            print(outliers)
            print("\n")
        
        #Drop z_score from data
        data.drop('z_score', axis = 1, inplace = True)

In [30]:
LogisticRegression.__dict__

mappingproxy({'__module__': 'sklearn.linear_model.logistic',
              '__doc__': 'Logistic Regression (aka logit, MaxEnt) classifier.\n\n    In the multiclass case, the training algorithm uses the one-vs-rest (OvR)\n    scheme if the \'multi_class\' option is set to \'ovr\', and uses the\n    cross-entropy loss if the \'multi_class\' option is set to \'multinomial\'.\n    (Currently the \'multinomial\' option is supported only by the \'lbfgs\',\n    \'sag\', \'saga\' and \'newton-cg\' solvers.)\n\n    This class implements regularized logistic regression using the\n    \'liblinear\' library, \'newton-cg\', \'sag\', \'saga\' and \'lbfgs\' solvers. **Note\n    that regularization is applied by default**. It can handle both dense\n    and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit\n    floats for optimal performance; any other input format will be converted\n    (and copied).\n\n    The \'newton-cg\', \'sag\', and \'lbfgs\' solvers support only L2 regulariza

In [16]:
import pandas as pd
import os
import sys
sys.path.append("../repos/WMP_ML_MVP/wmp_ml/static")
sys.path.append("../repos/WMP_ML_MVP/wmp_ml/orch")

In [17]:
from dag_generator import DagGenerator
from datetime import datetime

In [18]:
print("Here {} not {{here}}".format("info", {"here": "It worked"}))

Here info not {here}


In [19]:
dag_args = { 
                'dag_name': "WMP_ML_test_creation",
                'dag':      {
                                'owner': 'Sam Showalter',
                                'email': ['sshowalter@wmp.com'],
                                'op_args':{},
                                'op_kwargs': {},
                                'params': {}
                            },
                'config' : kwargs
}

In [20]:
dg = DagGenerator(dag_args)
dg.generate_file()

In [21]:
dd ={
    "data_sources": 0,
    "eda": 1,
    "splitting": 2,
    "preprocessing": 3,
    "feature_engineering": 4,
    "modeling": 5,
    "evaluation": 6,
    "visualization": 7,
    "storage": 8
}

In [22]:
import json

def set_default(obj):
    if isinstance(obj, set):
        return list(obj)
    raise TypeError

with open('../repos/WMP_ML_MVP/wmp_ml/config/dag_hierarchy.cfg', 'w') as config:
    config.write(json.dumps(dd, indent=4))
    config.close()

In [23]:
print(os.getcwd())

/home/sshowalter/notebooks


In [24]:
a = 'adsf'


In [25]:
a += "mareae"
a

'adsfmareae'

In [26]:
config = {   'splitting': {2: ['train_test_split', 'k_fold']},
             'data_souces': {0: {'csv': 0, 'sql': 1, 'pickle': 2}},
             'preprocessing': {3: {0: 'outliers', 1: 'missing_data'}},
             'evaluation': {6: ['accuracy',
               'f1',
               'confusion_matrix',
               'classification_report',
               'precision',
               'recall']},
             'eda': {1: {'profiling': 0}},
             'modeling': {5: {'hyperparameter_tuning': {}, 'models': {}}},
             'storage': {8: {'model': 'FILEPATH',
               'pipeline': 'FILEPATH',
               'performance': 'FILEPATH',
               'data': 'FILEPATH'}},
             'feature_engineering': {4: {'transformations': 0,
               {'dimensionality_reduction': {1: ['pca', 'svd']}}},
             'visualization': {7: ['AUROC']}}

SyntaxError: invalid syntax (<ipython-input-26-c7ee7738d0c2>, line 17)

In [None]:
def print_exec_hierarchy(hier, config):
    exec_plan = {}
    
    if 
    for key in hier.keys():
        exec_plan
    