In [None]:
from flask import Flask, jsonify, request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import impyute

import os
from tqdm import tqdm
import datetime

from pickle import load
from sklearn.metrics import f1_score


import flask
Predictive_equipment_failure_model_productionization = Flask(__name__)


# Column-name groups used by the pre-processing pipeline to slice the data.

# Columns whose missing values are replaced by 0 (includes three engineered features).
column_names_less_than_5_percent = [
    'sensor9_measure',
    'sensor12_measure',
    'sensor13_measure',
    'sensor16_measure',
    'sensor17_measure',
    'sensor18_measure',
    'sensor28_measure',
    'sensor31_measure',
    'sensor33_measure',
    'sensor34_measure',
    'sensor35_measure',
    'sensor49_measure',
    'sensor59_measure',
    'sensor60_measure',
    'sensor89_measure',
    'sensor90_measure',
    'sensor98_measure',
    'sensor99_measure',
    'sensor12_into_sensor13_measure',
    'sensor12_minus_sensor13_measure',
    'sensor35_into_sensor17_measure',
]

# Columns handed to `etr_itr_imputer` (includes one engineered feature).
column_names_for_etr_imputer = [
    'sensor3_measure',
    'sensor56_measure',
    'sensor62_measure',
    'sensor81_measure',
    'sensor82_measure',
    'sensor103_measure',
    'sensor81_into_sensor82_measure',
]

# Column handed to `ridge_regression_imputer`.
column_names_for_ridge_regression_imputer = [
    'sensor36_measure',
]

# The 25 raw "measure" columns kept from the input before feature engineering.
top_25_column_names = [
    'sensor9_measure',
    'sensor12_measure',
    'sensor13_measure',
    'sensor16_measure',
    'sensor17_measure',
    'sensor18_measure',
    'sensor28_measure',
    'sensor31_measure',
    'sensor33_measure',
    'sensor34_measure',
    'sensor35_measure',
    'sensor49_measure',
    'sensor59_measure',
    'sensor60_measure',
    'sensor89_measure',
    'sensor90_measure',
    'sensor98_measure',
    'sensor99_measure',
    'sensor3_measure',
    'sensor56_measure',
    'sensor62_measure',
    'sensor81_measure',
    'sensor82_measure',
    'sensor103_measure',
    'sensor36_measure',
]

# The training CSV is read only to recover the feature column names (and their
# order) that incoming datapoints are expected to follow.
original_df = pd.read_csv(r'equip_failures_training_set.csv')
original_df = original_df.drop(columns=['id', 'target'])

column_names_for_input_data = original_df.columns



# Load the fitted pre-processing objects and the trained models from disk.
#
# NOTE(security): unpickling can execute arbitrary code, so these .pkl files
# must only ever come from a trusted source.
def _load_pickle(path):
    """Return the object pickled at *path*, closing the file handle afterwards."""
    with open(path, 'rb') as pickle_file:
        return load(pickle_file)


etr_itr_imputer = _load_pickle(r'etr_itr_imputer.pkl')

ridge_regression_imputer = _load_pickle(r'ridge_regression_imputer.pkl')

measures_data_scaler = _load_pickle(r'measures_data_scaler.pkl')

histogram_data_scaler = _load_pickle(r'histogram_data_scaler.pkl')


# Base models are stored one per file as decision_tree_0.pkl ... decision_tree_499.pkl
# and kept in a list in that order.
loaded_base_models_list = []

for i in range(500):   # as there are 500 base models
    model = _load_pickle(r'decision_tree_{}.pkl'.format(i))
    loaded_base_models_list.append(model)


# Model that consumes the stacked base-model predictions.
meta_classifier = _load_pickle(r'xgboost_model.pkl')

# Indexed by base-model position: the columns each base model is fed.
column_names_per_base_model = _load_pickle(r'column_names_for_each_base_model.pkl')



'''
TO CLEAN THE DATA AND PERFORM FEATURE ENGINEERING AND RETURN THE FINAL DATAPOINT WHICH WILL BE THE INPUT FOR THE MODEL
'''
def clean_input_data(input_data, etr_itr_imputer, ridge_regression_imputer, histogram_data_scaler, loaded_base_models_list, meta_classifier, column_names_less_than_5_percent, column_names_for_etr_imputer, column_names_for_ridge_regression_imputer, measures_data_scaler, column_names_per_base_model, top_25_column_names):
    """Clean raw datapoints, add engineered features, scale and impute them.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Raw datapoints. Missing values may be given as the string ``'na'``;
        every other value must be convertible to ``float``. Columns whose
        name contains ``"histogram"`` are treated as histogram features, all
        others as measure features.
    etr_itr_imputer, ridge_regression_imputer
        Fitted objects exposing ``transform``; applied to the columns named in
        ``column_names_for_etr_imputer`` and
        ``column_names_for_ridge_regression_imputer`` respectively.
    histogram_data_scaler, measures_data_scaler
        Fitted objects exposing ``transform``; applied to the histogram
        columns and to the selected + engineered measure columns.
    loaded_base_models_list, meta_classifier, column_names_per_base_model
        Not used by this function; kept so existing callers keep working.
    column_names_less_than_5_percent : list of str
        Scaled measure columns whose missing values are replaced by 0.
    column_names_for_etr_imputer, column_names_for_ridge_regression_imputer : list of str
        Scaled measure columns passed to the corresponding imputer.
    top_25_column_names : list of str
        Measure columns kept from ``input_data`` before feature engineering.

    Returns
    -------
    pandas.DataFrame
        One row per input row. Columns are, in order: the zero-filled
        columns, the ETR-imputed columns, the ridge-imputed columns and the
        scaled histogram columns. The index is the one created when the
        scaled measures are wrapped back into a DataFrame, not the index of
        ``input_data``.
    """
    # `np.nan` (lower case) is used throughout: the `np.NAN` alias was removed
    # in NumPy 2.0 and raises AttributeError there.
    input_data = input_data.replace('na', np.nan).astype('float')

    # Split the columns into "measure" and "histogram" groups by name.
    measures_df_column_names = [name for name in input_data.columns if "histogram" not in name]
    histogram_df_column_names = [name for name in input_data.columns if "histogram" in name]

    histogram_data = input_data[histogram_df_column_names]

    # .copy() gives an independent frame, so adding the engineered columns
    # below never writes into (or warns about) a slice of `input_data`.
    measures_data = input_data[measures_df_column_names][top_25_column_names].copy()

    # Engineered features (products / difference of selected sensor pairs).
    measures_data['sensor12_into_sensor13_measure'] = measures_data['sensor12_measure'] * measures_data['sensor13_measure']
    measures_data['sensor12_minus_sensor13_measure'] = measures_data['sensor12_measure'] - measures_data['sensor13_measure']
    measures_data['sensor35_into_sensor17_measure'] = measures_data['sensor35_measure'] * measures_data['sensor17_measure']
    measures_data['sensor81_into_sensor82_measure'] = measures_data['sensor81_measure'] * measures_data['sensor82_measure']

    # Scale the measures, then restore the column labels on the result.
    revised_measures_data_column_names = measures_data.columns
    measures_data = pd.DataFrame(measures_data_scaler.transform(measures_data))
    measures_data.columns = revised_measures_data_column_names

    # Group 1: missing values simply become 0.
    measures_df_less_than_5_percent = measures_data[column_names_less_than_5_percent].fillna(0)
    row_index = measures_df_less_than_5_percent.index

    # Group 2: missing values are filled by the ETR-based imputer.
    measures_df_less_than_30_percent = measures_data[column_names_for_etr_imputer]
    transformed_measures_df_less_than_30_percent = pd.DataFrame(
        etr_itr_imputer.transform(measures_df_less_than_30_percent), index=row_index
    )
    transformed_measures_df_less_than_30_percent.columns = measures_df_less_than_30_percent.columns

    # Group 3: missing values are filled by the ridge-regression imputer.
    measures_df_less_than_75_percent = measures_data[column_names_for_ridge_regression_imputer]
    transformed_measures_df_less_than_75_percent = pd.DataFrame(
        ridge_regression_imputer.transform(measures_df_less_than_75_percent), index=row_index
    )
    transformed_measures_df_less_than_75_percent.columns = measures_df_less_than_75_percent.columns

    # Stitch the three measure groups back together, aligned on the row index.
    final_measures_data = measures_df_less_than_5_percent.merge(
        transformed_measures_df_less_than_30_percent, left_index=True, right_index=True
    )
    final_measures_data = final_measures_data.merge(
        transformed_measures_df_less_than_75_percent, left_index=True, right_index=True
    )

    # Histogram features: scale, relabel, and replace missing values with 0.
    final_histogram_data = pd.DataFrame(
        histogram_data_scaler.transform(histogram_data), index=row_index
    )
    final_histogram_data.columns = histogram_data.columns
    final_histogram_data = final_histogram_data.fillna(0)

    # Final model input: processed measures followed by processed histograms.
    final_input_data = final_measures_data.merge(final_histogram_data, left_index=True, right_index=True)

    return final_input_data



'''
TO DEFINE THE BEHAVIOUR OF THE FLASK API
'''
@Predictive_equipment_failure_model_productionization.route('/')
def hello_world():
    """Root route: return a hint pointing the visitor at the ``/index`` page."""
    hint = '''Please use the word "index" after "/"'''
    return hint


@Predictive_equipment_failure_model_productionization.route('/index')
def index():
    """Render the ``index.html`` template and return it as the response."""
    rendered_page = flask.render_template('index.html')
    return rendered_page


@Predictive_equipment_failure_model_productionization.route('/predict', methods=['POST'])
def predict():
    """Predict the failure type for the datapoints posted from the web page.

    The datapoints are read from the ``review_text`` form field. Datapoints
    are separated by ``:`` and the values inside one datapoint by ``,``, in
    the order of ``column_names_for_input_data``. Values that are not numbers
    are treated as missing.

    Returns
    -------
    str or (str, int)
        The string form of a list with one message per datapoint on success;
        a message asking for non-negative values when a negative number is
        present; or a ``(message, 400)`` pair when the request is malformed
        (missing field, no datapoints, wrong number of values).
    """
    # .get() instead of indexing so a missing field yields a readable 400
    # rather than an unhandled KeyError.
    raw_text = request.form.get('review_text')
    if raw_text is None:
        return 'Please provide the datapoints in the "review_text" form field', 400

    # ":" separates datapoints, "," separates the values of one datapoint.
    # Blank segments (e.g. from a trailing ":") are skipped.
    data_points_list = [segment.split(',') for segment in raw_text.split(':') if segment.strip()]
    if not data_points_list:
        return 'Please enter at least one datapoint', 400

    expected_count = len(column_names_for_input_data)
    # Shorter rows are padded with missing values by pandas; the widest row
    # must match the expected column count for the labels to fit.
    if max(len(values) for values in data_points_list) != expected_count:
        return 'Please enter {} comma-separated values for each datapoint'.format(expected_count), 400

    input_df = pd.DataFrame(data_points_list, columns=column_names_for_input_data)

    # Anything that is not a number becomes NaN. `to_numeric` keeps decimal
    # values such as "1.5" intact and never raises on stray text or empty
    # tokens. Infinities are treated as missing as well.
    input_df = input_df.apply(pd.to_numeric, errors='coerce').astype('float')
    input_df = input_df.replace([np.inf, -np.inf], np.nan)

    # 1-based positions of the features that contain at least one negative value.
    negatives_per_column = input_df.lt(0).sum()
    negative_features = [position + 1 for position, count in enumerate(negatives_per_column) if count > 0]

    if negative_features:
        return 'Please enter POSITIVE VALUES or NAN VALUES for features {}'.format(negative_features)

    # Only non-negative / missing inputs reach the cleaning pipeline.
    final_input_data = clean_input_data(input_df, etr_itr_imputer, ridge_regression_imputer, histogram_data_scaler, loaded_base_models_list, meta_classifier, column_names_less_than_5_percent, column_names_for_etr_imputer, column_names_for_ridge_regression_imputer, measures_data_scaler, column_names_per_base_model, top_25_column_names)

    # Each base model is fed its own stored subset of columns,
    # `column_names_per_base_model[i]`; its predictions form one row here.
    results_from_base_model = [
        base_model.predict(final_input_data[column_names_per_base_model[i]])
        for i, base_model in enumerate(loaded_base_models_list)
    ]

    # Transpose so that rows are datapoints and columns are base models.
    dataset_for_meta_classifier = np.vstack(results_from_base_model).T

    meta_classifier_output = meta_classifier.predict(dataset_for_meta_classifier)

    # Translate the predicted class into the user-facing message.
    output_list = []
    for predicted_class in meta_classifier_output:
        if predicted_class == 0:
            output_list.append('We have SURFACE FAILURE')
        elif predicted_class == 1:
            output_list.append('We have DOWNHOLE FAILURE')

    return "{}".format(output_list)
        

if __name__ == '__main__':
    # Executed directly: start the Flask server on host '0.0.0.0', port 8080.
    Predictive_equipment_failure_model_productionization.run(
        port=8080,
        host='0.0.0.0',
    )


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://0.0.0.0:8080/ (Press CTRL+C to quit)
127.0.0.1 - - [03/Dec/2020 09:26:56] "GET /index HTTP/1.1" 200 -


[IterativeImputer] Completing matrix with shape (1, 7)
[IterativeImputer] Ending imputation round 1/4, elapsed time 0.12
[IterativeImputer] Ending imputation round 2/4, elapsed time 0.23
[IterativeImputer] Ending imputation round 3/4, elapsed time 0.35
[IterativeImputer] Ending imputation round 4/4, elapsed time 0.46


127.0.0.1 - - [03/Dec/2020 09:27:11] "POST /predict HTTP/1.1" 200 -
