# Description
This notebook prototypes and tests the helper functions that the web app's backend will use to pre-process user inputs before passing them to the trained model.


In [1]:
# Imports
# Importing basic libraries for data analytics
import numpy as np
import pandas as pd

# Libraries for visualisations
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.stats import norm
import scipy
import matplotlib.mlab as mlab
%matplotlib inline

# Import supplementary visualization code visuals.py
import visuals as vs

# Used for tracking model training and testing time
from time import time

# Importing 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

# Import sklearn.preprocessing.MinMaxScaler for feature scaling
from sklearn.preprocessing import MinMaxScaler
# Import train_test_split
from sklearn.model_selection import train_test_split
# Import two metrics from sklearn - fbeta_score and accuracy_score
from sklearn.metrics import fbeta_score, accuracy_score
# Importing Precision function
from sklearn.metrics import precision_score


# Importing the AdaBoost supervised learning model from sklearn
from sklearn.ensemble import AdaBoostClassifier


# Importing pickle to help with model saving and loading
import pickle


In [2]:
def get_inputs():
    '''
    OUTPUT:
    sample_values - tuple of 13 hard-coded example values standing in for web app user input

    Description:
    Returns a fixed sample record; the values are listed in the same order as the
    feature columns they are later assigned to
    '''
    sample_values = (
        25,                       # age
        'Federal-gov',            # workclass
        'Bachelors',              # education
        10,                       # education_years
        'Married-spouse-absent',  # marital_status
        'Sales',                  # occupation
        'Husband',                # realtionship
        'Black',                  # race
        'Male',                   # sex
        40000,                    # capital_gain
        1200,                     # capital_loss
        50,                       # hours_worked_per_week
        'Canada',                 # native_country
    )
    return sample_values

In [3]:
def load_data():
    '''
    OUTPUT:
    merged dataframe - Pandas Dataframe holding the rows of 'adultdata' followed by the rows of 'adulttest'

    Description:
    Reads both files with the same column names and stacks them with a fresh 0..n-1 index
    '''
    # Column names applied to both files (neither is read with a header row).
    # NOTE: the 'realtionship' spelling is kept as-is because the one-hot column
    # names produced later are derived from it.
    column_names = ['age','workclass','fnlwgt','education','education_years','marital_status','occupation','realtionship','race',
         'sex','capital_gain','capital_loss','hours_worked_per_week','native_country','income']

    # Training file
    train_frame = pd.read_csv('adultdata', names=column_names)

    # Testing file; its first line is skipped (presumably a non-data line - TODO confirm)
    test_frame = pd.read_csv('adulttest', names=column_names, skiprows=1)

    # Stack train on top of test and renumber the rows
    return pd.concat([train_frame, test_frame], ignore_index = True)

In [4]:
def return_scaler_header(data):
    '''
    INPUT:
    data - Pandas Dataframe of our raw data

    OUTPUT:
    fitted_scaler - MinMaxScaler fitted on the continuous (int64) features
    features_headers - list of column headers of the cleaned, one-hot encoded features

    Description:
    Repeats the cleaning steps on the raw data and returns the fitted scaler object
    and the feature headers so they can be reused on the inputs of the web app

    Side effects:
    The 'fnlwgt' column and every row containing ' ?' are dropped from `data`
    in place (callers may rely on the reduced frame). Every other transformation
    is applied to a copy, so the function can safely be re-run on the same frame.
    '''
    # Removing the fnlwgt column; errors='ignore' keeps a re-run on an
    # already cleaned frame from raising a KeyError
    data.drop(columns=['fnlwgt'], inplace=True, errors='ignore')

    # Flagging the rows that hold the ' ?' placeholder in any column
    has_placeholder = data.isin([' ?']).any(axis=1)
    # Dropping by index label (not by position) so a non-default index works too
    data.drop(index=data.index[has_placeholder], inplace=True)

    # list of all continuous features
    cont_feat = list(data.dtypes[data.dtypes == 'int64'].index)

    # creating a list of of column names
    skewed_cols = ['capital_gain','capital_loss']

    # Log transforming the skewed features on a real copy, so the caller's
    # frame keeps its original values
    features_log_transformed = data.copy()
    features_log_transformed[skewed_cols] = data[skewed_cols].apply(lambda x: np.log(x + 1))

    # Initialize a scaler, then apply it to the features with a default of 0 -> 1
    scaler = MinMaxScaler()

    # Copying the dataframe over to a new one
    features_log_minmax_transform = features_log_transformed.copy()

    # Fitting on the continuous features, then transforming them
    fitted_scaler = scaler.fit(features_log_transformed[cont_feat])
    features_log_minmax_transform[cont_feat] = fitted_scaler.transform(features_log_transformed[cont_feat])

    # Removing the target label so only features remain
    features_only = features_log_minmax_transform.drop('income', axis = 1)

    # One-hot encode the features using pandas.get_dummies()
    features_final = pd.get_dummies(features_only)
    # The raw category values carry a leading space; strip spaces from the headers
    features_final.columns = features_final.columns.str.replace(' ', '')
    features_headers = list(features_final.columns)

    return fitted_scaler, features_headers

In [5]:
def clean_inputs(data, fitted_scaler):
    '''
    INPUT:
    data - Pandas Dataframe of our raw inputs
    fitted_scaler - fitted scaler object; its transform() is applied to the continuous features

    OUTPUT:
    features_final - Pandas dataframe of our processed input features

    Description:
    Casts the continuous features to int, log transforms the skewed features,
    scales the continuous features and one-hot encodes the remaining columns.
    The dataframe passed in is not modified.
    '''

    # Order matters: the scaler receives the columns in exactly this order
    cont_feat = ['age',
     'education_years',
     'capital_gain',
     'capital_loss',
     'hours_worked_per_week']

    # creating a list of of column names
    skewed_cols = ['capital_gain','capital_loss']

    # Casting through str then int, as the raw inputs may arrive as objects/strings.
    # astype returns a new frame, so the caller's dataframe is left untouched.
    features_log_transformed = (
        data
        .astype({feat: str for feat in cont_feat})
        .astype({feat: int for feat in cont_feat})
    )

    # Log transforming the skewed features
    features_log_transformed[skewed_cols] = features_log_transformed[skewed_cols].apply(lambda x: np.log(x + 1))

    # Copying the dataframe over to a new one before scaling
    features_log_minmax_transform = features_log_transformed.copy()
    features_log_minmax_transform[cont_feat] = fitted_scaler.transform(features_log_transformed[cont_feat])

    # One-hot encode the 'features_log_minmax_transform' data using pandas.get_dummies()
    features_final = pd.get_dummies(features_log_minmax_transform)
    features_final.columns = features_final.columns.str.replace(' ', '')

    return features_final
    

In [6]:
def matching_dataframe_for_input(clean_inputs, column_headers):
    '''
    INPUT:
    clean_inputs - pandas dataframe of our cleaned inputs
    column_headers - column headers of our features dataframe we used for training our model 
    
    OUTPUT:
    input_df - single-row Pandas dataframe (index 0) of our processed input features,
               with exactly the columns in column_headers, in that order
    
    Description:
    Aligns the first row of the cleaned inputs to the training feature layout:
    headers missing from the inputs are filled with 0, and input columns that are
    not part of column_headers (e.g. a category unseen in training) are discarded
    so the result always has the shape the model expects.
    The dataframe passed in is not modified.
    '''
    
    # Only the first row is used, selected by position
    first_row = clean_inputs.iloc[[0]]
    # Reorder/extend/trim the columns to the training headers, 0 for absent ones
    input_df = first_row.reindex(columns=column_headers, fill_value=0)
    # Single row labelled 0
    input_df = input_df.reset_index(drop=True)
    # Filling any remaining Na's with 0
    input_df = input_df.fillna(0)
    
    return input_df

In [7]:
# Loading the training and testing files into one merged dataframe
merged_data = load_data()

In [8]:
# Getting the scaler object and cleaned feature headers
# NOTE: this call also modifies merged_data in place - the 'fnlwgt' column and
# every row containing ' ?' are dropped from it
fitted_scaler, features_headers = return_scaler_header(merged_data)

  return getattr(obj, method)(*args, **kwds)
  return self.partial_fit(X, y)


In [9]:
# Getting the hard-coded sample inputs (a tuple of 13 values)
inputs = get_inputs()

In [10]:
# Getting our inputs in a dataframe format
# Selecting the feature columns by name ('fnlwgt' is not used as a feature and
# 'income' is the label) instead of by position, so this cell does not depend on
# whether merged_data has already been modified by an earlier cell
input_columns = [col for col in merged_data.columns if col not in ('fnlwgt', 'income')]
# One row labelled 0; raises if the number of inputs does not match the columns
df_inputs = pd.DataFrame([inputs], columns = input_columns, dtype = object)

In [11]:
# Cleaning our input dataframe: int casts, log transform of the skewed features,
# scaling with the fitted scaler and one-hot encoding
scaled_inputs = clean_inputs(df_inputs, fitted_scaler)

In [12]:
# Checking the processed inputs (only the categories present in the input get a column)
scaled_inputs

Unnamed: 0,age,education_years,capital_gain,capital_loss,hours_worked_per_week,workclass_Federal-gov,education_Bachelors,marital_status_Married-spouse-absent,occupation_Sales,realtionship_Husband,race_Black,sex_Male,native_country_Canada
0,0.109589,0.6,0.920414,0.846217,0.5,1,1,1,1,1,1,1,1


In [13]:
# Matching the headers of the processed inputs to the headers used for training
clean_input_df = matching_dataframe_for_input(scaled_inputs, features_headers)

In [28]:
# Checking the new data frame (one row, one column per training feature header)
clean_input_df

Unnamed: 0,age,education_years,capital_gain,capital_loss,hours_worked_per_week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
0,0.109589,0.6,0.920414,0.846217,0.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Loading the previously saved highest accuracy model
filename = 'model.pkl'
# load the model from disk; the context manager closes the file handle afterwards
# NOTE: pickle.load can execute arbitrary code - only load model files you trust
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [31]:
# Predicting on the single processed input row; [0] takes the only prediction
# (presumably the encoded income class - TODO confirm the label encoding used in training)
loaded_model.predict(clean_input_df)[0]

0