In [1]:
! git clone https://github.com/Virajdatt/TFX-ML-Pipeline.git

fatal: destination path 'TFX-ML-Pipeline' already exists and is not an empty directory.


In [15]:
"""
Downloads the csv data
"""

import logging
import os
import shutil

import pandas as pd
import urllib3

# Remote location of the CSV dataset; used as the default `url` of
# download_dataset().
# NOTE(review): this is a plain-http short link -- confirm it still redirects
# to the expected CSV file.
DATASET_URL = "http://bit.ly/building-ml-pipelines-dataset"

# Local file download_dataset() writes to. The path is relative to the current
# working directory and lives inside the "data/" folder made by create_folder().
LOCAL_FILE_NAME = "data/consumer_complaints_with_narrative.csv"


def download_dataset(url=DATASET_URL, local_path=LOCAL_FILE_NAME):
    """download_dataset downloads the remote dataset to a local path
    Keyword Arguments:
        url {string} --
            complete url path to the csv data source (default: {DATASET_URL})
        local_path {string} --
            local file the response body is written to; its parent directory
            must already exist (default: {LOCAL_FILE_NAME})
    Returns:
        None
    Raises:
        urllib3.exceptions.HTTPError -- if the server answers with a
            status code of 400 or above
    """
    # disable insecure https warning
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    http = urllib3.PoolManager()
    # preload_content=False keeps the body as a stream so it can be copied to
    # disk without loading the whole file into memory.
    with http.request("GET", url, preload_content=False) as res:
        # Fail before touching the local file: otherwise an error page would
        # be stored as if it were the CSV (and overwrite a good earlier copy).
        if res.status >= 400:
            raise urllib3.exceptions.HTTPError(
                "Download of {} failed with HTTP status {}".format(url, res.status)
            )
        with open(local_path, "wb") as out_file:
            shutil.copyfileobj(res, out_file)
    logging.info("Download completed.")


def create_folder():
    """Make sure the local "data/" directory is available.

    The directory is created when it is missing; in either case an INFO
    message reports what happened.
    Returns:
        None
    """
    target_dir = "data/"

    # Nothing to do when the folder is already there.
    if os.path.exists(target_dir):
        logging.info("Data folder already existed.")
        return

    os.makedirs(target_dir)
    logging.info("Data folder created.")


def check_execution_path():
    """Tell whether the current working directory is the project root.

    The root is recognised by the presence of a "LICENSE" entry in the
    current working directory; an error is logged when it is absent.
    Returns:
        boolean -- True when "LICENSE" exists in the working directory,
            otherwise False
    """
    marker = "LICENSE"
    at_project_root = os.path.exists(marker)

    if not at_project_root:
        logging.error(
            "Don't execute the script from a sub-directory. "
            "Switch to the root of the project folder"
        )
    return at_project_root


if __name__ == "__main__":
    # Configure the root logger at INFO level so the progress messages
    # emitted below are not filtered out.
    logging.basicConfig(level=logging.INFO)

    logging.info("Started download script")
    # The data folder has to exist before the dataset file is written into it.
    create_folder()
    download_dataset()
    logging.info("Finished download script")

## In this notebook we look at the TensorFlow Data Validation (TFDV) package.

Validation here can mean three things (from the textbook):
1. Check for data anomalies.
2. Check that the data schema has not changed (since the model was built and deployed).
3. Check that the statistics of our new datasets still align with the statistics from our previous training datasets.

## Data-Drift
Data Drift means that your newly collected data has different underlying statistics than the initial dataset used to train your model.

In [2]:
!pip install -U tfx



In [18]:
import tensorflow_data_validation as tfdv
import pandas as pd
from sklearn.model_selection import train_test_split


In [5]:
# Compute dataset statistics from the TFRecord file shipped in the cloned repository.
stats = tfdv.generate_statistics_from_tfrecord(
    data_location='TFX-ML-Pipeline/data/consumer-complaints.tfrecords'
)



Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


## For numerical features, TFDV computes for every feature:
- The overall count of data records
- The number of missing data records
- The mean and standard deviation of the feature across the data records
- The minimum and maximum value of the feature across the data records
- The percentage of zero values of the feature across the data records

In addition, it generates a histogram of the values for each feature.

## For categorical features, TFDV provides:
- The overall count of data records
- The percentage of missing data records
- The number of unique records
- The average string length of all records of a feature
- For each category, TFDV determines the sample count for each label and its rank

In [7]:
#stats

In [8]:
# Derive a schema from the statistics computed above.
schema = tfdv.infer_schema(statistics=stats)

In [9]:
# Show the Python type of the inferred schema object.
type(schema)

tensorflow_metadata.proto.v0.schema_pb2.Schema

In [11]:
# Render the inferred schema as tables in the notebook.
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'company',BYTES,required,,-
'company_response',STRING,required,,'company_response'
'consumer_disputed',BYTES,required,,-
'issue',STRING,required,,'issue'
'product',STRING,required,,'product'
'state',STRING,required,,'state'
'sub_issue',STRING,required,,'sub_issue'
'sub_product',STRING,required,,'sub_product'
'timely_response',STRING,required,,'timely_response'
'zip_code',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'company_response',"'Closed', 'Closed with explanation', 'Closed with monetary relief', 'Closed with non-monetary relief', 'Untimely response'"
'issue',"'APR or interest rate', 'Account opening, closing, or management', 'Account terms and changes', 'Adding money', 'Advertising and marketing', 'Advertising, marketing or disclosures', 'Application processing delay', 'Application, originator, mortgage broker', 'Applied for loan/did not receive money', 'Arbitration', 'Balance transfer', 'Balance transfer fee', 'Bankruptcy', 'Billing disputes', 'Billing statement', 'Can\'t contact lender', 'Can\'t repay my loan', 'Can\'t stop charges to bank account', 'Cash advance', 'Cash advance fee', 'Charged bank acct wrong day or amt', 'Charged fees or interest I didn\'t expect', 'Closing/Cancelling account', 'Communication tactics', 'Cont\'d attempts collect debt not owed', 'Convenience checks', 'Credit card protection / Debt protection', 'Credit decision / Underwriting', 'Credit determination', 'Credit line increase/decrease', 'Credit monitoring or identity protection', 'Credit reporting company\'s investigation', 'Customer service / Customer relations', 'Customer service/Customer relations', 'Dealing with my lender or servicer', 'Delinquent account', 'Deposits and withdrawals', 'Disclosure verification of debt', 'Disclosures', 'Excessive fees', 'False statements or representation', 'Fees', 'Forbearance / Workout plans', 'Fraud or scam', 'Getting a loan', 'Identity theft / Fraud / Embezzlement', 'Improper contact or sharing of info', 'Improper use of my credit report', 'Incorrect exchange rate', 'Incorrect information on credit report', 'Incorrect/missing disclosures or info', 'Late fee', 'Lender damaged or destroyed vehicle', 'Lender repossessed or sold the vehicle', 'Lender sold the property', 'Loan modification,collection,foreclosure', 'Loan servicing, payments, escrow account', 'Lost or stolen check', 'Lost or stolen money order', 'Making/receiving payments, sending money', 'Managing the line of credit', 'Managing the loan or lease', 'Managing, opening, or closing account', 'Money was not available when promised', 
'Other', 'Other fee', 'Other service issues', 'Other transaction issues', 'Overdraft, savings or rewards features', 'Overlimit fee', 'Payment to acct not credited', 'Payoff process', 'Privacy', 'Problems caused by my funds being low', 'Problems when you are unable to pay', 'Received a loan I didn\'t apply for', 'Rewards', 'Sale of account', 'Settlement process and costs', 'Shopping for a line of credit', 'Shopping for a loan or lease', 'Taking out the loan or lease', 'Taking/threatening an illegal action', 'Transaction issue', 'Unable to get credit report/credit score', 'Unauthorized transactions/trans. issues', 'Unexpected/Other fees', 'Unsolicited issuance of credit card', 'Using a debit or ATM card', 'Wrong amount charged or received'"
'product',"'Bank account or service', 'Consumer Loan', 'Credit card', 'Credit reporting', 'Debt collection', 'Money transfers', 'Mortgage', 'Other financial service', 'Payday loan', 'Prepaid card', 'Student loan'"
'state',"'', 'AA', 'AE', 'AK', 'AL', 'AP', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'FM', 'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MP', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY'"
'sub_issue',"'', 'Account status', 'Account terms', 'Account terms and changes', 'Applied for loan/did not receive money', 'Attempted to collect wrong amount', 'Attempted to/Collected exempt funds', 'Billing dispute', 'Called after sent written cease of comm', 'Called outside of 8am-9pm', 'Can\'t contact lender', 'Can\'t decrease my monthly payments', 'Can\'t get flexible payment options', 'Can\'t qualify for a loan', 'Can\'t stop charges to bank account', 'Can\'t temporarily postpone payments', 'Charged bank acct wrong day or amt', 'Charged fees or interest I didn\'t expect', 'Contacted employer after asked not to', 'Contacted me after I asked not to', 'Contacted me instead of my attorney', 'Debt is not mine', 'Debt resulted from identity theft', 'Debt was discharged in bankruptcy', 'Debt was paid', 'Don\'t agree with fees charged', 'Frequent or repeated calls', 'Having problems with customer service', 'Impersonated an attorney or official', 'Inadequate help over the phone', 'Indicated committed crime not paying', 'Indicated shouldn\'t respond to lawsuit', 'Information is not mine', 'Investigation took too long', 'Keep getting calls about my loan', 'Need information about my balance/terms', 'No notice of investigation status/result', 'Not disclosed as an attempt to collect', 'Not given enough info to verify debt', 'Payment to acct not credited', 'Personal information', 'Problem cancelling or closing account', 'Problem getting my free annual report', 'Problem getting report or credit score', 'Problem with fraud alerts', 'Problem with statement of dispute', 'Public record', 'Qualify for a better loan than offered', 'Received a loan I didn\'t apply for', 'Received bad information about my loan', 'Received marketing offer after opted out', 'Receiving unwanted marketing/advertising', 'Reinserted previously deleted info', 'Report improperly shared by CRC', 'Report shared with employer w/o consent', 'Right to dispute notice not received', 'Seized/Attempted to seize 
property', 'Sued w/o proper notification of suit', 'Sued where didn\'t live/sign for debt', 'Talked to a third party about my debt', 'Threatened arrest/jail if do not pay', 'Threatened to sue on too old debt', 'Threatened to take legal action', 'Trouble with how payments are handled', 'Used obscene/profane/abusive language'"
'sub_product',"'', '(CD) Certificate of deposit', 'Auto', 'Cashing a check without an account', 'Check cashing', 'Checking account', 'Conventional adjustable mortgage (ARM)', 'Conventional fixed mortgage', 'Credit card', 'Credit repair', 'Debt settlement', 'Domestic (US) money transfer', 'Electronic Benefit Transfer / EBT card', 'FHA mortgage', 'Federal student loan', 'Foreign currency exchange', 'General purpose card', 'Gift or merchant card', 'Government benefit payment card', 'Home equity loan or line of credit', 'I do not know', 'ID prepaid card', 'Installment loan', 'International money transfer', 'Medical', 'Mobile wallet', 'Money order', 'Mortgage', 'Non-federal student loan', 'Other (i.e. phone, health club, etc.)', 'Other bank product/service', 'Other mortgage', 'Other special purpose card', 'Pawn loan', 'Payday loan', 'Payroll card', 'Personal line of credit', 'Refund anticipation check', 'Reverse mortgage', 'Savings account', 'Title loan', 'Transit card', 'Travelerâs/Cashierâs checks', 'VA mortgage', 'Vehicle lease', 'Vehicle loan'"
'timely_response',"'No', 'Yes'"


## All of the above is quite similar to how pandas_profiling works

## Compare your Train and Validation data together

In [17]:
# Load the CSV file into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='data/consumer_complaints_with_narrative.csv'
)

In [22]:
# Fix the random seed so the split -- and therefore the statistics and
# anomalies computed from it below -- is the same on every run.
train, val = train_test_split(df, random_state=42)

In [24]:
# Compute statistics separately for each split so the two can be compared.
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=train)
val_stats = tfdv.generate_statistics_from_dataframe(dataframe=val)

In [25]:
# Show the validation and training statistics together in one view,
# labelled VAL_DATASET and TRAIN_DATASET.
tfdv.visualize_statistics(
    lhs_statistics=val_stats,
    rhs_statistics=train_stats,
    lhs_name='VAL_DATASET',
    rhs_name='TRAIN_DATASET',
)


## Anomaly Detection

In [29]:
# Validate the validation-split statistics against a schema inferred
# from the training-split statistics.
anamolies = tfdv.validate_statistics(
    statistics=val_stats,
    schema=tfdv.infer_schema(statistics=train_stats),
)

In [30]:
# Render the detected anomalies as a table in the notebook.
tfdv.display_anomalies(anomalies=anamolies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'state',Unexpected string values,Examples contain values missing from the schema: AS (<1%).
'issue',Unexpected string values,Examples contain values missing from the schema: Lost or stolen money order (<1%).


## Updating Schema

Required in the following sort of cases:
Taking the `sub_issue` feature discussed previously: if we decide that this feature must be present in more than 90% of our training examples, we can update the schema to reflect this.

## Identifying Data Skew and Drift

## Manually Check the Bias in your Datasets

## Slicing Data in TFDV

## Integrating TFDV into Your Machine Learning Pipeline