In [6]:
"""
Downloads the csv data
"""

import logging
import os
import shutil

import pandas as pd
import urllib3

# Remote location of the initial dataset; used as the default `url` of
# download_dataset().
# NOTE(review): this is a short link served over plain HTTP — presumably it
# redirects to the hosted CSV; confirm the target and whether HTTPS can be used.
DATASET_URL = "http://bit.ly/building-ml-pipelines-dataset"

# Local path the downloaded CSV is written to. It lives inside the "data/"
# folder that create_folder() creates, and is read again by the cells below.
LOCAL_FILE_NAME = "data/consumer_complaints_with_narrative.csv"


def download_dataset(url=DATASET_URL, local_path=LOCAL_FILE_NAME):
    """download_dataset downloads the remote dataset to a local path

    The response body is streamed to disk instead of being loaded into memory
    in one piece.

    Keyword Arguments:
        url {string} --
            complete url path to the csv data source (default: {DATASET_URL})
        local_path {string} --
            file the downloaded data is written to
            (default: {LOCAL_FILE_NAME})
    Returns:
        None
    Raises:
        RuntimeError -- if the server answers with an HTTP error status
            (>= 400); the local file is left untouched in that case.
    """
    # disable insecure https warning
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Create the target folder when it is missing so that open() below cannot
    # fail on a non-existent parent directory.
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    http = urllib3.PoolManager()
    # preload_content=False leaves the body unread so it can be streamed.
    with http.request("GET", url, preload_content=False) as res:
        # Validate the response BEFORE opening the output file: otherwise an
        # error page would be saved as the dataset and an existing good file
        # would already have been truncated.
        if res.status >= 400:
            raise RuntimeError(
                "Download of %s failed with HTTP status %s" % (url, res.status)
            )
        with open(local_path, "wb") as out_file:
            shutil.copyfileobj(res, out_file)
    logging.info("Download completed.")


def create_folder(directory="data/"):
    """Creates a data folder if it doesn't exist.

    Keyword Arguments:
        directory {string} --
            folder to create; missing parent folders are created as well
            (default: "data/")
    Returns:
        None
    Raises:
        FileExistsError -- if the path exists but is not a directory.
    """
    # The check is only used to pick the log message. The creation itself is
    # done with exist_ok=True so there is no window between "check" and
    # "create" in which another process could create the folder and make
    # makedirs() fail.
    already_existed = os.path.isdir(directory)
    os.makedirs(directory, exist_ok=True)
    if already_existed:
        logging.info("Data folder already existed.")
    else:
        logging.info("Data folder created.")


def check_execution_path():
    """Tell whether the current working directory is the project root.

    The root is recognised by the presence of a "LICENSE" entry in the
    current working directory. When it is missing, an error is logged asking
    the user to switch to the project root.

    Returns:
        boolean -- True if "LICENSE" exists in the working directory,
            otherwise False
    """
    marker = "LICENSE"
    if os.path.exists(marker):
        return True

    logging.error(
        "Don't execute the script from a sub-directory. "
        "Switch to the root of the project folder"
    )
    return False


if __name__ == "__main__":
    # Configure logging first so the progress messages below are emitted.
    logging.basicConfig(level=logging.INFO)

    logging.info("Started download script")
    # Run the steps in order: the data folder must exist before the download
    # writes into it.
    for step in (create_folder, download_dataset):
        step()
    logging.info("Finished download script")

Create a TFRecord file to be ingested

In [23]:
import tensorflow as tf
import pandas as pd
import tqdm
import csv

In [10]:
# Load the downloaded complaints CSV into a DataFrame for a quick inspection.
cdata = pd.read_csv(
    filepath_or_buffer="data/consumer_complaints_with_narrative.csv"
)

In [14]:
# Preview the first five rows to check the columns and the value formats
# (e.g. masked zip codes such as "113XX").
cdata.head(n=5)

Unnamed: 0,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company,state,zip_code,company_response,timely_response,consumer_disputed
0,Debt collection,I do not know,Disclosure verification of debt,Right to dispute notice not received,I was denied employment because of a judgment ...,Encore Capital Group,NY,113XX,Closed with explanation,Yes,0
1,Credit reporting,,Improper use of my credit report,Report improperly shared by CRC,I have a credit card through XXXX XXXX and XXX...,Experian,IL,606XX,Closed with non-monetary relief,Yes,0
2,Debt collection,I do not know,Cont'd attempts collect debt not owed,Debt is not mine,Almost daily phone calls from Stellar Recovery...,Stellar Recovery Inc.,MI,480XX,Closed with explanation,Yes,1
3,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,I submitted my monthly mortgage payment to Pri...,Primary Residential Mortgage,CT,066XX,Closed with monetary relief,Yes,0
4,Student loan,Non-federal student loan,Dealing with my lender or servicer,Received bad information about my loan,I contacted America Education Services in XX/X...,AES/PHEAA,FL,321XX,Closed with explanation,Yes,1


## Helper code to convert CSV to TFRecord

In [26]:
def _bytes_feature(value):
    """Wrap a single text or bytes value in a bytes-list tf.train.Feature.

    Arguments:
        value {str or bytes} --
            str values are encoded as UTF-8; bytes values are stored as-is.
    Returns:
        tf.train.Feature -- feature whose bytes_list holds exactly one entry.
    """
    # Only text needs encoding. Input that is already bytes has no .encode()
    # method and previously crashed with an AttributeError.
    if isinstance(value, str):
        value = value.encode("utf-8")
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(value):
    """Wrap a single integer value in an int64-list tf.train.Feature.

    Arguments:
        value {int} -- the value to store.
    Returns:
        tf.train.Feature -- feature whose int64_list holds exactly one entry.
    """
    single_value_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=single_value_list)


def clean_rows(row):
    """Fill in a placeholder zip code when a row has none.

    The row is modified in place: an empty (falsy) "zip_code" entry is
    replaced by the placeholder "99999".

    Arguments:
        row {dict} -- one record; must contain the "zip_code" key.
    Returns:
        dict -- the same row object that was passed in.
    """
    # `or` keeps any non-empty zip code and falls back to the placeholder.
    row["zip_code"] = row["zip_code"] or "99999"
    return row


def convert_zipcode_to_int(zipcode):
    """Convert a (possibly masked) zip code to an integer.

    Masked digits are written as "X" (e.g. "113XX"); each of them is replaced
    by "0" before the conversion, so "113XX" becomes 11300.

    Arguments:
        zipcode {str or int} -- the zip code to convert.
    Returns:
        int -- the numeric zip code.
    Raises:
        ValueError -- if the value is not a valid integer after unmasking.
    """
    if isinstance(zipcode, str):
        # Replace every "X" on its own. Replacing "XX" pairs leaves a stray
        # "X" behind for masks of odd length ("12XXX" -> "1200X"), which then
        # makes int() fail.
        zipcode = zipcode.replace("X", "0")
    return int(zipcode)

In [27]:
original_data_file = "data/consumer_complaints_with_narrative.csv"

tfrecords_filename = "consumer-complaints.tfrecords"

# Convert every CSV row into one serialized tf.train.Example.
# - Both the CSV handle and the TFRecord writer are context-managed, so they
#   are closed even when a row fails to convert.
# - The CSV is opened first: if it is missing, no empty TFRecord file is left
#   behind.
# - newline="" is what the csv module requires for file objects so that
#   line breaks inside quoted fields are handled by the csv reader itself.
with open(original_data_file, newline="") as csv_file, tf.io.TFRecordWriter(
    tfrecords_filename
) as tf_record_writer:
    reader = csv.DictReader(csv_file, delimiter=",", quotechar='"')
    for row in reader:
        # Rows without a zip code get the "99999" placeholder.
        row = clean_rows(row)
        # zip_code is stored as int64; every other selected column is stored
        # as bytes. The consumer_complaint_narrative column is not written.
        example = tf.train.Example(
            features=tf.train.Features(
                feature={
                    "product": _bytes_feature(row["product"]),
                    "sub_product": _bytes_feature(row["sub_product"]),
                    "issue": _bytes_feature(row["issue"]),
                    "sub_issue": _bytes_feature(row["sub_issue"]),
                    "state": _bytes_feature(row["state"]),
                    "zip_code": _int64_feature(convert_zipcode_to_int(row["zip_code"])),
                    "company": _bytes_feature(row["company"]),
                    "company_response": _bytes_feature(row["company_response"]),
                    "timely_response": _bytes_feature(row["timely_response"]),
                    "consumer_disputed": _bytes_feature(row["consumer_disputed"]),
                }
            )
        )
        tf_record_writer.write(example.SerializeToString())

In [29]:
# Read the TFRecord file back as a tf.data dataset.
dataset = tf.data.TFRecordDataset(filenames=["consumer-complaints.tfrecords"])


In [31]:
# Show the structure of one dataset element. The displayed spec is a scalar
# tf.string, i.e. each element is one record that has not been parsed into
# individual features.
dataset.element_spec

TensorSpec(shape=(), dtype=tf.string, name=None)