In [2]:
import csv
import tensorflow as tf
from tqdm import tqdm

In [3]:
def _bytes_feature(value):
    """Wrap a string in a single-element bytes ``tf.train.Feature``.

    Args:
        value: Object exposing ``.encode()`` (a ``str`` in this notebook);
            it is encoded with the method's default encoding.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` holds the encoded value.
    """
    encoded = value.encode()
    payload = tf.train.BytesList(value=[encoded])
    return tf.train.Feature(bytes_list=payload)

In [4]:
def _int64_feature(value):
    """Wrap an integer in a single-element int64 ``tf.train.Feature``.

    Args:
        value: The integer to store.

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` holds ``value``.
    """
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

In [5]:
def clean_rows(row):
    """Fill in a placeholder zip code when the row has none.

    The row is modified in place: a falsy ``"zip_code"`` entry (empty string
    or ``None``) is replaced by the string ``"99999"``.

    Args:
        row: Mapping for one CSV record; must contain a ``"zip_code"`` key.

    Returns:
        The same ``row`` object, after the in-place fix-up.
    """
    # `or` keeps any truthy zip code and substitutes the placeholder otherwise.
    row["zip_code"] = row["zip_code"] or "99999"
    return row

In [6]:
def convert_zipcode_to_int(zipcode):
    """Convert a zip code to ``int``, treating each ``"XX"`` pair as ``"00"``.

    Args:
        zipcode: Zip code as a string (possibly containing ``"XX"``, e.g.
            ``"123XX"``) or any other value accepted by ``int()``.

    Returns:
        The zip code as an integer, e.g. ``"123XX"`` -> ``12300``.

    Raises:
        ValueError: If the value still is not a valid integer literal after
            the ``"XX"`` substitution.
    """
    has_masked_digits = isinstance(zipcode, str) and "XX" in zipcode
    normalized = zipcode.replace("XX", "00") if has_masked_digits else zipcode
    return int(normalized)

In [9]:
# Input CSV that the conversion loop reads row by row.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your
# environment before running.
original_data_file = "/home/lunet/ttrmc/mlops/building-machine-learning-pipelines/building-machine-learning-pipelines-examples_based_on_tfx_1.4/data/consumer_complaints_with_narrative.csv"
# Output TFRecord file; the relative path resolves against the current working
# directory.
tfrecords_filename = "consumer-complaints.tfrecords"
# The writer is created here but used and closed by the conversion loop below,
# so this cell has to be re-run before that loop can be run a second time.
tf_record_writer = tf.io.TFRecordWriter(tfrecords_filename)

In [10]:
# Maps each output feature name to the function that builds it from the raw
# CSV string. Every column is stored as bytes except the zip code, which is
# converted to an integer first. Insertion order matches the CSV-to-feature
# layout used when the examples are assembled below.
feature_builders = {
    "product": _bytes_feature,
    "sub_product": _bytes_feature,
    "issue": _bytes_feature,
    "sub_issue": _bytes_feature,
    "state": _bytes_feature,
    "zip_code": lambda raw: _int64_feature(convert_zipcode_to_int(raw)),
    "company": _bytes_feature,
    "company_response": _bytes_feature,
    "timely_response": _bytes_feature,
    "consumer_disputed": _bytes_feature,
}

# Entering the writer as a context manager closes (and flushes) it even when a
# row fails to convert, instead of only after a fully successful loop.
# newline="" is what the csv module expects so that line breaks embedded in
# quoted fields are parsed correctly.
with tf_record_writer, open(original_data_file, newline="") as csv_file:
    reader = csv.DictReader(csv_file, delimiter=",", quotechar='"')
    for row in tqdm(reader):
        # Replace a missing zip code with the placeholder before conversion.
        row = clean_rows(row)
        example = tf.train.Example(
            features=tf.train.Features(
                feature={
                    name: build(row[name])
                    for name, build in feature_builders.items()
                }
            )
        )
        # One serialized tf.train.Example per CSV row.
        tf_record_writer.write(example.SerializeToString())

66799it [00:05, 12678.03it/s]
