In [1]:
import logging
import os
import shutil
import pandas as pd
import urllib3

In [2]:
# Shortened link to the remote csv data source; per the run log below it
# redirects to a Google Drive download.
DATASET_URL = "http://bit.ly/building-ml-pipelines-dataset"

In [3]:
# Initial local dataset location: the raw download is written here. It is a
# temporary file (note the "tmp_" marker) -- update_csv() writes the processed
# copy under the same name without "tmp_", and the main cell removes this file.
LOCAL_FILE_NAME = "/content/drive/My Drive/MLOps/data/consumer_data/tmp_consumer_complaints_with_narrative.csv"

In [4]:
def download_dataset(url=DATASET_URL, local_path=None):
    """download_dataset downloads the remote dataset to a local path
    Keyword Arguments:
        url {string} --
            complete url path to the csv data source (default: {DATASET_URL})
        local_path {string} --
            local file location to write to; when None the module-level
            LOCAL_FILE_NAME is used (default: {None})
    Returns:
        None
    Raises:
        urllib3.exceptions.HTTPError --
            if the server answers with an error status (>= 400)
    """
    # Resolve the default at call time so a changed LOCAL_FILE_NAME is honoured.
    if local_path is None:
        local_path = LOCAL_FILE_NAME

    # disable insecure https warning
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    http = urllib3.PoolManager()
    # preload_content=False streams the body instead of loading it into memory.
    with http.request("GET", url, preload_content=False) as res:
        # Check the status before touching the disk: otherwise an error page
        # would be stored as if it were the csv file.
        if res.status >= 400:
            raise urllib3.exceptions.HTTPError(
                f"Download of {url} failed with HTTP status {res.status}"
            )
        with open(local_path, "wb") as out_file:
            shutil.copyfileobj(res, out_file)
    logging.info("Download completed.")

In [5]:
def update_csv(local_path=None, random_state=None):
    """update_csv preprocesses the downloaded csv file and writes the result
        to a new csv file whose name is the input file name without "tmp_"

        Processing steps: keep only the feature columns, drop rows without a
        complaint narrative, encode "consumer_disputed" as 1 ("Yes") / 0 ("No"),
        shuffle the rows and replace "XX" with "00" in "zip_code".
    Keyword Arguments:
        local_path {string} --
            csv file to process; when None the module-level LOCAL_FILE_NAME
            is used (default: {None})
        random_state {int} --
            seed forwarded to DataFrame.sample to make the shuffle
            reproducible; None keeps the shuffle unseeded (default: {None})
    Returns:
        None
    """
    source_path = LOCAL_FILE_NAME if local_path is None else local_path

    # Strip the "tmp_" marker from the file name only. Replacing it in the
    # whole path would also rewrite directory names that contain "tmp_".
    directory, base_name = os.path.split(source_path)
    stem = os.path.splitext(base_name)[0]
    modified_file_name = os.path.join(directory, stem.replace("tmp_", "") + ".csv")

    feature_cols = [
        "product",
        "sub_product",
        "issue",
        "sub_issue",
        "consumer_complaint_narrative",
        "company",
        "state",
        "zip_code",
        "company_response",
        "timely_response",
        "consumer_disputed",
    ]
    df = pd.read_csv(source_path, usecols=feature_cols)

    # Keep only the complaints that come with a narrative.
    df = df[df["consumer_complaint_narrative"].notnull()]

    # Encode the label. drop + assign builds a new frame (no write into a
    # filtered slice) and leaves the encoded column last, as before.
    disputed = df["consumer_disputed"].map({"Yes": 1, "No": 0})
    df = df.drop(columns="consumer_disputed").assign(consumer_disputed=disputed)

    # Shuffle all rows.
    df = df.sample(frac=1, replace=False, random_state=random_state).reset_index(
        drop=True
    )
    # "XX" is a literal placeholder, not a pattern.
    df["zip_code"] = df["zip_code"].str.replace("XX", "00", regex=False)

    df.to_csv(modified_file_name, index=False)
    logging.info(f"CSV header updated and rewritten to {modified_file_name}")

In [6]:
if __name__ == "__main__":
    # Emit INFO-level records so the progress messages are shown.
    logging.basicConfig(level=logging.INFO)
    logging.info("Started")

    # Fetch the raw file, then write the processed copy.
    download_dataset()
    update_csv()

    # The raw download is only an intermediate artifact; delete it.
    os.remove(LOCAL_FILE_NAME)
    logging.info("Finished")

INFO:root:Started
INFO:urllib3.poolmanager:Redirecting http://bit.ly/building-ml-pipelines-dataset -> https://drive.google.com/uc?export=download&id=1VHjb8L8n2d6eLz_lA-F-bk6Z0UecHpEF
INFO:urllib3.poolmanager:Redirecting https://drive.google.com/uc?export=download&id=1VHjb8L8n2d6eLz_lA-F-bk6Z0UecHpEF -> https://doc-0o-8s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/jcbbce6b9hkrkp26hhegf03uq8fq6udd/1595592675000/06616860426990197454/*/1VHjb8L8n2d6eLz_lA-F-bk6Z0UecHpEF?e=download
INFO:root:Download completed.
INFO:root:CSV header updated and rewritten to /content/drive/My Drive/MLOps/data/consumer_data/consumer_complaints_with_narrative.csv
INFO:root:Finished
