<a href="https://colab.research.google.com/github/SpandanaKalakonda/Building_ML_Pipelines/blob/main/converting_csvdata_to_tfrecord.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install tfx

In [None]:
# Clone the building-machine-learning-pipelines repository; the next cell runs
# its utils/download_dataset.py script from inside this checkout.
!git clone https://github.com/Building-ML-Pipelines/building-machine-learning-pipelines.git

In [None]:
# Run the repository's download script from the checkout root (it is invoked
# via a path relative to that directory), then return to the parent directory.
# NOTE(review): presumably this is what places the CSV under data/ that the
# later cells read — confirm against utils/download_dataset.py.
%cd building-machine-learning-pipelines/
!python3 utils/download_dataset.py
%cd ..

In [49]:
import pandas as pd

In [33]:
import os
import csv
import tensorflow as tf
from tqdm import tqdm

In [50]:
# Load the complaints CSV into a DataFrame for the exploration cells below.
# NOTE(review): the absolute /content/... path assumes the repository was
# cloned into /content (e.g. a Colab session) — adjust when running elsewhere.
df = pd.read_csv("/content/building-machine-learning-pipelines/data/consumer_complaints_with_narrative.csv")

In [51]:
# Column names, non-null counts and dtypes: shows which columns contain
# missing values and which were parsed as strings (object) vs. integers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66799 entries, 0 to 66798
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   product                       66799 non-null  object
 1   sub_product                   46347 non-null  object
 2   issue                         66799 non-null  object
 3   sub_issue                     32931 non-null  object
 4   consumer_complaint_narrative  66799 non-null  object
 5   company                       66799 non-null  object
 6   state                         66613 non-null  object
 7   zip_code                      66610 non-null  object
 8   company_response              66799 non-null  object
 9   timely_response               66799 non-null  object
 10  consumer_disputed             66799 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 5.6+ MB


In [58]:
# Number of rows per distinct `product` value.
df['product'].value_counts()

Debt collection            17552
Mortgage                   14917
Credit reporting           12525
Credit card                 7927
Bank account or service     5710
Consumer Loan               3677
Student loan                2128
Prepaid card                 861
Payday loan                  726
Money transfers              666
Other financial service      110
Name: product, dtype: int64

In [59]:
# Number of rows per distinct `sub_product` value (missing values are not counted).
df['sub_product'].value_counts()

Conventional fixed mortgage               6638
Other (i.e. phone, health club, etc.)     5253
Checking account                          3819
I do not know                             3795
Credit card                               3517
FHA mortgage                              2776
Medical                                   2725
Non-federal student loan                  2456
Conventional adjustable mortgage (ARM)    2378
Vehicle loan                              2064
Payday loan                               1658
Other mortgage                            1410
Other bank product/service                1353
Installment loan                          1182
Home equity loan or line of credit         947
VA mortgage                                639
General purpose card                       439
Auto                                       399
Domestic (US) money transfer               379
Mortgage                                   366
Savings account                            363
Vehicle lease

In [60]:
# Number of rows per distinct `issue` value.
df['issue'].value_counts()

Incorrect information on credit report      8351
Cont'd attempts collect debt not owed       7520
Loan servicing, payments, escrow account    6553
Loan modification,collection,foreclosure    5162
Disclosure verification of debt             2912
                                            ... 
Shopping for a line of credit                  4
Lost or stolen money order                     4
Lender damaged or destroyed vehicle            1
Incorrect exchange rate                        1
Lender sold the property                       1
Name: issue, Length: 90, dtype: int64

In [61]:
# Number of rows per distinct `sub_issue` value (missing values are not counted).
df['sub_issue'].value_counts()

Debt is not mine                            4558
Information is not mine                     2660
Account status                              2255
Debt was paid                               1960
Not given enough info to verify debt        1645
                                            ... 
Contacted me instead of my attorney           32
Receiving unwanted marketing/advertising      29
Report shared with employer w/o consent       28
Qualify for a better loan than offered        22
Received marketing offer after opted out      21
Name: sub_issue, Length: 64, dtype: int64

In [63]:
# Preview the first five free-text complaint narratives.
df['consumer_complaint_narrative'][:5]

0    I was denied employment because of a judgment ...
1    I have a credit card through XXXX XXXX and XXX...
2    Almost daily phone calls from Stellar Recovery...
3    I submitted my monthly mortgage payment to Pri...
4    I contacted America Education Services in XX/X...
Name: consumer_complaint_narrative, dtype: object

In [65]:
# Number of rows per distinct `company` value.
df['company'].value_counts()

Equifax                                   4195
Experian                                  3933
TransUnion Intermediate Holdings, Inc.    3863
Bank of America                           3497
Wells Fargo & Company                     3074
                                          ... 
Bristlecone, Inc.                            1
DAS Acquisition Company, LLC                 1
Skopos Financial, LLC                        1
Locate Services LLC                          1
JBS Finance, Inc.                            1
Name: company, Length: 2296, dtype: int64

In [69]:
# The five most frequent `state` values (value_counts sorts by descending count).
df['state'].value_counts()[:5]

CA    9981
FL    5956
TX    5700
NY    3817
GA    3111
Name: state, dtype: int64

In [71]:
# Number of rows per distinct `company_response` value.
df['company_response'].value_counts()

Closed with explanation            50921
Closed with non-monetary relief     8607
Closed with monetary relief         4974
Closed                              1766
Untimely response                    531
Name: company_response, dtype: int64

In [72]:
# Number of rows per distinct `timely_response` value.
df['timely_response'].value_counts()

Yes    64631
No      2168
Name: timely_response, dtype: int64

In [56]:
# Number of rows per `consumer_disputed` value; this is the only integer column.
df['consumer_disputed'].value_counts()

0    51224
1    15575
Name: consumer_disputed, dtype: int64

In [52]:
# Number of rows per distinct `zip_code` value. The values are strings whose
# trailing digits are masked with "X" (e.g. "300XX"), so they need unmasking
# before they can be stored as integers.
df["zip_code"].value_counts()

300XX    1026
750XX     792
770XX     747
900XX     719
945XX     703
         ... 
893XX       1
205XX       1
792XX       1
556XX       1
584XX       1
Name: zip_code, Length: 904, dtype: int64

In [44]:
def _bytes_feature(value):
  """Wrap a single string in a ``tf.train.Feature`` holding a ``BytesList``.

  Args:
    value: The text to store. A ``str`` is encoded as UTF-8; ``bytes`` are
      stored unchanged.

  Returns:
    A ``tf.train.Feature`` whose ``bytes_list`` contains exactly one element.
  """
  # Only str needs encoding; calling .encode() on bytes would raise
  # AttributeError, so already-encoded values are passed through as they are.
  if isinstance(value, str):
    value = value.encode("utf-8")
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

In [35]:
def _float_feature(value):
  """Wrap a single number in a ``tf.train.Feature`` holding a one-element ``FloatList``."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

In [54]:
def _int64_feature(value):
  """Wrap a single integer in a ``tf.train.Feature`` holding a one-element ``Int64List``."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

In [40]:
def clean_rows(row):
  """Give rows without a zip code the placeholder value ``"99999"``.

  Args:
    row: Mapping for one CSV record; it must contain a ``"zip_code"`` key.

  Returns:
    The same ``row`` object. It is modified in place when its zip code is
    empty (or otherwise falsy) and left untouched when a zip code is present.
  """
  has_zip_code = bool(row["zip_code"])
  if has_zip_code:
    return row
  # Placeholder so the later integer conversion has digits to parse.
  row["zip_code"] = "99999"
  return row

In [47]:
def convert_zipcode_to_int(zipcode):
  """Convert a (possibly masked) zip code to an integer.

  Masked digits are written as ``X`` in the data (e.g. ``"300XX"``) and are
  replaced by ``0`` before conversion. Each ``X`` is replaced individually,
  so values with an odd number of masked digits such as ``"30XXX"`` or
  ``"XXXXX"`` convert as well instead of raising ``ValueError``.

  Args:
    zipcode: The zip code as a string, or any value ``int()`` accepts directly.

  Returns:
    The zip code as an ``int``.

  Raises:
    ValueError: If the value is still not a valid integer literal after
      unmasking (for example an empty string).
  """
  if isinstance(zipcode, str):
    # Replacing pairs ("XX" -> "00") would leave a stray "X" behind whenever
    # the masked run has odd length; replacing single characters cannot.
    zipcode = zipcode.replace("X", "0")
  return int(zipcode)

In [42]:
# Input CSV to convert and the TFRecord file to produce. The output name is
# relative, so the file is created in the current working directory.
original_data_file = "/content/building-machine-learning-pipelines/data/consumer_complaints_with_narrative.csv"
tfrecord_filename = "consumer_complaints.tfrecord"
# The writer is created here but written to and closed by the conversion cell
# that follows, so this cell has to be re-run before re-running that one.
tf_record_writer = tf.io.TFRecordWriter(tfrecord_filename)

In [55]:
# Convert every CSV row into a serialized tf.train.Example and append it to the
# TFRecord file behind `tf_record_writer`.
# - The writer is used as a context manager so it is flushed and closed even
#   when a row fails to convert part-way through the file.
# - newline="" is what the csv module requires of the file object so that line
#   breaks embedded in quoted fields are handed to the parser unmodified.
# - The encoding is pinned instead of taken from the platform default.
#   NOTE(review): assumes the CSV is UTF-8 — confirm against the dataset.
with tf_record_writer, open(original_data_file, newline="", encoding="utf-8") as csv_file:
  reader = csv.DictReader(csv_file, delimiter=",", quotechar='"')
  for row in tqdm(reader):
    # Rows without a zip code get a placeholder so the int conversion works.
    row = clean_rows(row)
    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                "product": _bytes_feature(row["product"]),
                "sub_product": _bytes_feature(row["sub_product"]),
                "issue": _bytes_feature(row["issue"]),
                "sub_issue": _bytes_feature(row["sub_issue"]),
                "state": _bytes_feature(row["state"]),
                # The only feature stored as an integer: masked digits are
                # unmasked and the result is written as int64.
                "zip_code": _int64_feature(convert_zipcode_to_int(row["zip_code"])),
                "company": _bytes_feature(row["company"]),
                "company_response": _bytes_feature(row["company_response"]),
                "consumer_complaint_narrative": _bytes_feature(row["consumer_complaint_narrative"]),
                "timely_response": _bytes_feature(row["timely_response"]),
                # NOTE(review): csv.DictReader yields strings, so this value is
                # written as a bytes feature rather than an integer. Kept
                # as-is so the record schema does not change for consumers.
                "consumer_disputed": _bytes_feature(row["consumer_disputed"]),
            }
        )
    )
    tf_record_writer.write(example.SerializeToString())

66799it [00:11, 5827.17it/s]
