In [371]:
import pandas as pd
import numpy as np

In [372]:
df = pd.read_csv("data/raw/complaints_2018-01-01_2024-09-27_BOA.csv")

In [373]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73845 entries, 0 to 73844
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Date received                 73845 non-null  object 
 1   Product                       73845 non-null  object 
 2   Sub-product                   73837 non-null  object 
 3   Issue                         73844 non-null  object 
 4   Sub-issue                     62803 non-null  object 
 5   Consumer complaint narrative  33681 non-null  object 
 6   Company public response       71777 non-null  object 
 7   Company                       73845 non-null  object 
 8   State                         71926 non-null  object 
 9   ZIP code                      72474 non-null  object 
 10  Tags                          10897 non-null  object 
 11  Consumer consent provided?    54177 non-null  object 
 12  Submitted via                 73845 non-null  object 
 13  D

### 0 Drop columns with 100% missing values

In [374]:
df.drop(columns=['Consumer disputed?'], inplace=True)

### 1 Change column names

In [375]:
df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('?', '', regex=False)
df.columns = df.columns.str.replace('-', '_')

### 2 Change data type

In [376]:
df['date_received'] = pd.to_datetime(df.date_received)
df['date_sent_to_company'] = pd.to_datetime(df['date_sent_to_company'])

  df['date_received'] = pd.to_datetime(df.date_received)
  df['date_sent_to_company'] = pd.to_datetime(df['date_sent_to_company'])


In [377]:
df.dtypes

date_received                   datetime64[ns]
product                                 object
sub_product                             object
issue                                   object
sub_issue                               object
consumer_complaint_narrative            object
company_public_response                 object
company                                 object
state                                   object
zip_code                                object
tags                                    object
consumer_consent_provided               object
submitted_via                           object
date_sent_to_company            datetime64[ns]
company_response_to_consumer            object
timely_response                         object
complaint_id                             int64
dtype: object

### 3 Impute missing values

In [378]:
df.isna().sum()

date_received                       0
product                             0
sub_product                         8
issue                               1
sub_issue                       11042
consumer_complaint_narrative    40164
company_public_response          2068
company                             0
state                            1919
zip_code                         1371
tags                            62948
consumer_consent_provided       19668
submitted_via                       0
date_sent_to_company                0
company_response_to_consumer        0
timely_response                     0
complaint_id                        0
dtype: int64

In [379]:
df.fillna({
    'sub_product': 'Unknown',
    'sub_issue': 'Unknown',
    'consumer_complaint_narrative': 'No Narrative',
    'company_public_response': 'Unknown',
    'state': 'Unknown',
    'zip_code': 'Unknown',
    'tags': 'No Tags',
    'consumer_consent_provided': 'No Consent Provided',
    'consumer_disputed': 'Unknown'
}, inplace=True)

In [380]:
df.isna().sum()

date_received                   0
product                         0
sub_product                     0
issue                           1
sub_issue                       0
consumer_complaint_narrative    0
company_public_response         0
company                         0
state                           0
zip_code                        0
tags                            0
consumer_consent_provided       0
submitted_via                   0
date_sent_to_company            0
company_response_to_consumer    0
timely_response                 0
complaint_id                    0
dtype: int64

### 4 Feature Engineering

In [381]:
df['year_month'] = df['date_received'].dt.strftime('%Y-%m')

In [382]:
df['timely_response'].value_counts(normalize=True)

timely_response
Yes    0.965414
No     0.034586
Name: proportion, dtype: float64

In [383]:
df['timely_response'] = df['timely_response'].map({'Yes': 1, 'No': 0})

In [384]:
# we don't need 'company' column for this analysis because we are only dealing with BOA dataset
df.drop(columns=['company'], axis=1, inplace=True)

### 5 Extract the census data

In [385]:
df_census = pd.read_csv("data/raw/census_population_by_state.csv")

In [386]:
df_census.head()

Unnamed: 0,State,2020,2021,2022,2023
0,.Alabama,5031864,5050380,5073903,5108468
1,.Alaska,732964,734923,733276,733406
2,.Arizona,7186683,7272487,7365684,7431344
3,.Arkansas,3014348,3028443,3046404,3067732
4,.California,39503200,39145060,39040616,38965193


In [387]:
df_census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   State   51 non-null     object
 1   2020    51 non-null     object
 2   2021    51 non-null     object
 3   2022    51 non-null     object
 4   2023    51 non-null     object
dtypes: object(5)
memory usage: 2.1+ KB


In [388]:
# convert columns names to integers

df_census.columns = [int(col) if col.isdigit() else col for col in df_census.columns]

# remove the "." in front of the states name

df_census["State"] = df_census["State"].str.lstrip(".")

# convert data types for popluation values to int

for col in range(2020, 2024):
    df_census[col] = df_census[col].str.replace(',', '')  # Remove commas
    df_census[col] = df_census[col].astype(int)

df_census.head()

Unnamed: 0,State,2020,2021,2022,2023
0,Alabama,5031864,5050380,5073903,5108468
1,Alaska,732964,734923,733276,733406
2,Arizona,7186683,7272487,7365684,7431344
3,Arkansas,3014348,3028443,3046404,3067732
4,California,39503200,39145060,39040616,38965193


In [389]:
# convert the table to long format to integrate year into one column
df_census_long = pd.melt(df_census, id_vars=['State'], value_vars=[2020, 2021, 2022, 2023],
                  var_name='Year', value_name='Population')

In [390]:
df_census_long['State'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [391]:
state_to_abbreviation = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
    'District of Columbia': 'DC'
}

In [392]:
# Replace the 'State' column with the abbreviations in place
df_census_long['State'] = df_census_long['State'].map(state_to_abbreviation)
df_census_long.head()

Unnamed: 0,State,Year,Population
0,AL,2020,5031864
1,AK,2020,732964
2,AZ,2020,7186683
3,AR,2020,3014348
4,CA,2020,39503200


In [393]:
# change all column names to lower cases
df_census_long.columns = [x.lower() for x in df_census_long.columns]

### 6 Save results

In [394]:
df.columns

Index(['date_received', 'product', 'sub_product', 'issue', 'sub_issue',
       'consumer_complaint_narrative', 'company_public_response', 'state',
       'zip_code', 'tags', 'consumer_consent_provided', 'submitted_via',
       'date_sent_to_company', 'company_response_to_consumer',
       'timely_response', 'complaint_id', 'year_month'],
      dtype='object')

In [395]:
df_census_long.columns

Index(['state', 'year', 'population'], dtype='object')

In [396]:
df.to_csv("data/processed/complaints_transformed_2.csv", index=False)

In [397]:
df_census_long.to_csv("data/processed/census_population_by_state.csv", index=False)