In [2]:
# Mount Google Drive at /content/drive; the input and output paths used by the
# later cells all live under /content/drive/MyDrive.
# NOTE(review): relies on the google.colab package, so this cell presumably
# only runs inside a Colab runtime - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import re
from pathlib import Path

import pandas as pd

# 1. Load the Consumer Complaints dataset
# 'zipcode' is forced to text at read time: the later ZIP-code cleaning step
# wants strings so that codes starting with 0 stay intact, and converting
# after the fact cannot restore zeros that numeric type inference would
# already have dropped. Missing ZIP codes are still read as NaN.
consumer_complaints_path = "/content/drive/MyDrive/Project2/raw data/consumer_complaints.csv"
df_consumer = pd.read_csv(
    consumer_complaints_path,
    dtype={"zipcode": str},
    low_memory=False,
)

In [4]:
# 2. First look at the raw frame: its dimensions, its column names, and the
#    leading rows as returned by head()
print("Consumer Complaints Dataset Shape:", df_consumer.shape)
print("Columns:", list(df_consumer.columns))
print("Sample Rows:", df_consumer.head(), sep="\n")

# 3. Missing values: a blank 'tags' entry is replaced with the placeholder
#    text 'none' (the step is skipped when the frame has no 'tags' column)
if "tags" in df_consumer:
    df_consumer["tags"] = df_consumer["tags"].fillna(value="none")

# 4. Standardize string columns
# Trim surrounding whitespace, then lowercase 'company' and uppercase 'state'.
#
# Missing values are masked back in after the conversion. A bare astype(str)
# turns NaN into the literal text "nan" (and then "NAN" for 'state'), which
# hides the gap from isna()/dropna() and shows up as a bogus category in any
# later group-by. The .where(...notna()) call restores NaN on exactly the
# rows that were missing before the conversion; the right-hand side is fully
# evaluated against the original column before the assignment happens.
if "company" in df_consumer.columns:
    df_consumer["company"] = (
        df_consumer["company"]
        .astype(str)
        .str.strip()
        .str.lower()
        .where(df_consumer["company"].notna())
    )
if "state" in df_consumer.columns:
    df_consumer["state"] = (
        df_consumer["state"]
        .astype(str)
        .str.strip()
        .str.upper()
        .where(df_consumer["state"].notna())
    )

# 5. Date columns -> datetime
# Each listed column that is present gets parsed with pd.to_datetime;
# errors="coerce" means unparseable entries end up as NaT instead of raising.
date_columns = ["date_received", "date_sent_to_company"]
for col in date_columns:
    if col not in df_consumer.columns:
        continue
    df_consumer[col] = pd.to_datetime(df_consumer[col], errors="coerce")

# 6. ZIP codes
# Kept as text rather than numbers (a ZIP code may begin with 0): missing
# entries become "unknown", everything is cast to str, and surrounding
# whitespace is trimmed.
if "zipcode" in df_consumer:
    df_consumer["zipcode"] = (
        df_consumer["zipcode"]
        .fillna("unknown")
        .astype(str)
        .str.strip()
    )

# 7. Numeric columns (e.g., complaint_id)
# Cast to a numeric dtype; with errors="coerce" any value that is not a valid
# number becomes NaN instead of raising.
if "complaint_id" in df_consumer:
    df_consumer["complaint_id"] = df_consumer["complaint_id"].pipe(
        pd.to_numeric, errors="coerce"
    )

# 8. Show the leading rows again so the effect of the cleaning steps is visible
print("\nAfter Preprocessing:", df_consumer.head(), sep="\n")


Consumer Complaints Dataset Shape: (555957, 18)
Columns: ['date_received', 'product', 'sub_product', 'issue', 'sub_issue', 'consumer_complaint_narrative', 'company_public_response', 'company', 'state', 'zipcode', 'tags', 'consumer_consent_provided', 'submitted_via', 'date_sent_to_company', 'company_response_to_consumer', 'timely_response', 'consumer_disputed?', 'complaint_id']
Sample Rows:
  date_received           product               sub_product  \
0    08/30/2013          Mortgage            Other mortgage   
1    08/30/2013          Mortgage            Other mortgage   
2    08/30/2013  Credit reporting                       NaN   
3    08/30/2013      Student loan  Non-federal student loan   
4    08/30/2013   Debt collection               Credit card   

                                      issue  \
0  Loan modification,collection,foreclosure   
1  Loan servicing, payments, escrow account   
2    Incorrect information on credit report   
3                        Repaying your l

In [8]:
# 9. (Optional) Save the cleaned dataset (sampled) for later use

# Share of rows to keep, and the seed that makes the random draw repeatable
SAMPLE_FRACTION = 0.3
SAMPLE_SEED = 42
df_consumer_sampled = df_consumer.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

# Output path; the .gz extension matches the gzip compression requested below
cleaned_consumer_sampled_path = "/content/drive/MyDrive/Project2/clean_data/consumer_complaints_cleaned_sampled.csv.gz"

# to_csv does not create missing folders, so make sure the target directory
# exists first (a no-op when it is already there)
Path(cleaned_consumer_sampled_path).parent.mkdir(parents=True, exist_ok=True)

# Write the sample without the index column, gzip-compressed
df_consumer_sampled.to_csv(cleaned_consumer_sampled_path, index=False, compression="gzip")
print(f"\nCleaned Consumer Complaints dataset (sampled) saved with gzip compression to: {cleaned_consumer_sampled_path}")



Cleaned Consumer Complaints dataset (sampled) saved with gzip compression to: /content/drive/MyDrive/Project2/clean_data/consumer_complaints_cleaned_sampled.csv.gz
