In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam


# 1. Load the complaints data
# The path is resolved relative to the notebook's working directory;
# replace it with the actual location of the CSV if needed.
COMPLAINTS_CSV = "consumer_complaints_copy.csv"
df = pd.read_csv(COMPLAINTS_CSV)



  from pandas.core import (


In [2]:
# Show the loaded DataFrame as this cell's output for a quick visual check.
df

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,12-05-2014,Debt collection,Mortgage,Disclosure verification of debt,Not given enough info to verify debt,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30005,,,Referral,12-12-2014,Untimely response,No,No,1144671
1,11-10-2014,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",DE,19803,,,Referral,11/19/2014,Untimely response,No,No,1109287
2,08/26/2015,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30014,,,Referral,09-08-2015,Untimely response,No,No,1536776
3,01/16/2014,Debt collection,Mortgage,Disclosure verification of debt,Not given enough info to verify debt,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30087,,,Referral,02-11-2014,Untimely response,No,No,671539
4,06/25/2015,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,My mortgage company has misrepresented themsel...,,"1st 2nd Mortgage Company Of NJ, Inc.",NJ,074XX,,Consent provided,Web,07/22/2015,Closed,Yes,No,1437506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555952,01/26/2014,Debt collection,Non-federal student loan,Improper contact or sharing of info,Contacted employer after asked not to,,,Zwicker & Associates,MN,55428,,,Web,01/27/2014,Closed with non-monetary relief,Yes,No,685904
555953,01/26/2016,Debt collection,Non-federal student loan,Cont'd attempts collect debt not owed,Debt was discharged in bankruptcy,,,Zwicker & Associates,NJ,070XX,Older American,Consent provided,Web,02-10-2016,Closed with non-monetary relief,Yes,No,1759548
555954,03/31/2016,Debt collection,"Other (i.e. phone, health club, etc.)",Disclosure verification of debt,Not given enough info to verify debt,,,Zwicker & Associates,FL,33837,,,Referral,04-04-2016,Closed with explanation,Yes,No,1859430
555955,10/13/2015,Debt collection,Credit card,Disclosure verification of debt,Not given enough info to verify debt,,,Zwicker & Associates,FL,33308,,,Phone,10/13/2015,Closed with non-monetary relief,Yes,No,1603745


In [3]:
# 2. Preprocess text
# Replace missing issue/sub_issue values with empty strings before joining:
# concatenating a string with NaN yields NaN, and every entry of df["text"]
# must be a string for the tokenization step that follows.
df["sub_issue"] = df["sub_issue"].fillna("")
df["text"] = df["issue"].fillna("") + " " + df["sub_issue"]

# 3. Encode targets
# Missing categories are mapped to an explicit placeholder class so the
# encoders only ever see strings. NOTE(review): LabelEncoder's handling of
# mixed str/NaN input appears to have varied between scikit-learn releases;
# the placeholder removes that dependency.
MISSING_LABEL = "<missing>"
product_encoder = LabelEncoder()
subproduct_encoder = LabelEncoder()
df["Product Encoded"] = product_encoder.fit_transform(
    df["product"].fillna(MISSING_LABEL)
)
df["SubProduct Encoded"] = subproduct_encoder.fit_transform(
    df["sub_product"].fillna(MISSING_LABEL)
)



In [4]:
# 4. Tokenization
# The tokenizer is fitted on the combined issue/sub-issue text; num_words caps
# the vocabulary used when converting to sequences and unseen words map to
# the "<OOV>" token.
corpus = df["text"]
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
# Pad/truncate at the front so every row has exactly 100 token ids
# ("pre" is the pad_sequences default, spelled out here).
padded = pad_sequences(sequences, maxlen=100, padding="pre", truncating="pre")

# 5. One-hot encode targets
product_labels, subproduct_labels = (
    to_categorical(df[column])
    for column in ("Product Encoded", "SubProduct Encoded")
)



In [5]:
# 6. Train/test split
# The inputs and both one-hot target arrays are split together so that rows
# stay aligned; 20% is held out and the seed fixes the shuffle.
(
    X_train, X_test,
    y_train_prod, y_test_prod,
    y_train_sub, y_test_sub,
) = train_test_split(
    padded,
    product_labels,
    subproduct_labels,
    test_size=0.2,
    random_state=42,
)

# 7. Build the model
# Derive the input length and vocabulary size from the prepared data instead
# of repeating literals, so the network cannot drift out of sync with the
# tokenization/padding settings.
seq_len = X_train.shape[1]
# Token ids produced by the tokenizer are < num_words when it is set;
# otherwise fall back to the full vocabulary (+1 for the reserved index 0).
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

input_layer = Input(shape=(seq_len,))
x = Embedding(input_dim=vocab_size, output_dim=64)(input_layer)
x = LSTM(64, return_sequences=False)(x)  # keep only the final LSTM output
x = Dropout(0.5)(x)

# Both heads share the same LSTM representation.
# Output 1 - Product Category
output1 = Dense(product_labels.shape[1], activation='softmax', name="product_output")(x)

# Output 2 - Sub-Product Category
output2 = Dense(subproduct_labels.shape[1], activation='softmax', name="subproduct_output")(x)

model = Model(inputs=input_layer, outputs=[output1, output2])



In [6]:
# 8. Compile
# Each named output gets the same loss and metric; the dictionaries are keyed
# by the output layer names.
output_names = ("product_output", "subproduct_output")
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss={name: "categorical_crossentropy" for name in output_names},
    metrics={name: "accuracy" for name in output_names},
)

# 9. Train
# Targets are passed as a dict keyed by output layer name.
train_targets = {
    "product_output": y_train_prod,
    "subproduct_output": y_train_sub,
}
history = model.fit(
    x=X_train,
    y=train_targets,
    batch_size=32,
    epochs=10,
    validation_split=0.2,  # hold out 20% of the training data for validation
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# 10. Evaluate
# Ask for a name -> value mapping instead of unpacking a positional list:
# the length and order of the list returned by evaluate() for a multi-output
# model is easy to get wrong, whereas the metric names are explicit.
eval_metrics = model.evaluate(
    X_test,
    {'product_output': y_test_prod, 'subproduct_output': y_test_sub},
    return_dict=True,
)

# Keep the variable names this cell has always defined.
loss = eval_metrics["loss"]
# Per-output losses are looked up defensively (None when not reported).
prod_loss = eval_metrics.get("product_output_loss")
subprod_loss = eval_metrics.get("subproduct_output_loss")
prod_acc = eval_metrics["product_output_accuracy"]
subprod_acc = eval_metrics["subproduct_output_accuracy"]

print(f"Product Accuracy: {prod_acc:.2f}")
print(f"Sub-Product Accuracy: {subprod_acc:.2f}")


Product Accuracy: 0.99
Sub-Product Accuracy: 0.63
