In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")



# Load the training data
df = pd.read_csv("consumer_complaints_copy.csv")
df

  from pandas.core import (


Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,12-05-2014,Debt collection,Mortgage,Disclosure verification of debt,Not given enough info to verify debt,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30005,,,Referral,12-12-2014,Untimely response,No,No,1144671
1,11-10-2014,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",DE,19803,,,Referral,11/19/2014,Untimely response,No,No,1109287
2,08/26/2015,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30014,,,Referral,09-08-2015,Untimely response,No,No,1536776
3,01/16/2014,Debt collection,Mortgage,Disclosure verification of debt,Not given enough info to verify debt,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30087,,,Referral,02-11-2014,Untimely response,No,No,671539
4,06/25/2015,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,My mortgage company has misrepresented themsel...,,"1st 2nd Mortgage Company Of NJ, Inc.",NJ,074XX,,Consent provided,Web,07/22/2015,Closed,Yes,No,1437506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555952,01/26/2014,Debt collection,Non-federal student loan,Improper contact or sharing of info,Contacted employer after asked not to,,,Zwicker & Associates,MN,55428,,,Web,01/27/2014,Closed with non-monetary relief,Yes,No,685904
555953,01/26/2016,Debt collection,Non-federal student loan,Cont'd attempts collect debt not owed,Debt was discharged in bankruptcy,,,Zwicker & Associates,NJ,070XX,Older American,Consent provided,Web,02-10-2016,Closed with non-monetary relief,Yes,No,1759548
555954,03/31/2016,Debt collection,"Other (i.e. phone, health club, etc.)",Disclosure verification of debt,Not given enough info to verify debt,,,Zwicker & Associates,FL,33837,,,Referral,04-04-2016,Closed with explanation,Yes,No,1859430
555955,10/13/2015,Debt collection,Credit card,Disclosure verification of debt,Not given enough info to verify debt,,,Zwicker & Associates,FL,33308,,,Phone,10/13/2015,Closed with non-monetary relief,Yes,No,1603745


In [2]:
df.shape

(555957, 18)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555957 entries, 0 to 555956
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   date_received                 555957 non-null  object
 1   product                       555957 non-null  object
 2   sub_product                   397635 non-null  object
 3   issue                         555957 non-null  object
 4   sub_issue                     212622 non-null  object
 5   consumer_complaint_narrative  66806 non-null   object
 6   company_public_response       85124 non-null   object
 7   company                       555957 non-null  object
 8   state                         551070 non-null  object
 9   zipcode                       551452 non-null  object
 10  tags                          77959 non-null   object
 11  consumer_consent_provided     123458 non-null  object
 12  submitted_via                 555957 non-null  object
 13 

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Download required NLTK data
nltk.download('punkt')            #tokenization
nltk.download('stopwords')
nltk.download('wordnet')         # database for lemmatization



def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
      
        text = text.lower()  # Convert to lowercase
        tokens = nltk.word_tokenize(text)  # Tokenize
        
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
        
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
        return " ".join(tokens)
    else:
        return ""  # Return empty string for non-string inputs

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\razas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\razas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\razas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
df['issue'] = df['issue'].astype(str).apply(preprocess_text)
df['sub_issue'] = df['sub_issue'].astype(str).apply(preprocess_text)


# Combine text columns
df['combined_text'] = df['issue'] + ' ' + df['sub_issue'] 



In [14]:
from gensim.models import Word2Vec
import gensim


# Train Word2Vec model
word2vec_model = Word2Vec(sentences=df['combined_text'], vector_size=100, window=5, min_count=1, workers=4)

# Convert text to vector representation
def get_vector(words, model):
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

df['vectorized_text'] = df['combined_text'].apply(lambda x: get_vector(x, word2vec_model))

# Convert to feature matrix
X = np.vstack(df['vectorized_text'].values)
#y = df['product']  # Replace with actual target column


In [15]:
# Encode target labels
df["product_encoded"], product_labels = pd.factorize(df["product"])


df["sub_product"].fillna("Unknown", inplace=True)  # Replace NaNs with "Unknown"
df["sub_product_encoded"], sub_product_labels = pd.factorize(df["sub_product"])


In [16]:
from sklearn.model_selection import train_test_split

# Split data for product and sub-product classification
X_train, X_test, y_train_product, y_test_product, y_train_sub_product, y_test_sub_product = train_test_split(
    X, df["product_encoded"],df["sub_product_encoded"], test_size=0.2, random_state=42
)



In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Train Random Forest for Product Prediction
product_model = RandomForestClassifier(n_estimators=100, random_state=42)
product_model.fit(X_train, y_train_product)

# Train Random Forest for Sub-Product Prediction
sub_product_model = RandomForestClassifier(n_estimators=100, random_state=42)
sub_product_model.fit(X_train, y_train_sub_product)

# Predict and Evaluate Product Model
y_pred_product = product_model.predict(X_test)
accuracy_product = accuracy_score(y_test_product, y_pred_product)
print(f"Product Classification Accuracy: {accuracy_product:.4f}")

# Classification Report for Product
print("\nProduct Classification Report:")
print(classification_report(y_test_product, y_pred_product))

# Predict and Evaluate Sub-Product Model
y_pred_sub_product = sub_product_model.predict(X_test)
accuracy_sub_product = accuracy_score(y_test_sub_product, y_pred_sub_product)
print(f"Sub-Product Classification Accuracy: {accuracy_sub_product:.4f}")

# Classification Report for Sub-Product
print("\nSub-Product Classification Report:")
print(classification_report(y_test_sub_product, y_pred_sub_product))


Product Classification Accuracy: 0.9890

Product Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20200
           1       1.00      0.98      0.99     37377
           2       1.00      1.00      1.00      4097
           3       1.00      1.00      1.00     12541
           4       1.00      1.00      1.00     18358
           5       1.00      1.00      1.00       851
           6       1.00      0.71      0.83       117
           7       1.00      1.00      1.00      3128
           8       0.81      0.73      0.77       785
           9       1.00      0.75      0.86       495
          10       0.92      1.00      0.96     13243

    accuracy                           0.99    111192
   macro avg       0.98      0.92      0.95    111192
weighted avg       0.99      0.99      0.99    111192

Sub-Product Classification Accuracy: 0.6317

Sub-Product Classification Report:
              precision    recall  