# Exploratory Data Analysis and Feature Engineering

In [1]:
import pandas as pd

# Read the complaints export; the relative path is resolved against the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='Consumer_Complaints.csv')

# Show the first rows, then the number of complaints per product.
print(
    data.head(),
    data['Product'].value_counts(),
    sep='\n',
)


  Date received                                            Product  \
0    2023-08-25  Credit reporting or other personal consumer re...   
1    2023-08-25  Credit reporting or other personal consumer re...   
2    2023-08-23  Credit reporting, credit repair services, or o...   
3    2023-08-23  Credit reporting, credit repair services, or o...   
4    2023-08-23  Credit reporting, credit repair services, or o...   

        Sub-product                                              Issue  \
0  Credit reporting               Incorrect information on your report   
1  Credit reporting                        Improper use of your report   
2  Credit reporting  Problem with a credit reporting company's inve...   
3  Credit reporting  Problem with a credit reporting company's inve...   
4  Credit reporting  Problem with a credit reporting company's inve...   

                                           Sub-issue  \
0                Information belongs to someone else   
1  Credit inquiries on

In [2]:
# Integer class label for each product the classifier should learn.
# Products that are not listed here map to NaN and are dropped below.
# NOTE(review): the preview output shows a product beginning with
# "Credit reporting or other personal consumer re...", which is not a key
# here, so those rows are discarded -- confirm that this is intended.
category_map = {
    'Credit reporting, credit repair services, or other personal consumer reports': 0,
    'Debt collection': 1,
    'Consumer Loan': 2,
    'Mortgage': 3,
}

data['Category'] = data['Product'].map(category_map)

# Keep only the rows with a known label. The explicit .copy() detaches the
# result from the unfiltered frame so that later column assignments do not
# raise SettingWithCopyWarning.
data = data[data['Category'].notna()].copy()

# .map() yields floats because unmapped rows were NaN; with those rows gone
# the labels can be stored as plain integers.
data['Category'] = data['Category'].astype(int)


# Text Pre-Processing

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages needed by this notebook: 'punkt' for the
# word_tokenize import above and 'stopwords' for the stop-word corpus.
# Each call returns a status flag; the last one is shown as the cell output.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/liraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/liraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
stemmer = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the comprehension rebuilt the list for every token and searched it linearly;
# a frozenset gives constant-time membership tests with the same result.
STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a complaint narrative for bag-of-words vectorisation.

    The text is lower-cased, split into tokens with ``word_tokenize``,
    stripped of English stop words, and every remaining token is reduced
    to its Porter stem.

    Args:
        text: Raw narrative string.

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    # Stop words are matched on the lower-cased token, before stemming.
    stems = [stemmer.stem(token) for token in tokens if token not in STOP_WORDS]
    return ' '.join(stems)
# Treat missing narratives as empty strings, then normalise every narrative
# with preprocess_text; the column is overwritten with the cleaned text.
data['Consumer complaint narrative'] = (
    data['Consumer complaint narrative']
    .fillna('')
    .apply(preprocess_text)
)


# Selection of Multi-Class Classification Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the complaints for testing; the fixed seed keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['Consumer complaint narrative'],
    data['Category'],
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary from the training narratives only, then encode both
# splits as token-count matrices using that vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Baseline classifier trained on the token counts.
model = MultinomialNB()
model.fit(X_train, y_train)


 # Comparison of model performance

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Two additional classifiers to compare against the MultinomialNB baseline.
model_lr = LogisticRegression()
model_svc = LinearSVC()

# Train both on the same vectorised training split.
for clf in (model_lr, model_svc):
    clf.fit(X_train, y_train)

# Report the held-out accuracy of every model, baseline first.
for label, clf in (
    ('MultinomialNB:', model),
    ('LogisticRegression:', model_lr),
    ('LinearSVC:', model_svc),
):
    print(label, clf.score(X_test, y_test))


# Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out complaints with the baseline model.
y_pred = model.predict(X_test)

# Fraction of held-out complaints whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)


 # Prediction

In [None]:
# Example narrative to classify. The original literal opened with a single
# quote and closed with a double quote, which is a SyntaxError; double quotes
# on both sides also remove the need to escape the apostrophe.
complaint = "I've been wanting to buy a new car, but the bank is not assisting me with the loan"

# Apply the same normalisation that was used on the training narratives.
complaint_clean = preprocess_text(complaint)

# Encode with the already-fitted vocabulary; transform() expects an iterable
# of documents, hence the one-element list.
complaint_vector = vectorizer.transform([complaint_clean])

# predict() returns an array with one label per input document.
category = model.predict(complaint_vector)

print('Category:', category)
