In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import nltk
import re
from nltk.corpus import stopwords
import string

In [None]:
df = pd.read_parquet("/content/drive/MyDrive/Datasets/PreProcessedData.parquet")

In [None]:
df.head()

Unnamed: 0,Date_Received,Product,Sub_Product,Issue,Sub_Issue,Complaint
0,2022-11-15,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,I started the dispute process with this compan...
1,2022-08-19,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Investigation took more than 30 days,"This is NOT A DUPLICATE and I, XXXX XXXX XXXX ..."
2,2022-11-03,Vehicle loan or lease,Loan,Struggling to pay your loan,Lender trying to repossess or disable the vehicle,i made a pyment arrangement several time with ...
3,2022-09-07,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,Earlier this year I made the decision to take ...
4,2022-10-27,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,Citibank offered a personal and unsecured loan...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 775522 entries, 0 to 775521
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Date_Received  775522 non-null  object
 1   Product        775522 non-null  object
 2   Sub_Product    775522 non-null  object
 3   Issue          775522 non-null  object
 4   Sub_Issue      775522 non-null  object
 5   Complaint      775522 non-null  object
dtypes: object(6)
memory usage: 35.5+ MB


# Text Processing

In [None]:
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))

def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

df["Complaint"] = df["Complaint"].apply(clean)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Save the Text Processed data

In [None]:
df.to_parquet("/content/drive/MyDrive/Datasets/textProcessedData.parquet")

# Load the Text processed data

In [2]:
df = pd.read_parquet("/content/drive/MyDrive/Datasets/textProcessedData.parquet", columns=["Product", "Complaint"])

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 775522 entries, 0 to 775521
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Product    775522 non-null  object
 1   Complaint  775522 non-null  object
dtypes: object(2)
memory usage: 11.8+ MB


In [None]:
df.head()

Unnamed: 0,Product,Complaint
0,"Credit reporting, credit repair services, or o...",start disput process compani inaccur inform re...
1,"Credit reporting, credit repair services, or o...",duplic xxxx xxxx xxxx social secur xxxx xxxx f...
2,Vehicle loan or lease,made pyment arrang sever time repres car still...
3,"Credit reporting, credit repair services, or o...",earlier year made decis take look report later...
4,"Credit reporting, credit repair services, or o...",citibank offer person unsecur loan xxxxxxxx m...


# Train Test Split

In [3]:
x = np.array(df['Complaint'])
y = np.array(df['Product'])

cv = CountVectorizer()
X = cv.fit_transform(x)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Training

In [13]:
sgdmodel = SGDClassifier()
sgdmodel.fit(X_train, y_train)

In [None]:
user = input("Enter a Text: ")
data = cv.transform([user]).toarray()
output = sgdmodel.predict(data)
print(output)

Enter a Text: Investigation took more than 30 days and nothing was changed when clearly there are misleading, incorrect, inaccurate items on my credit report..i have those two accounts attached showing those inaccuracies... I need them to follow the law because this is a violation of my rights!! The EVIDENCE IS IN BLACK AND WHITE ....
['Credit reporting, credit repair services, or other personal consumer reports']


# Model Evaluation


In [14]:
y_pred = sgdmodel.predict(X_test)

# Accuracy

In [15]:
accuracy = accuracy_score(y_test, y_pred)

In [16]:
accuracy

0.8537333494840245

# Precision

In [None]:
macro_averaged_precision = metrics.precision_score(y_test, y_pred, average = 'macro')
print(f"Macro-Averaged Precision score: {macro_averaged_precision}")


Macro-Averaged Precision score: 0.7530859532794468


In [None]:
micro_averaged_precision = metrics.precision_score(y_test, y_pred, average = 'micro')
print(f"Micro-Averaged Precision score: {micro_averaged_precision}")

Micro-Averaged Precision score: 0.8542842964485412


# Recall

In [None]:
macro_averaged_recall = metrics.recall_score(y_test, y_pred, average = 'macro')
print(f"Macro-averaged recall score using sklearn : {macro_averaged_recall}")

Macro-averaged recall score using sklearn : 0.6625029408439762


In [None]:
micro_averaged_recall = metrics.recall_score(y_test, y_pred, average = 'micro')
print(f"Micro-Averaged recall score using sklearn library : {micro_averaged_recall}")

Micro-Averaged recall score using sklearn library : 0.8542842964485412


# F1 Score

In [None]:
macro_averaged_f1 = metrics.f1_score(y_test, y_pred, average = 'macro')
print(f"Macro-Averaged F1 score using sklearn library : {macro_averaged_f1}")

Macro-Averaged F1 score using sklearn library : 0.6884643860467197


In [None]:
micro_averaged_f1 = metrics.f1_score(y_test, y_pred, average = 'micro')
print(f"Micro-Averaged F1 score using sklearn library : {micro_averaged_f1}")

Micro-Averaged F1 score using sklearn library : 0.8542842964485412


# Classification Report

In [None]:
cr = classification_report(y_test, y_pred)
print(cr)

                                                                              precision    recall  f1-score   support

                                                 Checking or savings account       0.86      0.84      0.85     17921
                                                 Credit card or prepaid card       0.83      0.74      0.78     26406
Credit reporting, credit repair services, or other personal consumer reports       0.88      0.92      0.90    132805
                                                             Debt collection       0.81      0.83      0.82     61230
                                   Payday loan, title loan, or personal loan       0.33      0.01      0.03       339
                                                                Student loan       0.91      0.76      0.82     10695
                                                       Vehicle loan or lease       0.81      0.48      0.60      6527

                                                      