In [1]:
import os

import pandas as pd

# Location of the complaints CSV. The default is the original hard-coded local
# path; set the COMPLAINTS_CSV environment variable to point at the file on
# another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "COMPLAINTS_CSV",
    "E:\\1JOB\\Kaiburr\\ML\\complaints.csv\\Complaints_SpacyProcessed.csv",
)

# Only the label column and the complaint text are loaded.
df = pd.read_csv(DATA_PATH, usecols=["Category", "Consumer complaint narrative"])

In [2]:
# Summarise the loaded frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357276 entries, 0 to 357275
Data columns (total 2 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Category                      357276 non-null  int64 
 1   Consumer complaint narrative  357276 non-null  object
dtypes: int64(1), object(1)
memory usage: 5.5+ MB


In [3]:
# Count missing values per column.
df.isnull().sum()

Category                        0
Consumer complaint narrative    0
dtype: int64

# Preprocessing

In [4]:
import spacy
import re

# Load the small English spaCy model. Only lemmas and stop-word flags are read
# from the processed documents, so the dependency parser and the named-entity
# recogniser are switched off: neither is needed to produce lemmas, and
# skipping them makes cleaning a large corpus considerably cheaper.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# One or more consecutive "noise" units: a masked date (XX/XX/<4 digits>), a
# masked field (XXXX or XX), any non-word character, or any digit. The group is
# non-capturing because the match is only ever replaced, never inspected.
pattern = re.compile(r'(?:XX/XX/\d{4}|XXXX|XX|\W|\d)+')

class CustomTokenizerExample():
    """Holder for the text-cleaning routine applied to each complaint narrative."""

    def text_data_cleaning(self, text):
        """Return ``text`` reduced to the lemmas of its non-stop-word tokens.

        Steps, in order:
          1. every run matched by the module-level ``pattern`` is replaced by a
             single space;
          2. the result is lower-cased and stripped of surrounding whitespace;
          3. the text is run through the module-level spaCy pipeline ``nlp``;
          4. tokens flagged as stop words are dropped and the remaining lemmas
             are joined with single spaces.

        Parameters
        ----------
        text : str
            Raw document text.

        Returns
        -------
        str
            Space-separated lemmas (an empty string when nothing survives).
        """
        without_noise = pattern.sub(" ", text)
        normalised = without_noise.lower().strip()

        kept_lemmas = []
        for tok in nlp(normalised):
            if tok.is_stop:
                continue
            kept_lemmas.append(tok.lemma_)

        return ' '.join(kept_lemmas)

In [5]:
# Instance whose text_data_cleaning method performs the per-document cleaning.
token = CustomTokenizerExample()

# TF-IDF Vectorizer

In [6]:
# Features are the complaint narratives; the target is the category label.
X, y = df['Consumer complaint narrative'], df['Category']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [8]:
# Sanity-check the sizes of the four partitions (train/test features, then labels).
print(*[part.shape for part in (X_train, X_test, y_train, y_test)])

(285820,) (71456,) (285820,) (71456,)


In [9]:
from sklearn.utils import class_weight
import numpy as np

# Calculate "balanced" class weights from the training labels.
classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
# Key each weight by its actual class label. The weights come back in the same
# order as `classes`, so pairing the two is correct for any label set, whereas
# enumerate() is only right when the labels happen to be exactly 0..n-1.
# .tolist() yields plain Python scalars so the printed dict stays readable.
class_weights_dict = dict(zip(classes.tolist(), class_weights.tolist()))
print(class_weights_dict)

{0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0}


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features capped at 3000 terms; every document is first passed through
# the custom cleaning routine.
tfidf = TfidfVectorizer(
    max_features=3000,
    preprocessor=token.text_data_cleaning,
)

# Adding more algorithms

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 50 trees and a fixed seed.
rf = RandomForestClassifier(
    random_state=2,
    n_estimators=50,
)

In [12]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble with 50 trees and a fixed seed.
etc = ExtraTreesClassifier(
    random_state=2,
    n_estimators=50,
)

In [13]:
from xgboost import XGBClassifier

# Gradient-boosted trees (XGBoost) with 50 estimators and a fixed seed.
xgb = XGBClassifier(
    random_state=2,
    n_estimators=50,
)

In [14]:
from sklearn.ensemble import VotingClassifier

# Combine the three tree-based models into a single ensemble that uses
# hard voting.
voting_classifier = VotingClassifier(
    voting='hard',
    estimators=[('etc', etc), ('rf', rf), ('xgb', xgb)],
)

In [15]:
from sklearn.pipeline import Pipeline

# End-to-end model: TF-IDF vectorisation followed by the voting ensemble.
pipeline = Pipeline(steps=[
    ('tfidf', tfidf),
    ('clf', voting_classifier),
])

In [16]:
# Fit the vectorizer and the ensemble on the training partition.
pipeline.fit(X_train,y_train)

In [19]:
# Predict labels for the held-out test partition.
y_pred = pipeline.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Confusion matrix on the test set (rows: true labels, columns: predicted labels).
print(confusion_matrix(y_test, y_pred))

[[15476  1210   889   289]
 [ 1501 15059  1012   292]
 [ 1001   928 15611   324]
 [  291   165   600 16808]]


In [21]:
# Per-class precision, recall, F1 and support on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86     17864
           1       0.87      0.84      0.85     17864
           2       0.86      0.87      0.87     17864
           3       0.95      0.94      0.94     17864

    accuracy                           0.88     71456
   macro avg       0.88      0.88      0.88     71456
weighted avg       0.88      0.88      0.88     71456



In [22]:
from sklearn.metrics import precision_score    
# Report test-set precision under each averaging scheme, in the order
# macro, micro, weighted (one value per line).
for averaging in ('macro', 'micro', 'weighted'):
    print(precision_score(y_test, y_pred, average=averaging))

0.8813235987468875
0.8810176892073444
0.8813235987468875


In [23]:
# Spot check: this narrative is expected to be class 0 (credit), but the
# pipeline predicts class 2.
pipeline.predict(["""received update request info saw charge called rude rep said nothing one explained anything also explained demanded payment holding truck hostage made u sign blank invoice let truck go pay provide service demanded money prove fix truck show look like capital one support fraudulent activity everything providing ability explain situation needed waited time charged back story letter sent thing capital one everything provide ability respond receive mail email call option check claim status online credit posted today"""])

array([2], dtype=int64)

In [24]:
# Spot check: this narrative is expected to be class 3 (mortgage), but the
# pipeline predicts class 2.
pipeline.predict(["""credit score sure government declared national replacement general forbearance acceptance choice say yes talked ed finanancial service okay explanation immediate drop credit score would would evermore sane rule explainable mean thing paying surely surely credit notation say good payer partially ruined credit stated way resolved untap world commentary yeah know score involuntarily ask moneyback service service make sure ed financial report go without purpose year associated worker year year exactly another year supposed span started certification kept professional development also much partnership forth anyone yet declaring sore applied acceptance prepay ed financial without ed financial notation every moment year would suggest government please pay loan already paid come would also like total refund immediately refunded advantage resonations thank anytime contact info respond work write viewpionts personal blog donation coupon"""])

array([2], dtype=int64)

In [25]:
# Spot check: this narrative is expected to be class 0, and the pipeline
# predicts class 0.
pipeline.predict(["""call received daughter confirm opening credit card daughter stated knew nothing given phone number give call confirm applying credit card received call daughter given number call upon calling stated request credit card opened name lady attending call stated card could cancelled time needed wait day cancel card invoice sent house card supposed cancelled call company advise request credit card told card would cancelled received court summon default account pretrial pretrial attended son translation purpose advised person bank account opened fraudulently disposition provided voluntary dismissal cpi day understanding meant wanted fraudulent transaction removed credit report believing case dismissed daughter file dispute removed receive new court summon mediation citibank regarding fraudulent account attend mediation son translation purpose attorney citibank state investigation done person opened account never responded invoice called let know fraudulent activity advised man called number given daughter confirm opening account stated called invoice reached house state account opened provide citibank time correct phone number address yet person say account opened wrong information person citibank would let u know account opened person online document used prove identity call daughter verify opening account citibank person want verify information mediation confirm identity passport driver license social security card available review matter account balance used tractor supply company according invoice court document trial date pending afford attorney would like resolve matter would like identity flagged fraudulent activity"""])

array([0], dtype=int64)

In [26]:
import joblib
# Persist the fitted pipeline to disk.
# NOTE(review): the pipeline's vectorizer holds token.text_data_cleaning, a
# method of a class defined in this notebook that reads the notebook-level
# `nlp` and `pattern` objects. Whoever loads this file presumably needs the
# same class and globals defined first — confirm before deploying.
joblib.dump(pipeline,'ensemble_TFIDF.pkl')

['ensemble_TFIDF.pkl']