In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC

# Approach Explanation

Sentence-BERT (SBERT) is a modification of the pretrained BERT network that uses siamese and triplet network structures to derive semantically meaningful sentence embeddings that can be compared using cosine similarity.

1. Documents are tokenized into sentences.
2. Sentence embeddings are extracted with the Sentence Transformers model `paraphrase-MiniLM-L6-v2` for each sentence in the documents.<br>
    <b>Note:</b> The embedding size is 384 for each sentence.
3. The sentence embeddings of a document are averaged to create its document embedding.
4. The document embeddings, along with their labels, are used to train an SVM model.

In [2]:
# Load the training split; the first CSV column becomes the row index.
train_data = pd.read_csv('train_dataset.csv', index_col=0)

# Report the frame's shape (followed by a blank line) and the class balance.
print(f"{train_data.shape}\n")
print(train_data['Issue'].value_counts())
train_data.head(n=5)

(3184, 2)

Problem with a purchase shown on your statement    1993
Incorrect information on your report               1191
Name: Issue, dtype: int64


Unnamed: 0,Consumer complaint narrative,Issue
554,On XX/XX/2018 two charges were made at a vendo...,Problem with a purchase shown on your statement
11901,1. A Citibank ( South Dakota ) N.A. charge off...,Incorrect information on your report
17377,I made purchaseson XXXX that was opened and XX...,Problem with a purchase shown on your statement
8681,on XXXX XX/XX/2020 I filed a dispute with Citi...,Problem with a purchase shown on your statement
5644,On XX/XX/XXXX I purchased XXXX XXXX XXXX conce...,Problem with a purchase shown on your statement


In [3]:
# Load the held-out test split; the first CSV column becomes the row index.
test_data = pd.read_csv('test_dataset.csv', index_col=0)

# Report the frame's shape (followed by a blank line) and the class balance.
print(f"{test_data.shape}\n")
print(test_data['Issue'].value_counts())
test_data.head(n=5)

(1364, 2)

Problem with a purchase shown on your statement    854
Incorrect information on your report               510
Name: Issue, dtype: int64


Unnamed: 0,Consumer complaint narrative,Issue
8201,Hello CFPB! Can you please forward the folllow...,Incorrect information on your report
49162,The response from Citi Cards has cleared up a ...,Problem with a purchase shown on your statement
8206,I have made repeated attempts with Macys Credi...,Problem with a purchase shown on your statement
8226,I paid {$120.00} through Citibank credit card ...,Problem with a purchase shown on your statement
57391,At Best Buys request I am submitting documenta...,Incorrect information on your report


In [4]:
# Sentence encoder used to embed every sentence of every complaint.
model = SentenceTransformer(model_name_or_path='paraphrase-MiniLM-L6-v2')

In [5]:
# Document embedding = mean of the sentence embeddings of the narrative.
# If the tokenizer yields no sentences (presumably only for an empty or
# whitespace-only narrative), the raw text is encoded as a single "sentence"
# so that np.mean is never asked to average zero sentence vectors.
train_data['Embeddings'] = train_data['Consumer complaint narrative'].apply(
    lambda text: np.mean(model.encode(sent_tokenize(text) or [text]), axis=0)
)

# Binary target: 1 when the issue is the "purchase shown on your statement"
# class, 0 for any other issue value.
train_data['Label'] = (
    train_data['Issue'] == 'Problem with a purchase shown on your statement'
).astype(int)
train_data.head(2)

Unnamed: 0,Consumer complaint narrative,Issue,Embeddings,Label
554,On XX/XX/2018 two charges were made at a vendo...,Problem with a purchase shown on your statement,"[-0.20680293, 0.06291509, -0.101168156, 0.1477...",1
11901,1. A Citibank ( South Dakota ) N.A. charge off...,Incorrect information on your report,"[-0.18409169, -0.08991251, -0.15696734, 0.0378...",0


In [6]:
# Document embedding = mean of the sentence embeddings of the narrative.
# If the tokenizer yields no sentences (presumably only for an empty or
# whitespace-only narrative), the raw text is encoded as a single "sentence"
# so that np.mean is never asked to average zero sentence vectors.
test_data['Embeddings'] = test_data['Consumer complaint narrative'].apply(
    lambda text: np.mean(model.encode(sent_tokenize(text) or [text]), axis=0)
)

# Binary target: 1 when the issue is the "purchase shown on your statement"
# class, 0 for any other issue value.
test_data['Label'] = (
    test_data['Issue'] == 'Problem with a purchase shown on your statement'
).astype(int)
test_data.head(2)

Unnamed: 0,Consumer complaint narrative,Issue,Embeddings,Label
8201,Hello CFPB! Can you please forward the folllow...,Incorrect information on your report,"[-0.6415856, 0.2658094, -0.07548106, -0.227051...",0
49162,The response from Citi Cards has cleared up a ...,Problem with a purchase shown on your statement,"[-0.18394545, -0.06851138, -0.2390448, 0.05607...",1


In [7]:
# Turn the column of 1-D embedding arrays into a feature matrix with one
# column per embedding dimension. The arrays are stacked once with NumPy
# rather than building a separate pandas Series for every row through
# `.apply(pd.Series)`; the original row index is kept on the result.
X_train = pd.DataFrame(
    np.vstack(train_data['Embeddings'].tolist()),
    index=train_data.index,
)
y_train = train_data['Label']

X_test = pd.DataFrame(
    np.vstack(test_data['Embeddings'].tolist()),
    index=test_data.index,
)
y_test = test_data['Label']

In [8]:
# Support-vector classifier with scikit-learn's default hyper-parameters,
# trained on the document embeddings.
clf = SVC()
clf.fit(X=X_train, y=y_train)

SVC()

In [9]:
# Predicted binary label for every test document.
predictions = clf.predict(X=X_test)

In [10]:
# Overall fraction of test documents classified correctly.
accuracy_score(y_true=y_test, y_pred=predictions)

0.9340175953079178

In [11]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       510
           1       0.95      0.94      0.95       854

    accuracy                           0.93      1364
   macro avg       0.93      0.93      0.93      1364
weighted avg       0.93      0.93      0.93      1364



In [12]:
# Raw confusion matrix for the test split.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[470  40]
 [ 50 804]]


In [13]:
# Unpack the binary confusion matrix in row-major order.
# `labels=[0, 1]` pins the matrix to 2x2 even when only one class occurs in
# `y_test` / `predictions`; without it the matrix could be 1x1 and the
# four-way unpacking below would raise ValueError.
TN, FP, FN, TP = confusion_matrix(y_test, predictions, labels=[0, 1]).ravel()

print(TN, FP, FN, TP)

470 40 50 804


In [14]:
# False-positive rate: false positives divided by all actual negatives (FP + TN).
FPR = FP / (FP + TN)

FPR

0.0784313725490196

In [15]:
# Preview the first five test rows, now including embeddings and labels.
test_data.head(n=5)

Unnamed: 0,Consumer complaint narrative,Issue,Embeddings,Label
8201,Hello CFPB! Can you please forward the folllow...,Incorrect information on your report,"[-0.6415856, 0.2658094, -0.07548106, -0.227051...",0
49162,The response from Citi Cards has cleared up a ...,Problem with a purchase shown on your statement,"[-0.18394545, -0.06851138, -0.2390448, 0.05607...",1
8206,I have made repeated attempts with Macys Credi...,Problem with a purchase shown on your statement,"[-0.32731006, -0.31460646, 0.043683853, -0.072...",1
8226,I paid {$120.00} through Citibank credit card ...,Problem with a purchase shown on your statement,"[-0.31925473, -0.0050348165, -0.0012888648, 0....",1
57391,At Best Buys request I am submitting documenta...,Incorrect information on your report,"[-0.4862429, -0.15408775, -0.0962071, -0.08545...",0


In [16]:
# Take an independent copy of the columns to export. Without `.copy()` the
# column subset is still tied to `test_data`, and assigning a new column to it
# afterwards makes pandas emit SettingWithCopyWarning.
test_data_with_results = test_data[
    ['Consumer complaint narrative', 'Issue', 'Embeddings', 'Label']
].copy()
test_data_with_results.head()

Unnamed: 0,Consumer complaint narrative,Issue,Embeddings,Label
8201,Hello CFPB! Can you please forward the folllow...,Incorrect information on your report,"[-0.6415856, 0.2658094, -0.07548106, -0.227051...",0
49162,The response from Citi Cards has cleared up a ...,Problem with a purchase shown on your statement,"[-0.18394545, -0.06851138, -0.2390448, 0.05607...",1
8206,I have made repeated attempts with Macys Credi...,Problem with a purchase shown on your statement,"[-0.32731006, -0.31460646, 0.043683853, -0.072...",1
8226,I paid {$120.00} through Citibank credit card ...,Problem with a purchase shown on your statement,"[-0.31925473, -0.0050348165, -0.0012888648, 0....",1
57391,At Best Buys request I am submitting documenta...,Incorrect information on your report,"[-0.4862429, -0.15408775, -0.0962071, -0.08545...",0


In [17]:
# Attach the classifier's output (`predictions`, from clf.predict on X_test)
# as a new 'Prediction' column next to the true 'Label'.
test_data_with_results['Prediction'] = predictions

In [18]:
# Inspect the first 40 rows to compare 'Label' against 'Prediction'.
test_data_with_results.head(n=40)

Unnamed: 0,Consumer complaint narrative,Issue,Embeddings,Label,Prediction
8201,Hello CFPB! Can you please forward the folllow...,Incorrect information on your report,"[-0.6415856, 0.2658094, -0.07548106, -0.227051...",0,0
49162,The response from Citi Cards has cleared up a ...,Problem with a purchase shown on your statement,"[-0.18394545, -0.06851138, -0.2390448, 0.05607...",1,1
8206,I have made repeated attempts with Macys Credi...,Problem with a purchase shown on your statement,"[-0.32731006, -0.31460646, 0.043683853, -0.072...",1,1
8226,I paid {$120.00} through Citibank credit card ...,Problem with a purchase shown on your statement,"[-0.31925473, -0.0050348165, -0.0012888648, 0....",1,1
57391,At Best Buys request I am submitting documenta...,Incorrect information on your report,"[-0.4862429, -0.15408775, -0.0962071, -0.08545...",0,0
52,This is regarding dispute numbers XXXX and XXX...,Problem with a purchase shown on your statement,"[-0.3378458, -0.026797455, -0.23113827, 0.1057...",1,1
57396,I was scammed by an online seller and SEARS in...,Problem with a purchase shown on your statement,"[-0.22414061, -0.13224047, -0.145868, 0.152974...",1,1
54,I have charges that I have disputed with CitiB...,Problem with a purchase shown on your statement,"[-0.23780224, 0.00975199, -0.4164698, 0.137707...",1,1
24630,This complaint if being filed on the advice of...,Incorrect information on your report,"[-0.21943046, -0.16149944, 0.04576512, 0.04763...",0,0
24635,I bought a water softener from Home Depot on m...,Incorrect information on your report,"[-0.1407889, -0.0016049871, 0.11041404, 0.0626...",0,0


In [19]:
# Persist the test rows together with their predictions (index included).
test_data_with_results.to_csv(
    path_or_buf='Sentence Bert + SVM Test Prediction Results.csv'
)