In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.preprocessing import LabelEncoder

from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC,OneClassSVM

# Approach Explanation

Sentence-BERT (SBERT) is a modification of the pretrained BERT network that uses siamese and triplet network structures to derive semantically meaningful sentence embeddings that can be compared using cosine similarity.

1. Documents are tokenized into sentences.
2. Sentence embeddings are extracted for each sentence in a document using the Sentence Transformer model paraphrase-MiniLM-L6-v2.<br>
    <b>Note:</b> The embedding size is 384 for each sentence.
3. The sentence embeddings are averaged to create a document embedding.
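4. The document embeddings of the target class (selected via the label) are used to train a One-Class SVM model; the labels of the test set are used only to evaluate its predictions.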

In [2]:
# Load the training complaints; the first CSV column becomes the DataFrame index.
train_data = pd.read_csv('train_dataset_multiclass.csv', index_col=0)

# Report the frame's dimensions and the per-issue class counts, separated by a blank line.
issue_counts = train_data['Issue'].value_counts()
print(train_data.shape, issue_counts, sep="\n\n")
train_data.head()

(5908, 2)

Problem with a purchase shown on your statement    1993
Incorrect information on your report               1191
Managing an account                                 935
Fees or interest                                    904
Other features, terms, or problems                  885
Name: Issue, dtype: int64


Unnamed: 0,Consumer complaint narrative,Issue
57519,The Citibank Credit Card company is a ridiculo...,Problem with a purchase shown on your statement
2790,Hi I went to a local bar on the evening of XX/...,Problem with a purchase shown on your statement
54277,I transferred XXXX ThankYou Points ( Citibank ...,"Other features, terms, or problems"
587,I was in XXXX XXXX and used my CITIBANK credit...,Problem with a purchase shown on your statement
12420,I clicked a promotion on citibank 's website t...,Managing an account


In [3]:
# Load the held-out test complaints; the first CSV column becomes the DataFrame index.
test_data = pd.read_csv('test_dataset_multiclass.csv', index_col=0)

# Dimensions first, an empty line, then the per-issue class counts.
print(test_data.shape)
print()
print(test_data['Issue'].value_counts())
test_data.head()

(2532, 2)

Problem with a purchase shown on your statement    854
Incorrect information on your report               510
Managing an account                                401
Fees or interest                                   388
Other features, terms, or problems                 379
Name: Issue, dtype: int64


Unnamed: 0,Consumer complaint narrative,Issue
17357,"Back in 2015, we noticed a charge on our Macy ...",Fees or interest
6524,"- On XX/XX/XXXX, I purchased 1 ticket XXXX {$3...",Problem with a purchase shown on your statement
23689,I hope all is well ... \nThe reason why Im wri...,Fees or interest
6960,Citibank had requested some XXXX documents ear...,Managing an account
7204,On XX/XX/2021 I authorized a payment of amount...,Managing an account


In [4]:
# Pretrained sentence-transformers checkpoint used to embed every sentence.
SBERT_MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(SBERT_MODEL_NAME)

In [5]:
def _mean_sentence_embedding(narrative):
    """Return the average of the sentence embeddings of one complaint narrative.

    The text is split into sentences with NLTK's ``sent_tokenize``, every
    sentence is encoded by ``model``, and the resulting vectors are averaged
    along axis 0 to give a single document vector.
    """
    sentence_vectors = model.encode(sent_tokenize(narrative))
    return np.mean(sentence_vectors, axis=0)


train_data['Embeddings'] = train_data['Consumer complaint narrative'].apply(_mean_sentence_embedding)

In [6]:
# Binary target: 1 for the purchase-statement issue, 0 for every other issue.
train_data['Label'] = (
    train_data['Issue']
    .eq('Problem with a purchase shown on your statement')
    .astype(int)
)

In [7]:
# Class balance of the binary target in the training set.
train_data['Label'].value_counts()

0    3915
1    1993
Name: Label, dtype: int64

In [8]:
# One document vector per test complaint: encode each sentence of the
# narrative, then average the sentence vectors along axis 0.
test_narratives = test_data['Consumer complaint narrative']
test_data['Embeddings'] = test_narratives.apply(
    lambda narrative: np.mean(model.encode(sent_tokenize(narrative)), axis=0)
)

In [9]:
# Binary target for evaluation: 1 for the purchase-statement issue, 0 otherwise.
is_purchase_issue = test_data['Issue'] == 'Problem with a purchase shown on your statement'
test_data['Label'] = is_purchase_issue.astype(int)

In [10]:
# Class balance of the binary target in the test set.
test_data['Label'].value_counts()

0    1678
1     854
Name: Label, dtype: int64

In [11]:
# Keep only the positive-class rows (Label == 1) as the one-class training set.
positive_mask = train_data['Label'].eq(1)
one_class_train_data = train_data.loc[positive_mask]

In [12]:
# Preview the first rows of the positive-class training subset.
one_class_train_data.head()

Unnamed: 0,Consumer complaint narrative,Issue,Embeddings,Label
57519,The Citibank Credit Card company is a ridiculo...,Problem with a purchase shown on your statement,"[-0.2623762, -0.222208, -0.35714966, 0.0858370...",1
2790,Hi I went to a local bar on the evening of XX/...,Problem with a purchase shown on your statement,"[-0.09492934, 0.12373071, -0.19660375, -0.0297...",1
587,I was in XXXX XXXX and used my CITIBANK credit...,Problem with a purchase shown on your statement,"[-0.2384191, -0.1726675, -0.1371744, 0.1643079...",1
6618,XXXX XXXX XXXX XXXX XXXX XXXX ILLINOIS XXXX C...,Problem with a purchase shown on your statement,"[-0.31163684, -0.031842433, -0.21842934, 0.134...",1
14373,The account had two pending promotional offers...,Problem with a purchase shown on your statement,"[-0.24707998, 0.09534607, 0.07385148, -0.07841...",1


In [13]:
# Stack the per-document embedding vectors into a 2-D feature matrix (one row
# per complaint, one column per embedding dimension) and keep the original row
# index. np.vstack builds the matrix in one call instead of constructing a
# pandas Series for every row, which is what ``.apply(pd.Series)`` does.
one_class_X_train = pd.DataFrame(
    np.vstack(one_class_train_data['Embeddings'].tolist()),
    index=one_class_train_data.index,
)

In [14]:
# (rows, columns) of the one-class training feature matrix.
one_class_X_train.shape

(1993, 384)

In [15]:
# One-class SVM fitted on the positive-class embeddings only.
# nu is written out at its scikit-learn default (0.5); per the scikit-learn
# docs it is an upper bound on the fraction of training errors and a lower
# bound on the fraction of support vectors.
# NOTE(review): nu=0.5 is consistent with about half of the training rows
# being predicted as outliers (-1) — consider tuning it.
svc_one_class_model = OneClassSVM(nu=0.5)
svc_one_class_model.fit(one_class_X_train)

OneClassSVM()

In [16]:
# Sanity check on the training set itself: count +1 vs. -1 predictions.
train_predictions = svc_one_class_model.predict(one_class_X_train)
pd.Series(train_predictions).value_counts()

 1    997
-1    996
dtype: int64

In [17]:
# Only test features and labels are built here: the one-class model is fitted
# on the positive-class embeddings alone, so no supervised X_train / y_train
# is required.
# np.vstack assembles the (n_documents, n_dimensions) matrix in one call
# rather than creating one pandas Series per row via ``.apply(pd.Series)``.
X_test = pd.DataFrame(
    np.vstack(test_data['Embeddings'].tolist()),
    index=test_data.index,
)
y_test = test_data['Label']

In [18]:
# Class balance of the evaluation labels.
y_test.value_counts()

0    1678
1     854
Name: Label, dtype: int64

In [19]:
# Score the test embeddings with the fitted one-class model; the raw output
# uses +1 / -1 labels (see the array displayed by this cell).
predictions = svc_one_class_model.predict(X_test)
predictions

array([-1,  1, -1, ...,  1, -1, -1], dtype=int64)

In [20]:
# Recode the -1 predictions as 0 so they share the 0/1 coding of y_test.
predictions = np.where(predictions == -1, 0, predictions)
predictions

array([0, 1, 0, ..., 1, 0, 0], dtype=int64)

In [21]:
# Fraction of test complaints whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

0.6615323854660348

In [22]:
# Per-class precision / recall / F1 on the test set.
report_text = classification_report(y_true=y_test, y_pred=predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.74      0.76      0.75      1678
           1       0.50      0.48      0.49       854

    accuracy                           0.66      2532
   macro avg       0.62      0.62      0.62      2532
weighted avg       0.66      0.66      0.66      2532



In [23]:
# Rows are true labels, columns are predicted labels.
test_confusion = confusion_matrix(y_true=y_test, y_pred=predictions)
print(test_confusion)

[[1267  411]
 [ 446  408]]


In [24]:
# Unpack the binary confusion matrix in row-major order: [[TN, FP], [FN, TP]].
# labels=[0, 1] pins the matrix to 2x2 even when one of the classes never
# appears in y_test or predictions; without it the matrix can be smaller and
# the four-way unpacking raises a ValueError.
TN, FP, FN, TP = confusion_matrix(y_test, predictions, labels=[0, 1]).ravel()

print(TN,FP,FN,TP)

1267 411 446 408


In [25]:
# False-positive rate: share of the actual negatives (TN + FP) that were
# predicted as positive.
actual_negatives = TN + FP
FPR = FP / actual_negatives

FPR

0.24493444576877235