In [1]:
import csv                               # csv reader
import pandas as pd
import re                                       # regular expressions
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier

# To do preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support, accuracy_score # to report on precision and recall
import numpy as np # To compute the average results

from random import shuffle # To shuffle the dataset


# To use feature selection in the Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path_of_file = '/content/drive/MyDrive/Colab Notebooks/Logically.ai/complaints.csv'

In [4]:
columns = ["Complaint ID", "Company", "Issue", "Consumer complaint narrative", "Sub-product", "Product"]
df = pd.read_csv(path_of_file, usecols= columns)

In [5]:
df.head()

Unnamed: 0,Product,Sub-product,Issue,Consumer complaint narrative,Company,Complaint ID
0,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,,CAPITAL ONE FINANCIAL CORPORATION,3274605
1,Vehicle loan or lease,Loan,Struggling to pay your loan,I contacted Ally on Friday XX/XX/XXXX after fa...,ALLY FINANCIAL INC.,3425257
2,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",3198225
3,Credit card or prepaid card,General-purpose credit card or charge card,Fees or interest,I have 2 Capital One credit cards and My wife ...,CAPITAL ONE FINANCIAL CORPORATION,3202016
4,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,,"EQUIFAX, INC.",3206138


## check for Null values

In [6]:
df.isnull().values.any()

True

In [7]:
nan_values = df.isna()
nan_columns = nan_values.any()
columns_with_nan = df.columns[nan_columns].tolist()
print(columns_with_nan)

['Sub-product', 'Consumer complaint narrative']


## here because its a string data so it is filled by empty string

In [8]:
df.fillna('', inplace=True)
df.isnull().values.any()

False

## Checking class imbalance

In [9]:
df['Product'].value_counts()

Credit reporting, credit repair services, or other personal consumer reports    837147
Debt collection                                                                 382216
Mortgage                                                                        337857
Credit reporting                                                                140432
Credit card or prepaid card                                                     122003
Checking or savings account                                                     101178
Credit card                                                                      89190
Bank account or service                                                          86206
Student loan                                                                     63174
Money transfer, virtual currency, or money service                               32982
Consumer Loan                                                                    31602
Vehicle loan or lease                      

In [10]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['Product'] = le.fit_transform(df['Product'])
# print(le.inverse_transform(1))

In [11]:
df.head()

Unnamed: 0,Product,Sub-product,Issue,Consumer complaint narrative,Company,Complaint ID
0,6,Credit reporting,Incorrect information on your report,,CAPITAL ONE FINANCIAL CORPORATION,3274605
1,16,Loan,Struggling to pay your loan,I contacted Ally on Friday XX/XX/XXXX after fa...,ALLY FINANCIAL INC.,3425257
2,6,Credit reporting,Incorrect information on your report,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",3198225
3,4,General-purpose credit card or charge card,Fees or interest,I have 2 Capital One credit cards and My wife ...,CAPITAL ONE FINANCIAL CORPORATION,3202016
4,6,Credit reporting,Incorrect information on your report,,"EQUIFAX, INC.",3206138


## method for Loading the data and splitting the data

In [12]:
def loadData(dataframe):
  for index, row in dataframe.iterrows():
    (Id, company, issue, complaint_narative, sub_product, product) = parseReviewImproved(row)
    rawData.append((Id, company, issue, complaint_narative, sub_product, product))


def parseReviewImproved(reviewLine):
    Id = int(reviewLine['Complaint ID'])
    company = reviewLine['Company']
    issue = reviewLine['Issue']
    # sub_issue = reviewLine['Sub-issue']
    complaint_narative = reviewLine['Consumer complaint narrative']
    sub_product = reviewLine['Sub-product']
    product = reviewLine['Product']
    return (Id, company, issue, complaint_narative, sub_product, product)

def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, company, issue, complaint_narative, sub_product, product) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(company, sub_product, preProcess(issue), preProcess(complaint_narative)),product))
    for (_, company, issue, complaint_narative, sub_product, product) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(company, sub_product, preProcess(issue), preProcess(complaint_narative)),product))

## converting tokens to feature vectore

In [14]:
featureDict = {} # the global feature dictionary

def toFeatureVector(company, sub_product, issue_tokens, complaint_tokens):

    # return a dictionary 'featureVect' where the keys are the tokens in 'words' and the values are the number of occurrences of the tokens
    # start by using binary values only:
    # baseDict = {}
    featureVec = {}

    for w in issue_tokens:
      try:
          featureVec[w] += 1.0/len(issue_tokens)
      except KeyError:
          featureVec[w] = 1.0/len(issue_tokens)
      try:
          featureDict[w] += 1.0/len(issue_tokens)
      except KeyError:
          featureDict[w] = 1.0/len(issue_tokens)

        
    for w in complaint_tokens:
      try:
          featureVec[w] += 1.0/len(complaint_tokens)
      except KeyError:
          featureVec[w] = 1.0/len(complaint_tokens)
      try:
          featureDict[w] += 1.0/len(complaint_tokens)
      except KeyError:
          featureDict[w] = 1.0/len(complaint_tokens)

    
    featureVec['COMPANY:'+str(company).lower()] = 1.0 # 0.5
    featureVec['SUB:'+str(sub_product).lower()] = 1.0 #0.3

    try:
        featureDict['COMPANY:'+str(company)] += 1.0
    except KeyError:
        featureDict['COMPANY:'+str(company)] = 1.0

    try:
        featureDict['SUB:'+str(sub_product)] += 1.0
    except KeyError:
        featureDict['SUB:'+str(sub_product)] = 1.0

    
    return featureVec

## Preprocessing the Data and the complaints and making it simple

In [15]:
def preProcess(text):
    # should return a list of tokens
    
    # word tokenisation, including punctuation removal
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    
    # lowercasing
    tokens = [t.lower() for t in tokens]

    # stopword removal- benefits are it removes rare words, though bad for bigram relations
    if True:
        stop = set(stopwords.words('english'))
        tokens = [t for t in tokens if t not in stop]
    
    # lemmatisation
    if True:
        lemmatiser = WordNetLemmatizer()
        tokens = [lemmatiser.lemmatize(t) for t in tokens]
        
    # stemming- works well with stop word remove
    if False:
        p_stemmer = PorterStemmer()
        tok = []
        for t in tokens:
            tok.append(p_stemmer.stem(t))
        tokens = tok
        
    tokens = [t for t in tokens if t] # ensure no empty space
    
    return tokens


In [16]:
print(preProcess("Dear Consumer Financial Protection Bureau Someone had fraudulently created a credit card account with XXXX XXXX and charged over {$5200.00} under my name and this amount has been over due for several months now. I do not bank with XXXX XXXX. "))

['dear', 'consumer', 'financial', 'protection', 'bureau', 'someone', 'fraudulently', 'created', 'credit', 'card', 'account', 'xxxx', 'xxxx', 'charged', '5200', '00', 'name', 'amount', 'due', 'several', 'month', 'bank', 'xxxx', 'xxxx']


In [17]:

# loading reviews
rawData = [] 
trainData = [] 
testData = [] 
# do the actual stuff
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
print("Preparing the dataset...")
loadData(df)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
print("Preparing training and test data...")
splitData(0.8)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))



Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 2286328 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 2286328 rawData, 1829062 trainData, 457266 testData


# defining out training classifier which is linearSVC with Tfidf Transformer

In [18]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('tfidf', TfidfTransformer()),('svc', LinearSVC(loss = 'hinge'))])
    return SklearnClassifier(pipeline).train(trainData)

# method for Predicting the labels

In [19]:
def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

# method for cross validation to 10 folds

In [20]:
def crossValidate(dataset, folds):
    shuffle(dataset) # this shuffles the dataset
    cv_results = [] # an empty list for storing the results
    accuracy = []
    foldSize = int(len(dataset)/folds) # foldsize is the size we get dividing the length of dataset by number of folds
    
    precision = recall = fscore = 0 # variables to store the average result
    
    for i in range(0,len(dataset),foldSize):
        # spitting the dataset in training and validation set
        trainingData = dataset[0:i] + dataset[i+foldSize:]
        validationData = dataset[i:i+foldSize]
        # training the classifier on the split training data
        classifier = trainClassifier(trainingData)
        # this variable stores the label for final scores
        testTrue = [label[1] for label in validationData]
        # predicting labels on the validation data
        testPred = predictLabels(validationData, classifier)
        # finalscores stores the value for precision recall fscore and accuracy
        finalScores = precision_recall_fscore_support(testTrue, testPred, average = 'weighted')
        accuracy.append(accuracy_score(testTrue, testPred))
        cv_results.append(finalScores)
        
    print('Cross validation done on the dataset and metrics scores are')
    
    #this for loop block takes the mean of precision, recall, fscore for 10 folds of training and validation data
    for scores in cv_results:
        precision += scores[0]
        recall += scores[1]
        fscore += scores[2]
        
    cv_results = (precision/len(cv_results), recall/len(cv_results), fscore/len(cv_results))
    accuracy_result = sum(accuracy)/ len(accuracy) 
    return cv_results, accuracy_result

In [21]:
cv_results, accuracy = crossValidate(trainData, 10)
print("Precision: %f\nRecall: %f\nF Score:%f" % cv_results[:3])
print("accuracy from cross validation is: ", accuracy)

Training Classifier...


  _warn_prf(average, modifier, msg_start, len(result))


Training Classifier...




Training Classifier...


  _warn_prf(average, modifier, msg_start, len(result))


Training Classifier...




Training Classifier...




Training Classifier...




Training Classifier...


  _warn_prf(average, modifier, msg_start, len(result))


Training Classifier...




Training Classifier...


  _warn_prf(average, modifier, msg_start, len(result))


Training Classifier...




Training Classifier...
Cross validation done on the dataset and metrics scores are
Precision: 0.996608
Recall: 0.996528
F Score:0.996497
accuracy from cross validation is:  0.9965277743262064




## accuracy on the Test Data

In [22]:
print(testData[0])   # have a look at the first test data instance
classifier = trainClassifier(trainData)  # train the classifier
testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # evaluate
accuracy = accuracy_score(testTrue, testPred)
print("Done training!")
print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])
print("accuracy om Test Data is: ", accuracy)


({'loan': 0.25, 'modification': 0.25, 'collection': 0.25, 'foreclosure': 0.25, 'COMPANY:ocwen financial corporation': 1.0, 'SUB:other mortgage': 1.0}, 10)
Training Classifier...




Done training!
Precision: 0.995818
Recall: 0.994994
F Score:0.995180
accuracy om Test Data is:  0.9949941609478946
