In [15]:
#import packages
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer

#import nltk
#nltk.download("wordnet")

In [16]:
#import passports
passPort = pd.read_excel('\\\\ad.ing.net\\WPS\\NL\\P\\GD\\012223\\01 MARKETING INTELLIGENCE\\AA_Projects\\201808_passportNPL_Analysis\\passport.xlsx')

In [17]:
#extract and filter purpose and success
purpose = passPort[['Purpose']]
purpose = purpose[~purpose['Purpose'].isnull()]
success = passPort[['Success']]
success = success[~success['Success'].isnull()]

In [18]:
#Pre-processing dataset
trainPurpose = purpose
trainSuccess = success

#create all lowercase
trainPurpose['Purpose'] = trainPurpose['Purpose'].apply(lambda x: " ".join(x.lower() for x in x.split()))
trainSuccess['Success'] = trainSuccess['Success'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#remove punctuation
trainPurpose['Purpose'] = trainPurpose['Purpose'].str.replace('[^\w\s]','')
trainSuccess['Success'] = trainSuccess['Success'].str.replace('[^\w\s]','')

#remove stopwords/commonly occurring words
stop = stopwords.words('english')
trainPurpose['Purpose'] = trainPurpose['Purpose'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
trainSuccess['Success'] = trainSuccess['Success'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# Spelling correction
#from textblob import TextBlob
#train['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

#lemmatization [convert words to its stem]
trainPurpose['Purpose'] = trainPurpose['Purpose'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
trainSuccess['Success'] = trainSuccess['Success'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# Frequently used words
freqT10_purpose = pd.Series(' '.join(trainPurpose['Purpose']).split()).value_counts()[:10]
freqT10_success = pd.Series(' '.join(trainSuccess['Success']).split()).value_counts()[:10]
#remove these words
#freqT10_purpose = list(freqT10_purpose.index)
#freqT10_success = list(freqT10_success.index)
#trainPurpose['Purpose'] = trainPurpose['Purpose'].apply(lambda x: " ".join(x for x in x.split() if x not in freqT10_purpose))
#trainSuccess['Success'] = trainSuccess['Success'].apply(lambda x: " ".join(x for x in x.split() if x not in freqT10_success))

# Rare used words
freqTmin10_purpose = pd.Series(' '.join(trainPurpose['Purpose']).split()).value_counts()[-10:]
freqTmin10_success = pd.Series(' '.join(trainSuccess['Success']).split()).value_counts()[-10:]
#remove these words
#freqTmin10_purpose = list(freqTmin10_purpose.index)
#freqTmin10_success = list(freqTmin10_success.index)
#trainPurpose['Purpose'] = trainPurpose['Purpose'].apply(lambda x: " ".join(x for x in x.split() if x not in freqTmin10_purpose))
#trainSuccess['Success'] = trainSuccess['Success'].apply(lambda x: " ".join(x for x in x.split() if x not in freqTmin10_success))

In [19]:
#Create N grams purpose
df = trainPurpose[['Purpose']]

for run in range(1,6) :

    word_vectorizer = CountVectorizer(ngram_range=(run,run+1), analyzer='word')

    sparse_matrix = word_vectorizer.fit_transform(df['Purpose'])

    frequencies = sum(sparse_matrix).toarray()[0]

    dfOut = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])

    dfOut = dfOut.sort_values(by=['frequency'], ascending=False)

    dfOut = dfOut.reset_index()
    if run == 1:
        nGramResultsPurpose = dfOut.iloc[[0,1,2,3,4],[0,1]]
    else:    
        nGramResultsPurpose = pd.concat([nGramResultsPurpose, dfOut.iloc[[0,1,2,3,4],[0,1]]], axis=1, sort=False)

In [20]:
#Create N grams purpose
df = trainSuccess[['Success']]

for run in range(1,6) :

    word_vectorizer = CountVectorizer(ngram_range=(run,run+1), analyzer='word')

    sparse_matrix = word_vectorizer.fit_transform(df['Success'])

    frequencies = sum(sparse_matrix).toarray()[0]

    dfOut = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])

    dfOut = dfOut.sort_values(by=['frequency'], ascending=False)

    dfOut = dfOut.reset_index()
    if run == 1:
        nGramResultsSuccess = dfOut.iloc[[0,1,2,3,4],[0,1]]
    else:    
        nGramResultsSuccess = pd.concat([nGramResultsSuccess, dfOut.iloc[[0,1,2,3,4],[0,1]]], axis=1, sort=False)

In [21]:
print('--- Size Purpose ---')
print(trainPurpose.shape[0])
print('--- Size Success ---')
print(trainSuccess.shape[0])

--- Size Purpose ---
1834
--- Size Success ---
1642


In [22]:
nGramResultsPurpose

Unnamed: 0,index,frequency,index.1,frequency.1,index.2,frequency.2,index.3,frequency.3,index.4,frequency.4
0,relationship,610,relationship bank,77,top tier lender,40,become top tier lender,28,search opportunity explore refinancing hedging,26
1,bank,529,financial market,68,access strategic discussion,33,continue support client financing,26,acquisition search opportunity explore refinan...,26
2,client,471,top tier,67,become top tier,30,search opportunity explore refinancing,26,acquisition search opportunity explore refinan...,26
3,business,378,share wallet,66,greater share wallet,28,search opportunity explore refinancing hedging,26,tuckin acquisition search opportunity explore ...,25
4,product,314,strategic dialogue,61,become top tier lender,28,acquisition search opportunity explore refinan...,26,support client financing tuckin acquisition,25


In [23]:
nGramResultsSuccess

Unnamed: 0,index,frequency,index.1,frequency.1,index.2,frequency.2,index.3,frequency.3,index.4,frequency.4
0,fm,350,share wallet,78,fair share wallet,47,achieve fair share wallet,27,fair share wallet ie fee paid,25
1,relationship,321,fair share,52,increase lending commitment,28,pitch lead financing company,26,pitch lead financing company eventually,25
2,business,313,fm business,47,achieve fair share,27,fee paid lender increase,25,fee paid lender increase lending,25
3,bank,281,fair share wallet,47,achieve fair share wallet,27,lender increase lending commitment,25,fee paid lender increase lending commitment,25
4,client,275,success would,44,pitch lead financing company,26,achieve fair share wallet ie,25,fair share wallet ie fee,25
