In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df=pd.read_excel("/content/BankReviews.xlsx")

In [10]:
df.sample(10)

Unnamed: 0,Date,Stars,Reviews,BankName
211,2017-01-26,5,_x000D_\nFred was awesome to work with! He is ...,North American Savings Bank
321,2017-02-10,5,"_x000D_\nI worked with Adan Munoz, and the loa...",LoanSnap
2,2017-08-21,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage
224,2016-06-26,1,_x000D_\ncan someone explain why the APR is mo...,North American Savings Bank
238,2016-11-13,1,_x000D_\nWhere as we did not end up taking a l...,North American Savings Bank
30,2016-07-18,5,Jon Barrett was a great asset to my husband an...,North American Savings Bank
7,2016-08-16,5,Patrick answered all my questions by email imm...,Wyndham Capital Mortgage
0,2017-04-10,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage
448,2017-04-10,5,_x000D_\nIt had been ten years since I bought ...,North American Savings Bank
294,2016-06-11,5,_x000D_\nI worked with Steven Castaneda a few ...,Triumph Lending


In [5]:
df.shape

(505, 4)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      505 non-null    datetime64[ns]
 1   Stars     505 non-null    int64         
 2   Reviews   505 non-null    object        
 3   BankName  505 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 15.9+ KB


In [7]:
### Drop Bankname and date columns
banks=df.drop(["BankName","Date"],axis=1)

In [8]:
banks.shape

(505, 2)

In [9]:
### Check whether there are missing values
banks.isnull().sum()

Unnamed: 0,0
Stars,0
Reviews,0


In [11]:
### Data Preparation(Data Cleaning)(Text Preprocessing)

#### Case Conversion(Change into lower case reduce the size of the vocabulary of text data)

def text_lowercase(text):
  return text.lower()

In [12]:
banks["Reviews_clean"]=banks["Reviews"].apply(text_lowercase)

In [13]:
banks.head()

Unnamed: 0,Stars,Reviews,Reviews_clean
0,5,"Great job, Wyndham Capital! Each person was pr...","great job, wyndham capital! each person was pr..."
1,5,Matthew Richardson is professional and helpful...,matthew richardson is professional and helpful...
2,5,We had a past experience with Wyndham Mortgage...,we had a past experience with wyndham mortgage...
3,5,We have been dealing with Brad Thomka from the...,we have been dealing with brad thomka from the...
4,5,I can't express how grateful I am for the supp...,i can't express how grateful i am for the supp...


In [15]:
### Remove the punctuation marks
import string
banks['Reviews_clean'] = banks['Reviews_clean'].str.replace(r'[' + string.punctuation + r']', '', regex=True)

In [18]:
banks.head(25)

Unnamed: 0,Stars,Reviews,Reviews_clean
0,5,"Great job, Wyndham Capital! Each person was pr...",great job wyndham capital each person was prof...
1,5,Matthew Richardson is professional and helpful...,matthew richardson is professional and helpful...
2,5,We had a past experience with Wyndham Mortgage...,we had a past experience with wyndham mortgage...
3,5,We have been dealing with Brad Thomka from the...,we have been dealing with brad thomka from the...
4,5,I can't express how grateful I am for the supp...,i cant express how grateful i am for the suppo...
5,5,I had the pleasure of working with Wyndham Cap...,i had the pleasure of working with wyndham cap...
6,5,My experience with Mattison was beyond greatly...,my experience with mattison was beyond greatly...
7,5,Patrick answered all my questions by email imm...,patrick answered all my questions by email imm...
8,5,I loved working with this group of people! The...,i loved working with this group of people they...
9,5,Great web interface for both the loan applicat...,great web interface for both the loan applicat...


In [19]:
#### Remove whitespaces
def remove_whitespaces(text):
    return " ".join(text.split())
banks['Reviews_clean']=banks['Reviews_clean'].apply(lambda x:remove_whitespaces(x))

In [20]:
banks.head()

Unnamed: 0,Stars,Reviews,Reviews_clean
0,5,"Great job, Wyndham Capital! Each person was pr...",great job wyndham capital each person was prof...
1,5,Matthew Richardson is professional and helpful...,matthew richardson is professional and helpful...
2,5,We had a past experience with Wyndham Mortgage...,we had a past experience with wyndham mortgage...
3,5,We have been dealing with Brad Thomka from the...,we have been dealing with brad thomka from the...
4,5,I can't express how grateful I am for the supp...,i cant express how grateful i am for the suppo...


In [21]:
### Tokenization
import re
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

banks['Reviews_clean'] = banks['Reviews_clean'].apply(lambda x: tokenize(x))
banks.head()

Unnamed: 0,Stars,Reviews,Reviews_clean
0,5,"Great job, Wyndham Capital! Each person was pr...","[great, job, wyndham, capital, each, person, w..."
1,5,Matthew Richardson is professional and helpful...,"[matthew, richardson, is, professional, and, h..."
2,5,We had a past experience with Wyndham Mortgage...,"[we, had, a, past, experience, with, wyndham, ..."
3,5,We have been dealing with Brad Thomka from the...,"[we, have, been, dealing, with, brad, thomka, ..."
4,5,I can't express how grateful I am for the supp...,"[i, cant, express, how, grateful, i, am, for, ..."


In [24]:
import nltk

In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [26]:
### remove the stopwords

from nltk.corpus import stopwords
stopword = stopwords.words('english')
def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text

In [27]:
banks['Reviews_clean']=banks['Reviews_clean'].apply(lambda x:remove_stopwords(x))

In [28]:
banks.head()

Unnamed: 0,Stars,Reviews,Reviews_clean
0,5,"Great job, Wyndham Capital! Each person was pr...","[great, job, wyndham, capital, person, profess..."
1,5,Matthew Richardson is professional and helpful...,"[matthew, richardson, professional, helpful, h..."
2,5,We had a past experience with Wyndham Mortgage...,"[past, experience, wyndham, mortgage, would, w..."
3,5,We have been dealing with Brad Thomka from the...,"[dealing, brad, thomka, beginning, started, st..."
4,5,I can't express how grateful I am for the supp...,"[cant, express, grateful, support, zach, provi..."


In [30]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [31]:
### lemmatization

wn=nltk.stem.WordNetLemmatizer()

def lemmatizing(text):
    lemma=[wn.lemmatize(word) for word in text]
    return lemma

banks['Reviews_clean']=banks['Reviews_clean'].apply(lemmatizing)

In [32]:
banks.sample(10)

Unnamed: 0,Stars,Reviews,Reviews_clean
433,5,_x000D_\nAdam was great to work with during ou...,"[x000d, adam, great, work, refi, attention, de..."
167,5,_x000D_\n_x000D_\nJon Barrett was a great asse...,"[x000d, x000d, jon, barrett, great, asset, hus..."
365,5,_x000D_\nGuaranteed Rate provided us a seamles...,"[x000d, guaranteed, rate, provided, u, seamles..."
48,1,"Rep was extremely professional, friendly, and ...","[rep, extremely, professional, friendly, helpf..."
76,5,_x000D_\nThank you for all of your help Kelly!...,"[x000d, thank, help, kelly, patience, kindness..."
187,5,"_x000D_\nFred was very nice, and always availa...","[x000d, fred, nice, always, available, address..."
3,5,We have been dealing with Brad Thomka from the...,"[dealing, brad, thomka, beginning, started, st..."
229,1,_x000D_\nIt all started when Bob G ran a credi...,"[x000d, started, bob, g, ran, credit, check, w..."
46,1,It all started when Bob G ran a credit check w...,"[started, bob, g, ran, credit, check, without,..."
320,5,_x000D_\nI had a great experience working with...,"[x000d, great, experience, working, adan, muno..."


In [33]:
word_count={}

for sentence in banks['Reviews_clean']:
    for word in sentence:

        if word not in word_count:
            word_count[word]=1

        else:
            word_count[word]+=1

In [34]:
word_count

{'great': 139,
 'job': 21,
 'wyndham': 16,
 'capital': 12,
 'person': 17,
 'professional': 70,
 'helped': 35,
 'u': 207,
 'move': 12,
 'refinance': 70,
 'process': 248,
 'smoothly': 14,
 'thank': 50,
 'matthew': 3,
 'richardson': 1,
 'helpful': 51,
 'find': 19,
 'correct': 1,
 'product': 13,
 'mortgage': 151,
 'much': 38,
 'excellent': 29,
 'service': 103,
 'past': 13,
 'experience': 110,
 'would': 246,
 'without': 29,
 'question': 106,
 'use': 48,
 'needed': 29,
 'went': 70,
 'beyond': 25,
 'extra': 19,
 'mile': 8,
 'right': 26,
 'wrong': 7,
 'encountered': 5,
 'servicer': 3,
 'dealing': 16,
 'previous': 13,
 'loan': 326,
 'pulled': 4,
 'together': 8,
 'found': 16,
 'viable': 3,
 'option': 31,
 'ultimately': 5,
 'saved': 7,
 'money': 22,
 'highly': 62,
 'recommend': 150,
 'brad': 7,
 'thomka': 3,
 'team': 112,
 'need': 41,
 'sincerest': 2,
 'thanks': 61,
 'ed': 2,
 'lind': 1,
 'beginning': 32,
 'started': 17,
 'stressful': 24,
 'time': 217,
 'help': 38,
 'entire': 47,
 'turned': 3,
 '

In [39]:
df=pd.DataFrame(word_count.items(), columns=['word','word_count'])
df=df.sort_values('word_count', ascending=False).reset_index(drop=True)
df.head()

Unnamed: 0,word,word_count
0,x000d,715
1,loan,326
2,process,248
3,would,246
4,time,217


In [40]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [41]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [42]:
analyser=SentimentIntensityAnalyzer()

def sentiment_pos(sentence):
    sent=analyser.polarity_scores(sentence)
    return sent['pos']

def sentiment_neg(sentence):
    sent=analyser.polarity_scores(sentence)
    return sent['neg']

In [43]:
df['positive']=df['word'].apply(sentiment_pos)
df['negative']=df['word'].apply(sentiment_neg)

In [44]:
df.sample(25)

Unnamed: 0,word,word_count,positive,negative
549,debited,7,0.0,0.0
94,much,38,0.0,0.0
1553,impressive,2,1.0,0.0
420,voicemail,10,0.0,0.0
1419,mcclung,2,0.0,0.0
1526,transition,2,0.0,0.0
2069,knowledable,1,0.0,0.0
1521,avoid,2,0.0,1.0
356,move,12,0.0,0.0
69,cost,46,0.0,0.0
