In [1]:
import pandas as pd 
import re
import warnings

from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

In [2]:
file = pd.read_csv(r"C:\Users\ravin\Downloads\bank_reviews3.csv")

In [3]:
file

Unnamed: 0,bank,rating,review_title_by_user,review,rating_title_by_user,useful_count
0,SBI,4.0,"""Best saving""",State Bank Of India is located nearby in our a...,Great!,133
1,SBI,5.0,"""Good service""","I have my salary account in SBI, when I applie...",Blown Away!,89
2,Axis Bank,5.0,"""Excellent Service""",I am using Axis bank saving account for the p...,Blown Away!,48
3,HDFC Bank,5.0,"""Excellent service""",I have my salary bank account in HDFC bank for...,Blown Away!,52
4,review,5.0,"""Good account""","Close to around 10 years, I am holding this Co...",Blown Away!,22
...,...,...,...,...,...,...
995,review,3.5,"""Good network of ATM""",I am holding a saving account with FEDERAL ban...,Pretty good,3
996,Axis Bank,4.0,"""Mobile app is good""",There is no mandatory balance to keep in my Ax...,Great!,0
997,Axis Bank,4.0,"""Unhappy with the charges""","In Axis bank, every month they are charging me...",Great!,0
998,Axis Bank,5.0,"""Good Bank""",I have a salary account with AXIS bank and I h...,Blown Away!,0


In [4]:
file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   bank                  1000 non-null   object 
 1   rating                1000 non-null   float64
 2   review_title_by_user  1000 non-null   object 
 3   review                1000 non-null   object 
 4   rating_title_by_user  1000 non-null   object 
 5   useful_count          1000 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 47.0+ KB


# Preprocessing

In [5]:
file['rating_title_by_user'].head()

0         Great!
1    Blown Away!
2    Blown Away!
3    Blown Away!
4    Blown Away!
Name: rating_title_by_user, dtype: object

In [6]:
file['rating_title_by_user'].unique().tolist()

['Great!',
 'Blown Away!',
 'Satisfactory',
 'Excellent!',
 'Unacceptable',
 'Expected more',
 'Pretty good',
 'Bad',
 'Really Bad',
 'Just OK']

In [7]:
file['rating'].value_counts()

5.0    550
4.0    257
3.0     71
4.5     45
2.0     30
3.5     21
1.0     13
0.5     10
2.5      2
1.5      1
Name: rating, dtype: int64

In [8]:
def convert(df, column='enter'):
    data = []
    datan = []
    for i in df[column]:
        if i >= 4:
            data.append('positive')
            datan.append(2)
        elif i < 4 and i >= 2:
            data.append('neutral')
            datan.append(1)
        else:
            data.append('negtive')
            datan.append(0)
    return data, datan
    
       

In [9]:
data, datan = convert(file, column='rating')

In [10]:
file['rating1'] = data
file['ratingn'] = datan

In [11]:
file['rating1'].value_counts()

positive    852
neutral     124
negtive      24
Name: rating1, dtype: int64

In [12]:
data1 = []

In [13]:
for words in file['review']:
        words = str(words)
        data1.append(re.sub(r'[^a-zA-Z\s]', '', words).lower())

In [14]:
file['cleared'] = data1

# Stop Word and toeknization

In [15]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [16]:
file['Tokenized_Reviews'] = file['cleared'].apply(lambda x: word_tokenize(str(x)))

In [17]:
filtered = []

stopword = set(stopwords.words('english'))

In [18]:
for words in file['Tokenized_Reviews']:
    filtered.append([word for word in words if word not in stopword])
    

# Lemmitization 

In [19]:
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()

In [20]:
data5 = []

In [21]:
for words in filtered:
    data5.append([lem.lemmatize(word) for word in words])

# Spelling correction 

In [22]:
from autocorrect import Speller

spell = Speller(lang='en')

In [23]:
new = []

In [24]:
for words in data5:
    new.append([spell(word) for word in words])

In [25]:
file['final'] = new

In [26]:
final = pd.DataFrame()
final['review'] = file['final']
final['output'] = file['ratingn']

In [27]:
final

Unnamed: 0,review,output
0,"[state, bank, india, located, nearby, area, op...",2
1,"[salary, account, sbi, applied, card, got, sta...",2
2,"[using, axis, bank, saving, account, past, yea...",2
3,"[salary, bank, account, fdfc, bank, many, year...",2
4,"[close, around, year, holding, corporation, ba...",2
...,...,...
995,"[holding, saving, account, federal, bank, char...",1
996,"[mandatory, balance, keep, axis, bank, interes...",2
997,"[axis, bank, every, month, charging, around, r...",2
998,"[salary, account, axis, bank, hold, account, l...",2


# Vectorization

# WORD2VEC

In [28]:
import gensim 
from gensim.models import Word2Vec, KeyedVectors
import numpy as np

In [29]:
final['review'] = final['review'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

In [30]:
tokenized_text = [text.split() for text in final['review']]
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=2, workers=4)

In [31]:
def document_vector(word2vec_model, doc):
    # Remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv]
    return np.mean(word2vec_model.wv[doc], axis=0)

In [32]:
X = np.array([document_vector(word2vec_model, doc) for doc in tokenized_text])

In [33]:
experiment = pd.DataFrame(X)

# Encoding 

# Label ENcoder

In [34]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y = pd.DataFrame(le.fit_transform(file['rating1']))

# Imbalance Dataset 

In [35]:
from imblearn.over_sampling import SMOTE


In [36]:
smote = SMOTE(random_state=42)


x_resampled, y_resampled = smote.fit_resample(experiment, y)

In [37]:
print(y_resampled.shape, x_resampled.shape)

(2556, 1) (2556, 100)


# Train Test split

In [38]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=.20, random_state=24)

# Model building With WORD2VEC

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

In [40]:
lr = LogisticRegression(penalty='l1', C= 6.0, solver='saga')
lr.fit(x_train, y_train)

train_lr = lr.predict(x_train)
test_lr = lr.predict(x_test)

# Accuracy and classification report

In [41]:
print('train', accuracy_score(y_train, train_lr), 'test', accuracy_score(test_lr, y_test))

train 0.5934442270058709 test 0.57421875


In [42]:
print('train', classification_report(y_train, train_lr),'---------------------------------------------------------------------------------------------------------------------------','test', classification_report(test_lr, y_test))

train               precision    recall  f1-score   support

           0       0.73      0.66      0.69       672
           1       0.50      0.50      0.50       693
           2       0.57      0.62      0.59       679

    accuracy                           0.59      2044
   macro avg       0.60      0.59      0.60      2044
weighted avg       0.60      0.59      0.59      2044
 --------------------------------------------------------------------------------------------------------------------------- test               precision    recall  f1-score   support

           0       0.66      0.73      0.69       162
           1       0.44      0.41      0.42       171
           2       0.61      0.59      0.60       179

    accuracy                           0.57       512
   macro avg       0.57      0.58      0.57       512
weighted avg       0.57      0.57      0.57       512



# XGBClassifier

In [138]:
rfc = XGBClassifier(n_estimators=95, max_depth=2, learning_rate=0.15, random_state=24)

rfc.fit(x_train, y_train)

x_train_r = rfc.predict(x_train)
x_test_r = rfc.predict(x_test)


# Accuracy Score Classification_report

In [139]:
accuracy_score(y_train, x_train_r)

0.9021526418786693

In [140]:
accuracy_score(x_test_r, y_test)

0.8359375

In [141]:
print(classification_report(y_train, x_train_r))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       672
           1       0.86      0.88      0.87       693
           2       0.89      0.82      0.86       679

    accuracy                           0.90      2044
   macro avg       0.90      0.90      0.90      2044
weighted avg       0.90      0.90      0.90      2044



In [142]:
print(classification_report(x_test_r, y_test))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96       188
           1       0.79      0.74      0.76       169
           2       0.73      0.82      0.77       155

    accuracy                           0.84       512
   macro avg       0.83      0.83      0.83       512
weighted avg       0.84      0.84      0.84       512



# cross Validation

In [143]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [144]:
num_folds = 5

cv = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

cross_val_results1 = cross_val_score(lr, x_resampled, y_resampled, cv=cv, scoring='accuracy')

In [145]:
cross_val_results1

array([0.5703125 , 0.63600783, 0.5890411 , 0.56751468, 0.59295499])

In [146]:
cross_val_results1.mean()

0.5911662181996087

In [147]:
num_folds = 5

cv = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

cross_val_results = cross_val_score(rfc, x_resampled, y_resampled, cv=cv, scoring='accuracy')

In [148]:
cross_val_results

array([0.82617188, 0.8590998 , 0.8297456 , 0.82191781, 0.82387476])

In [149]:
cross_val_results.mean()

0.8321619679549901

# Model 2 with TFIDF

# TFIDF

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')


X_train_tfidf = tfidf_vectorizer.fit_transform(final['review']).toarray()

vector = pd.DataFrame(X_train_tfidf)

# imbalance Dataset 

In [56]:
x_resampled1, y_resampled1 = smote.fit_resample(vector, y)
print(x_resampled1.shape, y_resampled1.shape)

(2556, 1200) (2556, 1)


# Feature Extraction

In [57]:
from sklearn.decomposition import PCA

In [150]:
pc = PCA(n_components=35)

compressed = pd.DataFrame(pc.fit_transform(x_resampled1))

In [151]:
compressed.shape

(2556, 35)

# Train test Split

In [152]:
from sklearn.model_selection import train_test_split

x_train1, x_test1, y_train1, y_test1 = train_test_split(compressed, y_resampled1, test_size=.20, random_state=24)

# Logistic Regression

In [153]:
lr1 = LogisticRegression(penalty='l1', C= 6.0, solver='saga')
lr1.fit(x_train1, y_train1)

train_lr1 = lr1.predict(x_train1)
test_lr1 = lr1.predict(x_test1)

# Accuracy Score and Classification report

In [154]:
print('train', accuracy_score(y_train1, train_lr1), 'test', accuracy_score(test_lr1, y_test1))

train 0.8556751467710372 test 0.826171875


In [155]:
print(classification_report(y_train1, train_lr1))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       672
           1       0.79      0.79      0.79       693
           2       0.79      0.78      0.79       679

    accuracy                           0.86      2044
   macro avg       0.86      0.86      0.86      2044
weighted avg       0.85      0.86      0.86      2044



In [156]:
print(classification_report(test_lr1, y_test1))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       183
           1       0.75      0.71      0.73       169
           2       0.71      0.77      0.74       160

    accuracy                           0.83       512
   macro avg       0.82      0.82      0.82       512
weighted avg       0.83      0.83      0.83       512



# XGBClassifier

In [157]:
rfc1 = XGBClassifier(n_estimators=89, max_depth=2, learning_rate=0.10)

rfc1.fit(x_train1, y_train1)

x_train_r1 = rfc1.predict(x_train1)
x_test_r1 = rfc1.predict(x_test1)


# Accuracy Score and Classification report

In [158]:
print('train', accuracy_score(y_train1, x_train_r1), 'test', accuracy_score(x_test_r1, y_test1))

train 0.9055772994129159 test 0.853515625


In [159]:
print(classification_report(y_train1, x_train_r1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       672
           1       0.87      0.84      0.86       693
           2       0.85      0.88      0.86       679

    accuracy                           0.91      2044
   macro avg       0.91      0.91      0.91      2044
weighted avg       0.91      0.91      0.91      2044



In [160]:
print(classification_report(x_test_r1, y_test1))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       180
           1       0.78      0.76      0.77       163
           2       0.78      0.80      0.79       169

    accuracy                           0.85       512
   macro avg       0.85      0.85      0.85       512
weighted avg       0.85      0.85      0.85       512



# Cross Validation 

In [161]:
num_folds = 5

cv = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

cross_val_results12 = cross_val_score(lr1, compressed, y_resampled1, cv=cv, scoring='accuracy')

In [162]:
cross_val_results12.mean()

0.8485995596868884

In [163]:
cross_val_results12

array([0.828125  , 0.86497065, 0.85127202, 0.83170254, 0.86692759])

In [164]:
num_folds = 5

cv = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

cross_val_results14 = cross_val_score(rfc1, compressed, y_resampled1, cv=cv, scoring='accuracy')

In [165]:
cross_val_results14

array([0.85351562, 0.87671233, 0.87279843, 0.86692759, 0.87084149])

In [166]:
cross_val_results14.mean()

0.8681590936888455