In [1]:
import pandas as pd
import numpy as np


In [2]:
df_original = pd.read_csv('reviews.csv')

In [3]:
df = df_original.copy()
df['y'] = df.Label.replace({1: 0, 2: 0, 3: 1, 4: 2, 5: 2})
df.tail()

Unnamed: 0,Id,Review,Label,y
107013,107013,Trendy topic with talks from expertises in the...,4,2
107014,107014,"Wonderful! Simple and clear language, good ins...",5,2
107015,107015,an interesting and fun course. thanks. dr quincy,5,2
107016,107016,"very broad perspective, up to date information...",4,2
107017,107017,An informative course on the social and financ...,4,2


In [4]:
df['y'].value_counts()

2    97227
1     5071
0     4720
Name: y, dtype: int64

In [5]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

stemmer = WordNetLemmatizer()
porter = PorterStemmer()
STOPWORDS = set(stopwords.words('english'))

def preprocessingText(corpus, lowercase=True, rmPunctuation=True, rpURL=True, rpNumber=True, stemming=True):
    """Input is assumed to be vector of documents"""
    documents = []
    for text in corpus:
        document = text
        
        # HYPERPARAMETER
        # Converting to Lowercase
        if lowercase:
            document = document.lower()

        # replace URL
        if rpURL:
            # replace URL
            document = re.sub(r'http\S+', 'url', document, flags=re.MULTILINE)

        # replace numbers
        if rpNumber:
            document = re.sub("\d+", "number", document)

        # remove all special characters including punctuation
        if rmPunctuation:
            # only keep word
            document = re.sub(r'\W', ' ', document)
            # remove all single characters
            document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
            # Remove single characters from the start
            document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # OTHER PREPROCESSING METHODS
        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)
        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)
        
        # removing stopwords
        document = document.split()
        document = [word for word in document if word not in STOPWORDS]

        if stemming:
            # Lemmatization
            document = [stemmer.lemmatize(word) for word in document]
            # stemming
            document = [porter.stem(word) for word in document]

        document = ' '.join(document)
        documents.append(document)
    return documents

In [6]:
def preprocessingText_not_moving_stop_words(corpus, lowercase=True, rmPunctuation=True, rpURL=True, rpNumber=True, stemming=True):
    """Input is assumed to be vector of documents"""
    documents = []
    for text in corpus:
        document = text
        
        # HYPERPARAMETER
        # Converting to Lowercase
        if lowercase:
            document = document.lower()

        # replace URL
        if rpURL:
            # replace URL
            document = re.sub(r'http\S+', 'url', document, flags=re.MULTILINE)

        # replace numbers
        if rpNumber:
            document = re.sub("\d+", "number", document)

        # remove all special characters including punctuation
        if rmPunctuation:
            # only keep word
            document = re.sub(r'\W', ' ', document)
            # remove all single characters
            document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
            # Remove single characters from the start
            document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # OTHER PREPROCESSING METHODS
        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)
        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)
        
        # removing stopwords
        document = document.split()
        document = [word for word in document if word not in STOPWORDS]

        if stemming:
            # Lemmatization
            document = [stemmer.lemmatize(word) for word in document]
            # stemming
            document = [porter.stem(word) for word in document]

        document = ' '.join(document)
        documents.append(document)
    return documents

In [7]:
df['text'] = preprocessingText(df.Review)
print(df.tail().text)

107013    trendi topic talk expertis field cover area in...
107014    wonder simpl clear languag good instructor gre...
107015                   interest fun cours thank dr quinci
107016    broad perspect date inform use link video good...
107017    inform cours social financi implic due zika we...
Name: text, dtype: object


In [8]:
np.random.seed(8307)
positive_indices = df[df.y == 2].index
random_indices = np.random.choice(positive_indices, 5071, replace=False)
positive_sample = df.loc[random_indices]
positive_sample

Unnamed: 0,Id,Review,Label,y,text
67348,67348,The course is very interesting. In the beginni...,5,2,cours interest begin seem bit heavi neurosci i...
38747,38747,Considering that internet and media regulation...,5,2,consid internet medium regul hot topic polici ...
17832,17832,Quite useful for introducing Data Science. I w...,5,2,quit use introduc data scienc enrol data scien...
65168,65168,Great course!This course is perfect for beginn...,5,2,great cours cours perfect beginn interest lear...
46273,46273,Very useful and effective,4,2,use effect
...,...,...,...,...,...
83880,83880,It's a very good course. I learn more about th...,5,2,good cours learn lesson thank teacher lesson
30829,30829,This is a really good start.,5,2,realli good start
8588,8588,good for basic knowledge in cancer biology.,5,2,good basic knowledg cancer biolog
91826,91826,Fantastic course as always from the legendary ...,5,2,fantast cours alway legendari dr chuck


In [9]:
df2 = pd.concat([positive_sample, df[df['y'] != 2]], verify_integrity=True)
df2['y'].value_counts()

2    5071
1    5071
0    4720
Name: y, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df2["text"], list(df2['y']), 
                                                    test_size=0.1, 
                                                    stratify=df2['y'],
                                                    random_state=8307)


In [11]:
from nltk import word_tokenize
X_train_token = []
X_test_token = []

In [12]:
for words in X_train:
    words = word_tokenize(words)
    X_train_token.append(words)

for words in X_test:
    words = word_tokenize(words)
    X_test_token.append(words)    

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_token)
word_index = tokenizer.index_word

In [15]:
encoded_train_data = tokenizer.texts_to_sequences(X_train_token)

In [16]:
encoded_test_data = tokenizer.texts_to_sequences(X_test_token)

In [17]:
max_length = max([len(w) for w in encoded_train_data])

In [18]:
max_length

576

In [19]:
X_train_pad = pad_sequences(encoded_train_data, 
                        maxlen=max_length, 
                        padding='post')

In [20]:
X_test_pad = pad_sequences(encoded_test_data, 
                        maxlen=max_length, 
                        padding='post')

In [21]:
y_train_dummy = pd.get_dummies(y_train)

In [22]:
y_test_dummy = pd.get_dummies(y_test)

tf-idf embedding

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [25]:
tfidf = TfidfVectorizer()
tfidf.fit(X_train)

X_train_tf = tfidf.transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [26]:
X_train_array = X_train_tf.toarray()
X_test_array = X_test_tf.toarray()

In [27]:
X_train_array.shape

(13375, 9618)

In [28]:
tfidf_dict = tfidf.vocabulary_

In [29]:
tfdf_embedding_matrix = np.zeros((len(word_index)+1,100))

In [30]:
for k,v in word_index.items():
    try:
        vector = tfidf_dict[v]
        tfdf_embedding_matrix[k] = vector
    except:
        continue
        
        

In [31]:
tfdf_embedding_matrix

array([[   0.,    0.,    0., ...,    0.,    0.,    0.],
       [1807., 1807., 1807., ..., 1807., 1807., 1807.],
       [5686., 5686., 5686., ..., 5686., 5686., 5686.],
       ...,
       [3508., 3508., 3508., ..., 3508., 3508., 3508.],
       [2469., 2469., 2469., ..., 2469., 2469., 2469.],
       [9551., 9551., 9551., ..., 9551., 9551., 9551.]])

In [32]:
tfdf_embedding_matrix.shape

(9645, 100)

word2vec embedding

In [33]:
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim import utils
import torch
import gensim.downloader as api


In [34]:
model = Word2Vec(X_train_token,epochs=50)

In [35]:
my_wv = model.wv

In [36]:
w2vdict = {}
for key in my_wv.index_to_key:
    w2vdict[key] = my_wv[key]

In [37]:
len(w2vdict)


3067

In [38]:
#matrix
embedding_matrix = np.zeros((len(word_index)+1,100))

In [39]:
for k,v in word_index.items():
    try:
        vector = model.wv[v]
        embedding_matrix[k] = vector
    except:
        continue
        

In [40]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00730391, -0.48694927, -0.16433622, ...,  1.07726753,
         0.24238859,  0.92591852],
       [ 0.04689787,  1.3033303 ,  1.31418991, ...,  0.20105635,
        -1.60167646, -0.41524702],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## NN TF-IDF

In [41]:
import os
import torch
import random
from torch.utils.data import Dataset

In [37]:
class NLPDATA(Dataset):
    def __init__(self,X,Y,feature_type = torch.int64,target_type =torch.int64):
        self.x = X
        self.y = Y
        self.feature_type = feature_type
        self.target_type = target_type
    
    def __getitem__(self, idx):
        feature = self.x[idx]
        feature = torch.tensor(feature,dtype = self.feature_type)
        target = self.y[idx]
        target = torch.tensor(target,dtype=self.target_type)
        
        return feature,target
    
    def __len__(self):
        return len(self.x)


In [38]:
traindata_tf = NLPDATA(X_train_array,y_train,feature_type=torch.float32)
valdata_tf = NLPDATA(X_test_array,y_test,feature_type=torch.float32)

In [33]:
from torch.utils.data import DataLoader
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.nn import BCELoss

In [40]:
class NormalNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        
        super().__init__()
                
        self.classifier = nn.Sequential(
            nn.Linear(in_features=input_dim,out_features=hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim,hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim,output_dim),
            nn.Softmax(dim=1)
            )
        
        
    def forward(self, x):
        x = self.classifier(x)
        return x

    

In [41]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 64
lr = 0.001
input_dim = 9618
hidden_dim = 256
output_dim = 3
mynn = NormalNN(input_dim,hidden_dim,output_dim).to(device)
optim = Adam(mynn.parameters(),lr=lr)
loss = CrossEntropyLoss().to(device)
train_loader_tf = DataLoader(traindata_tf, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=False)
validate_loader_tf = DataLoader(valdata_tf, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=False)

In [44]:
def main(model,epochs,optim,loss,train_loader,validate_loader):
    for epoch in range(epochs):
        running_loss= 0
        validation_loss = 0 
        total_correct = 0
        for data in train_loader:
            optim.zero_grad()
            features, targets = data
            features, targets = features.to(device),targets.to(device)
            output = model(features)
            result = loss(output,targets)
            result.backward()
            optim.step()
            running_loss +=result
        with torch.no_grad():
            total_len = len(valdata_tf)
            for data in validate_loader:
                features,targets = data
                features, targets = features.to(device),targets.to(device)
                output = model(features)
                result = loss(output,targets)
                validation_loss += result
                correct_num = sum(targets.eq(output.argmax(dim=1)))
                total_correct+=correct_num
        accuracy = total_correct/total_len   
        print("epoch: ",epoch,"running loss: ", running_loss.item(),"accuracy: ",accuracy.item(),"validation loss: ",validation_loss.item())

In [45]:
main(mynn,20,optim,loss,train_loader_tf,validate_loader_tf)

epoch:  0 running loss:  197.6351318359375 accuracy:  0.6267653107643127 validation loss:  21.87410545349121
epoch:  1 running loss:  181.97772216796875 accuracy:  0.64492267370224 validation loss:  21.47779083251953
epoch:  2 running loss:  171.2307891845703 accuracy:  0.6745124459266663 validation loss:  20.64545249938965
epoch:  3 running loss:  162.8377227783203 accuracy:  0.6785473823547363 validation loss:  20.55609893798828
epoch:  4 running loss:  157.66128540039062 accuracy:  0.6765299439430237 validation loss:  20.531936645507812
epoch:  5 running loss:  153.61807250976562 accuracy:  0.6657699942588806 validation loss:  20.894866943359375
epoch:  6 running loss:  150.20263671875 accuracy:  0.6597175598144531 validation loss:  20.85379409790039
epoch:  7 running loss:  147.83615112304688 accuracy:  0.6637524962425232 validation loss:  20.86774444580078
epoch:  8 running loss:  145.4949951171875 accuracy:  0.6597175598144531 validation loss:  21.12055206298828
epoch:  9 running

In [46]:
len(valdata_tf)

1487

In [47]:
testloader = DataLoader(valdata_tf, batch_size=len(valdata_tf), shuffle=False, num_workers=0, drop_last=False)

In [50]:
with torch.no_grad():
    total_len = len(valdata_tf)
    for data in testloader:
        features,targets = data
        features, targets = features.to(device),targets.to(device)
        output = mynn(features)
        correct_num = sum(targets.eq(output.argmax(dim=1)))
acc = correct_num/total_len
acc



tensor(0.6463)

In [63]:
target_name = ["negative","neutual","positive"]

In [64]:
output = output.argmax(dim=1).numpy()

In [55]:
from sklearn.metrics import classification_report

In [67]:
print(classification_report(y_test,output))

              precision    recall  f1-score   support

           0       0.64      0.58      0.61       472
           1       0.54      0.57      0.55       508
           2       0.76      0.79      0.77       507

    accuracy                           0.65      1487
   macro avg       0.65      0.64      0.64      1487
weighted avg       0.65      0.65      0.65      1487



## NN-W2V

In [68]:
class NNW2V(nn.Module):
    def __init__(self, embedding_dim, output_dim,weights):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(weights)
        self.embedding.requires_grad_ = False        
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x = x.mean(dim=1)
        # pooled = [batch size, embedding dim]
        prediction = self.fc(x)
        return prediction

In [69]:
traindata_nn_w2v = NLPDATA(X_train_pad,y_train,feature_type=torch.int64)
valdata_nn_w2v = NLPDATA(X_test_pad,y_test,feature_type=torch.int64)

In [70]:
weights = torch.FloatTensor(embedding_matrix)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 64
lr = 0.001
embedding_dim = 100
output_dim = 3
mynnw2v = NNW2V(embedding_dim,output_dim,weights).to(device)
optim = Adam(mynnw2v.parameters(),lr=lr)
loss = CrossEntropyLoss().to(device)
train_loader_nn_w2v = DataLoader(traindata_nn_w2v, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=False)
validate_loader_nn_w2v = DataLoader(valdata_nn_w2v, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=False)

In [81]:
test_loader_nn_w2v = DataLoader(valdata_nn_w2v, batch_size=len(valdata_nn_w2v), shuffle=False, num_workers=0, drop_last=False)

In [77]:
main(mynnw2v,20,optim,loss,train_loader_nn_w2v,validate_loader_nn_w2v)

epoch:  0 running loss:  168.89596557617188 accuracy:  0.6556825637817383 validation loss:  19.621353149414062
epoch:  1 running loss:  168.81578063964844 accuracy:  0.6536651253700256 validation loss:  19.70125961303711
epoch:  2 running loss:  168.74046325683594 accuracy:  0.6536651253700256 validation loss:  19.458599090576172
epoch:  3 running loss:  168.66761779785156 accuracy:  0.6550101041793823 validation loss:  19.488645553588867
epoch:  4 running loss:  168.5882110595703 accuracy:  0.6523200869560242 validation loss:  19.52179718017578
epoch:  5 running loss:  168.51644897460938 accuracy:  0.6536651253700256 validation loss:  19.345821380615234
epoch:  6 running loss:  168.44239807128906 accuracy:  0.6529926061630249 validation loss:  19.55341911315918
epoch:  7 running loss:  168.37200927734375 accuracy:  0.6509751081466675 validation loss:  19.70565414428711
epoch:  8 running loss:  168.30886840820312 accuracy:  0.6523200869560242 validation loss:  19.391353607177734
epoch:

In [83]:
with torch.no_grad():
    total_len = len(valdata_nn_w2v)
    for data in test_loader_nn_w2v:
        features,targets = data
        features, targets = features.to(device),targets.to(device)
        output = mynnw2v(features)
        correct_num = sum(targets.eq(output.argmax(dim=1)))
acc = correct_num/total_len
acc


tensor(0.6543)

In [84]:
output = output.argmax(dim=1).numpy()
print(classification_report(y_test,output))

              precision    recall  f1-score   support

           0       0.71      0.57      0.63       472
           1       0.58      0.54      0.56       508
           2       0.67      0.85      0.75       507

    accuracy                           0.65      1487
   macro avg       0.66      0.65      0.65      1487
weighted avg       0.65      0.65      0.65      1487



# RNN W2V

In [42]:
from tensorflow.keras.layers import GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Conv1D, MaxPooling1D,Dropout

In [43]:
word2vec_embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                                     output_dim=embedding_matrix.shape[1], 
                                     weights=[embedding_matrix],
                                     input_length=max_length, 
                                     trainable=False)

In [44]:
#Define the RNN model 
RNN_Model = Sequential()
RNN_Model.add(word2vec_embedding_layer)
RNN_Model.add(GRU(units=64,return_sequences=True,dropout=0.1))
RNN_Model.add(GRU(units=32,dropout=0.1))
RNN_Model.add(Dense(units=3,activation="softmax"))
RNN_Model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 100)          964500    
                                                                 
 gru (GRU)                   (None, 128, 64)           31872     
                                                                 
 gru_1 (GRU)                 (None, 32)                9408      
                                                                 
 dense (Dense)               (None, 3)                 99        
                                                                 
Total params: 1,005,879
Trainable params: 41,379
Non-trainable params: 964,500
_________________________________________________________________


In [37]:
RNN_Model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
RNN_Model.fit(X_train_pad, y_train_dummy,batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23fd62ab3a0>

In [107]:
output_rnn = RNN_Model.predict(np.array(X_test))
output_rnn = np.array(output_rnn).argmax(axis=1)
print(classification_report(y_test,output_rnn))



              precision    recall  f1-score   support

           0       0.32      1.00      0.48       472
           1       0.00      0.00      0.00       508
           2       0.00      0.00      0.00       507

    accuracy                           0.32      1487
   macro avg       0.11      0.33      0.16      1487
weighted avg       0.10      0.32      0.15      1487



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#try with different padding size

In [49]:
max_length_2 = 128

In [50]:
X_train_pad_2 = pad_sequences(encoded_train_data, 
                        maxlen=max_length_2, 
                        padding='post')

In [51]:
X_test_pad_2 = pad_sequences(encoded_test_data, 
                        maxlen=max_length_2, 
                        padding='post')

In [65]:
RNN_Model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
RNN_Model.fit(X_train_pad_2, y_train_dummy,batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1aeb12dc4c0>

In [66]:
output = RNN_Model.predict(np.array(X_test_pad_2))
output = np.array(output).argmax(axis=1)
print(classification_report(y_test,output))

              precision    recall  f1-score   support

           0       0.68      0.63      0.65       472
           1       0.58      0.58      0.58       508
           2       0.75      0.80      0.77       507

    accuracy                           0.67      1487
   macro avg       0.67      0.67      0.67      1487
weighted avg       0.67      0.67      0.67      1487



# CNN W2V

In [87]:
CNN_model = Sequential()      # initilaizing the Sequential nature for CNN model
# Adding the embedding layer which will take in maximum of 450 words as input and provide a 32 dimensional output of those words which belong in the top_words dictionary
CNN_model.add(word2vec_embedding_layer)
CNN_model.add(Conv1D(32, 3, padding='same', activation='relu'))
CNN_model.add(Dropout(0.5))
CNN_model.add(MaxPooling1D())
CNN_model.add(Flatten())
CNN_model.add(Dense(250, activation='relu'))
CNN_model.add(Dropout(0.5))
CNN_model.add(Dense(3, activation='softmax'))
CNN_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
CNN_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 576, 100)          964500    
                                                                 
 conv1d (Conv1D)             (None, 576, 32)           9632      
                                                                 
 dropout (Dropout)           (None, 576, 32)           0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 288, 32)          0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 9216)              0         
                                                                 
 dense (Dense)               (None, 250)               2304250   
                                                        

In [98]:
CNN_model.fit(X_train_pad, y_train_dummy,batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1d7573d3f70>

In [99]:
# Report the accuracy scores for the training and test data.
loss_train, acc_train = CNN_model.evaluate(X_train_pad, y_train_dummy, verbose=0)
print('Train Accuracy: %.4f' % (acc_train*100))

loss, acc = CNN_model.evaluate(X_test_pad, y_test_dummy, verbose=0)
print('Test Accuracy: %.4f' % (acc*100))

Train Accuracy: 92.4187
Test Accuracy: 65.7700


In [104]:
output_cnn = CNN_model.predict(X_test_pad)



In [110]:
output_cnn = np.array(output_cnn).argmax(axis=1)

In [111]:
print(classification_report(y_test,output_cnn))

              precision    recall  f1-score   support

           0       0.69      0.58      0.63       472
           1       0.52      0.69      0.59       508
           2       0.83      0.71      0.76       507

    accuracy                           0.66      1487
   macro avg       0.68      0.66      0.66      1487
weighted avg       0.68      0.66      0.66      1487

