This notebook is written to extracted several features from the text and then the extracted features are used to classify them into the given classes. The implementation is performed using the spam dataset. The participants have to use different datasets given in the Dataset folder and perform the classification. 

You have to specify your own path where the code and datasets are present

In [1]:
import pandas as pd 
data =  pd.read_csv('SMSSpamCollection', sep = '\t', names = ['label', 'message'])
data.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
text = data['message']
label = data['label']

In [3]:
label

0        ham
1        ham
2       spam
3        ham
4        ham
5       spam
6        ham
7        ham
8       spam
9       spam
10       ham
11      spam
12      spam
13       ham
14       ham
15      spam
16       ham
17       ham
18       ham
19      spam
20       ham
21       ham
22       ham
23       ham
24       ham
25       ham
26       ham
27       ham
28       ham
29       ham
        ... 
5542     ham
5543     ham
5544     ham
5545     ham
5546     ham
5547    spam
5548     ham
5549     ham
5550     ham
5551     ham
5552     ham
5553     ham
5554     ham
5555     ham
5556     ham
5557     ham
5558     ham
5559     ham
5560     ham
5561     ham
5562     ham
5563     ham
5564     ham
5565     ham
5566    spam
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [4]:
#Number of Words
#x = lambda a : a + 10
#print(x(5))
data['word_count'] = data['message'].apply(lambda x: len(str(x).split(" ")))
data[['message','word_count']].head()

Unnamed: 0,message,word_count
0,"Go until jurong point, crazy.. Available only ...",20
1,Ok lar... Joking wif u oni...,6
2,Free entry in 2 a wkly comp to win FA Cup fina...,28
3,U dun say so early hor... U c already then say...,11
4,"Nah I don't think he goes to usf, he lives aro...",13


In [5]:
#Number of characters
data['char_count'] = data['message'].str.len() ## this also includes spaces
data[['message','char_count']].head()

Unnamed: 0,message,char_count
0,"Go until jurong point, crazy.. Available only ...",111
1,Ok lar... Joking wif u oni...,29
2,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,U dun say so early hor... U c already then say...,49
4,"Nah I don't think he goes to usf, he lives aro...",61


In [6]:
#Average Word Length
def avg_word(sentence):
  words = sentence.split()
  #print(words)
  return (sum(len(word) for word in words)/len(words))

data['avg_word'] = data['message'].apply(lambda x: avg_word(x))
data[['message','avg_word']].head()

Unnamed: 0,message,avg_word
0,"Go until jurong point, crazy.. Available only ...",4
1,Ok lar... Joking wif u oni...,4
2,Free entry in 2 a wkly comp to win FA Cup fina...,4
3,U dun say so early hor... U c already then say...,3
4,"Nah I don't think he goes to usf, he lives aro...",3


In [7]:
#Number of stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

data['stopwords'] = data['message'].apply(lambda x: len([x for x in x.split() if x in stop]))
data[['message','stopwords']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


  import sys


Unnamed: 0,message,stopwords
0,"Go until jurong point, crazy.. Available only ...",4
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,5
3,U dun say so early hor... U c already then say...,2
4,"Nah I don't think he goes to usf, he lives aro...",5


In [8]:
#Number of special characters
data['hastags'] = data['message'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
data[['message','hastags']].head()

Unnamed: 0,message,hastags
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
#Number of numerics
data['numerics'] = data['message'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
data[['message','numerics']].head()

Unnamed: 0,message,numerics
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,2
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [10]:
#Number of Uppercase words
data['upper'] = data['message'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
data[['message','upper']].head()

Unnamed: 0,message,upper
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,2
3,U dun say so early hor... U c already then say...,2
4,"Nah I don't think he goes to usf, he lives aro...",1


In [11]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
from textblob import TextBlob, Word, Blobber
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

data['noun_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'noun'))
data['verb_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'verb'))
data['adj_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'adj'))
data['adv_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'adv'))
data['pron_count'] = data['message'].apply(lambda x: check_pos_tag(x, 'pron'))
data[['message','noun_count','verb_count','adj_count', 'adv_count', 'pron_count' ]].head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Unnamed: 0,message,noun_count,verb_count,adj_count,adv_count,pron_count
0,"Go until jurong point, crazy.. Available only ...",9,1,3,3,0
1,Ok lar... Joking wif u oni...,4,1,1,0,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,14,3,4,0,0
3,U dun say so early hor... U c already then say...,3,3,2,3,0
4,"Nah I don't think he goes to usf, he lives aro...",1,5,0,3,3


In [14]:
data[['message','word_count','char_count','avg_word','stopwords','hastags','numerics','upper','noun_count','verb_count','adj_count', 'adv_count', 'pron_count','label' ]].head()

Unnamed: 0,message,word_count,char_count,avg_word,stopwords,hastags,numerics,upper,noun_count,verb_count,adj_count,adv_count,pron_count,label
0,"Go until jurong point, crazy.. Available only ...",20,111,4,4,0,0,0,9,1,3,3,0,ham
1,Ok lar... Joking wif u oni...,6,29,4,0,0,0,0,4,1,1,0,0,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,28,155,4,5,0,2,2,14,3,4,0,0,spam
3,U dun say so early hor... U c already then say...,11,49,3,2,0,0,2,3,3,2,3,0,ham
4,"Nah I don't think he goes to usf, he lives aro...",13,61,3,5,0,0,1,1,5,0,3,3,ham


In [15]:
features = data[['word_count','char_count','avg_word','stopwords','hastags','numerics','upper','noun_count','verb_count','adj_count', 'adv_count', 'pron_count']]

In [16]:
#label = data['label']

import numpy as np
classes_list = ["ham","spam"]
label_index = data['label'].apply(classes_list.index)
label = np.asarray(label_index)
label

array([0, 0, 1, ..., 0, 0, 0])

In [17]:
import numpy as np
features_array = np.asarray(features)


In [18]:
features_array.shape

(5572, 12)

In [19]:
# data split into train and text
import numpy as np
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_array, label, test_size=0.33, random_state=90)

In [20]:
x_train.shape

(3733, 12)

In [None]:

from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn.svm import SVC
model_SVM = SVC()
model_SVM.fit(x_train, y_train)
y_pred_SVM = model_SVM.predict(x_test)
print("SVM")
print("Accuracy score =", accuracy_score(y_test, y_pred_SVM))
print(metrics.classification_report(y_test, y_pred_SVM))

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100,max_depth=None,min_samples_split=2, random_state=0)
rf.fit(x_train,y_train)
y_pred_rf = rf.predict(x_test)
print("random")
print("Accuracy score =", accuracy_score(y_test, y_pred_rf))
print(metrics.classification_report(y_test, y_pred_rf))

from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(x_train,y_train)
y_pred_LR = LR.predict(x_test)
print("Logistic Regression")
print("Accuracy score =", accuracy_score(y_test, y_pred_LR))
print(metrics.classification_report(y_test, y_pred_LR ))

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 5)
neigh.fit(x_train,y_train)
y_pred_KNN = neigh.predict(x_test)
print("KNN")
print("Accuracy score =", accuracy_score(y_test, y_pred_KNN))
print(metrics.classification_report(y_test, y_pred_KNN ))

from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(x_train,y_train)
y_pred_naive = naive.predict(x_test)
print("Naive Bayes")
print("Accuracy score =", accuracy_score(y_test, y_pred_naive))
print(metrics.classification_report(y_test, y_pred_naive ))

from sklearn.ensemble import GradientBoostingClassifier
gradient = GradientBoostingClassifier(n_estimators=100,max_depth=None,min_samples_split=2, random_state=0)
gradient.fit(x_train,y_train)
y_pred_gradient = gradient.predict(x_test)
print("Gradient Boosting")
print("Accuracy score =", accuracy_score(y_test, y_pred_gradient))
print(metrics.classification_report(y_test, y_pred_gradient ))

    
from sklearn.tree import DecisionTreeClassifier
decision = DecisionTreeClassifier()
decision.fit(x_train,y_train)
y_pred_decision = decision.predict(x_test)
print("Decision Tree")
print("Accuracy score =", accuracy_score(y_test, y_pred_decision))
print(metrics.classification_report(y_test, y_pred_decision ))
    



SVM
('Accuracy score =', 0.9445350734094616)
              precision    recall  f1-score   support

           0       0.94      0.99      0.97      1600
           1       0.95      0.61      0.74       239

   micro avg       0.94      0.94      0.94      1839
   macro avg       0.95      0.80      0.85      1839
weighted avg       0.94      0.94      0.94      1839

random
('Accuracy score =', 0.967373572593801)
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1600
           1       0.94      0.80      0.86       239

   micro avg       0.97      0.97      0.97      1839
   macro avg       0.95      0.90      0.92      1839
weighted avg       0.97      0.97      0.97      1839

Logistic Regression
('Accuracy score =', 0.945078847199565)
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1600
           1       0.83      0.72      0.77       239

   micro avg       0.95      0



Gradient Boosting
('Accuracy score =', 0.9439912996193583)
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1600
           1       0.77      0.80      0.79       239

   micro avg       0.94      0.94      0.94      1839
   macro avg       0.87      0.88      0.88      1839
weighted avg       0.94      0.94      0.94      1839

Decision Tree
('Accuracy score =', 0.9423599782490484)
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1600
           1       0.76      0.82      0.79       239

   micro avg       0.94      0.94      0.94      1839
   macro avg       0.87      0.89      0.88      1839
weighted avg       0.94      0.94      0.94      1839



In [None]:
# data split into train and text
import numpy as np
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_array, label, test_size=0.33, random_state=42)

In [None]:
x_train.shape

(3733, 12)

In [None]:
data = pd.read_csv('SMSSpamCollection', sep = '\t', names = ['label','message'])
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
text = data['message']
class_label = data['label']

In [None]:
import numpy as np
classes_list = ["ham","spam"]
label_index = class_label.apply(classes_list.index)
label = np.asarray(label_index)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text, label, test_size=0.33, random_state=42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = TfidfVectorizer(ngram_range = (1,1))
#vectorizer = CountVectorizer(ngram_range = (1,1))

x_train = vectorizer.fit_transform(X_train)
x_test = vectorizer.transform(X_test)

In [None]:
x_train.shape

(3733, 7082)

In [None]:
vectorizer.get_feature_names()

[u'00',
 u'000',
 u'000pes',
 u'0089',
 u'0121',
 u'01223585236',
 u'01223585334',
 u'02',
 u'0207',
 u'02072069400',
 u'02073162414',
 u'02085076972',
 u'021',
 u'03',
 u'04',
 u'0430',
 u'05',
 u'050703',
 u'0578',
 u'06',
 u'07',
 u'07008009200',
 u'07046744435',
 u'07090298926',
 u'07099833605',
 u'07123456789',
 u'0721072',
 u'07732584351',
 u'07734396839',
 u'07753741225',
 u'0776xxxxxxx',
 u'07781482378',
 u'07786200117',
 u'077xxx',
 u'07801543489',
 u'07808247860',
 u'07815296484',
 u'07821230901',
 u'07880867867',
 u'07946746291',
 u'0796xxxxxx',
 u'07973788240',
 u'07xxxxxxxxx',
 u'08',
 u'0800',
 u'08000407165',
 u'08000776320',
 u'08000839402',
 u'08000930705',
 u'08000938767',
 u'08001950382',
 u'08002888812',
 u'08002986030',
 u'08002986906',
 u'08002988890',
 u'08006344447',
 u'0808',
 u'08081263000',
 u'08081560665',
 u'0825',
 u'083',
 u'0844',
 u'08448350055',
 u'08448714184',
 u'0845',
 u'08450542832',
 u'08452810071',
 u'08452810073',
 u'08452810075over18',
 u'0870

In [None]:
x_train[1,5]

0.0

In [None]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

from sklearn.svm import SVC
model_SVM = SVC()
model_SVM.fit(x_train, y_train)
y_pred_SVM = model_SVM.predict(x_test)
print("SVM")
print("Accuracy score =", accuracy_score(y_test, y_pred_SVM))
print(metrics.classification_report(y_test, y_pred_SVM))

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100,max_depth=None,min_samples_split=2, random_state=0)
rf.fit(x_train,y_train)
y_pred_rf = rf.predict(x_test)
print("random")
print("Accuracy score =", accuracy_score(y_test, y_pred_rf))
print(metrics.classification_report(y_test, y_pred_rf))

from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(x_train,y_train)
y_pred_LR = LR.predict(x_test)
print("Logistic Regression")
print("Accuracy score =", accuracy_score(y_test, y_pred_LR))
print(metrics.classification_report(y_test, y_pred_LR ))

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 5)
neigh.fit(x_train,y_train)
y_pred_KNN = neigh.predict(x_test)
print("KNN")
print("Accuracy score =", accuracy_score(y_test, y_pred_KNN))
print(metrics.classification_report(y_test, y_pred_KNN ))

from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(x_train.toarray(),y_train)
y_pred_naive = naive.predict(x_test.toarray())
print("Naive Bayes")
print("Accuracy score =", accuracy_score(y_test, y_pred_naive))
print(metrics.classification_report(y_test, y_pred_naive ))

from sklearn.ensemble import GradientBoostingClassifier
gradient = GradientBoostingClassifier(n_estimators=100,max_depth=None,min_samples_split=2, random_state=0)
gradient.fit(x_train,y_train)
y_pred_gradient = gradient.predict(x_test)
print("Gradient Boosting")
print("Accuracy score =", accuracy_score(y_test, y_pred_gradient))
print(metrics.classification_report(y_test, y_pred_gradient ))

    
from sklearn.tree import DecisionTreeClassifier
decision = DecisionTreeClassifier()
decision.fit(x_train,y_train)
y_pred_decision = decision.predict(x_test)
print("Decision Tree")
print("Accuracy score =", accuracy_score(y_test, y_pred_decision))
print(metrics.classification_report(y_test, y_pred_decision ))
    

SVM
('Accuracy score =', 0.866231647634584)
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1593
           1       0.00      0.00      0.00       246

   micro avg       0.87      0.87      0.87      1839
   macro avg       0.43      0.50      0.46      1839
weighted avg       0.75      0.87      0.80      1839



  'precision', 'predicted', average, warn_for)


random
('Accuracy score =', 0.9820554649265906)
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1593
           1       1.00      0.87      0.93       246

   micro avg       0.98      0.98      0.98      1839
   macro avg       0.99      0.93      0.96      1839
weighted avg       0.98      0.98      0.98      1839

Logistic Regression
('Accuracy score =', 0.9864056552474171)
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1593
           1       0.99      0.91      0.95       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.99      0.95      0.97      1839
weighted avg       0.99      0.99      0.99      1839

KNN
('Accuracy score =', 0.9140837411636759)
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1593
           1       1.00      0.36      0.53       246

   micro avg       0.91     

In [21]:
import keras
from keras.models import Sequential
from keras.layers import Dense
x_train.shape

Using TensorFlow backend.


(3733, 12)

In [22]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
dummy_y

encoder = LabelEncoder()
encoder.fit(y_test)
vencoded_Y = encoder.transform(y_test)
# convert integers to dummy variables (i.e. one hot encoded)
tdummy_y = np_utils.to_categorical(vencoded_Y)
tdummy_y


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [24]:
x_train.shape
#y_test

(3733, 12)

In [26]:
model = Sequential()
model.add(Dense(16, input_dim=12, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(2, activation='softmax'))
#model.add(Dense(1, activation='sigmoid'))

In [27]:

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 16)                208       
_________________________________________________________________
dense_2 (Dense)              (None, 12)                204       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 26        
Total params: 438
Trainable params: 438
Non-trainable params: 0
_________________________________________________________________


In [28]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [32]:
history = model.fit(x_train, dummy_y, epochs=1000, batch_size=64)
#history = model.fit(x_train, y_train, epochs=100, batch_size=64)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [33]:
y_pred = model.predict(x_test)
#Converting predictions to label
pred = list()
for i in range(len(y_pred)):
    pred.append(np.argmax(y_pred[i]))
pred

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,


In [34]:
from sklearn.metrics import accuracy_score
from sklearn import metrics
print("Accuracy score =", accuracy_score(y_test, pred))
print(metrics.classification_report(y_test, pred))

('Accuracy score =', 0.9630233822729745)
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1600
           1       0.91      0.79      0.85       239

   micro avg       0.96      0.96      0.96      1839
   macro avg       0.94      0.89      0.91      1839
weighted avg       0.96      0.96      0.96      1839



In [None]:
y_pred

array([[1.1416834e-05],
       [1.3397984e-06],
       [3.3702688e-07],
       ...,
       [2.6772568e-06],
       [1.1896073e-04],
       [1.2647472e-06]], dtype=float32)