In [54]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize

In [2]:
nlp = spacy.load('en')

In [3]:
asv = pd.read_csv("t_asv.csv")

In [4]:
asv["Author"].unique()

array(['Matthew', 'Mark', 'Luke', 'John', 'Paul', 'Unknown', 'James',
       'Peter', 'Jude'], dtype=object)

In [5]:
asv_Matthew = asv[asv["Author"] == "Matthew"]
asv_Mark = asv[asv["Author"] == "Mark"]
asv_Luke = asv[asv["Author"] == "Luke"]
asv_John = asv[asv["Author"] == "John"]
asv_Paul = asv[asv["Author"] == "Paul"]
asv_Unknown = asv[asv["Author"] == "Unknown"]
asv_James = asv[asv["Author"] == "James"]
asv_Peter = asv[asv["Author"] == "Peter"]
asv_Jude = asv[asv["Author"] == "Jude"]
    

In [6]:
asv_Matthew_doc = nlp(''.join(asv_Matthew["Text"]))
asv_Matthew_sents = [[sent, "Matthew"] for sent in asv_Matthew_doc.sents]

asv_Mark_doc = nlp(''.join(asv_Mark["Text"]))
asv_Mark_sents = [[sent, "Mark"] for sent in asv_Mark_doc.sents]

asv_Luke_doc = nlp(''.join(asv_Luke["Text"]))
asv_Luke_sents = [[sent, "Luke"] for sent in asv_Luke_doc.sents]

asv_John_doc = nlp(''.join(asv_John["Text"]))
asv_John_sents = [[sent, "John"] for sent in asv_John_doc.sents]

asv_Paul_doc = nlp(''.join(asv_Paul["Text"]))
asv_Paul_sents = [[sent, "Paul"] for sent in asv_Paul_doc.sents]

asv_Unknown_doc = nlp(''.join(asv_Unknown["Text"]))
asv_Unknown_sents = [[sent, "Unknown"] for sent in asv_Unknown_doc.sents]

asv_James_doc = nlp(''.join(asv_James["Text"]))
asv_James_sents = [[sent, "James"] for sent in asv_James_doc.sents]

asv_Peter_doc = nlp(''.join(asv_Peter["Text"]))
asv_Peter_sents = [[sent, "Peter"] for sent in asv_Peter_doc.sents]

asv_Jude_doc = nlp(''.join(asv_Jude["Text"]))
asv_Jude_sents = [[sent, "Jude"] for sent in asv_Jude_doc.sents]

In [7]:
df2 = pd.DataFrame(asv_Matthew_sents + asv_Mark_sents + asv_Luke_sents + asv_John_sents + asv_Paul_sents + asv_Unknown_sents
                  + asv_James_sents + asv_Peter_sents + asv_Jude_sents)

In [8]:
df2.head()

Unnamed: 0,0,1
0,"(The, book, of, the, generation, of, Jesus, Ch...",Matthew
1,"(Abraham, begat, Isaac, ;, and, Isaac, begat, ...",Matthew
2,"(Jesse, begat)",Matthew
3,"(David, the, king, .)",Matthew
4,"(And, David, begat, Solomon, of, her, `, that,...",Matthew


# TF-IDF

In [84]:
from sklearn.model_selection import train_test_split

X = asv["Text"]
y = asv["Author"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [121]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import ensemble

# Naive Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB()),
])

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

#Logistic Regression
text_clf_lr = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression()),
])

#Logistic Regression
text_clf_rfc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', ensemble.RandomForestClassifier(n_estimators=500, max_depth=5)),
])


## Naive Bayes

In [88]:
#run Naive Bayes first

text_clf_nb.fit(X_train, y_train)

#run predictions
prednb = text_clf_nb.predict(X_test)


In [89]:
df3 = pd.DataFrame(metrics.confusion_matrix(y_test,prednb), 
      index=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'], 
     columns=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'])
df3

Unnamed: 0,James,John,Jude,Luke,Mark,Matthew,Paul,Peter,Unknown
James,0,1,0,5,0,0,17,0,0
John,0,146,0,153,0,0,71,0,0
Jude,0,0,0,2,0,0,4,0,0
Luke,0,9,0,472,0,2,56,0,0
Mark,0,10,0,131,0,0,16,0,0
Matthew,0,15,0,219,0,7,25,0,0
Paul,0,4,0,48,0,0,455,0,0
Peter,0,1,0,11,0,0,33,0,0
Unknown,0,6,0,36,0,0,35,0,0


In [90]:
#sns.heatmap(df3, cmap='Blues',annot=True, )

In [91]:
print(metrics.classification_report(y_test,prednb))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       James       0.00      0.00      0.00        23
        John       0.76      0.39      0.52       370
        Jude       0.00      0.00      0.00         6
        Luke       0.44      0.88      0.58       539
        Mark       0.00      0.00      0.00       157
     Matthew       0.78      0.03      0.05       266
        Paul       0.64      0.90      0.75       507
       Peter       0.00      0.00      0.00        45
     Unknown       0.00      0.00      0.00        77

   micro avg       0.54      0.54      0.54      1990
   macro avg       0.29      0.24      0.21      1990
weighted avg       0.53      0.54      0.45      1990



In [92]:
print(metrics.accuracy_score(y_test,prednb))

0.542713567839196


In [93]:
cross_val_score(text_clf_nb, X_test, y_test, cv=10)



array([0.48019802, 0.5       , 0.53465347, 0.53233831, 0.46268657,
       0.49      , 0.54040404, 0.4974359 , 0.52307692, 0.54639175])

## Linear SVC

In [94]:
#let's run linear SVC next
text_clf_lsvc.fit(X_train, y_train)
#predictions
predsvc = text_clf_lsvc.predict(X_test)

In [95]:
df4 = pd.DataFrame(metrics.confusion_matrix(y_test,predsvc), 
      index=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'], 
     columns=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'])
df4

Unnamed: 0,James,John,Jude,Luke,Mark,Matthew,Paul,Peter,Unknown
James,3,0,0,6,1,1,12,0,0
John,0,275,0,29,10,22,32,0,2
Jude,0,0,0,1,0,1,2,2,0
Luke,0,42,0,353,43,51,48,0,2
Mark,0,17,0,51,35,45,9,0,0
Matthew,0,28,0,73,53,90,20,1,1
Paul,0,22,0,34,5,12,429,1,4
Peter,0,5,0,6,0,1,26,5,2
Unknown,0,4,0,15,1,3,21,1,32


In [96]:
print(metrics.classification_report(y_test,predsvc))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       James       1.00      0.13      0.23        23
        John       0.70      0.74      0.72       370
        Jude       0.00      0.00      0.00         6
        Luke       0.62      0.65      0.64       539
        Mark       0.24      0.22      0.23       157
     Matthew       0.40      0.34      0.37       266
        Paul       0.72      0.85      0.78       507
       Peter       0.50      0.11      0.18        45
     Unknown       0.74      0.42      0.53        77

   micro avg       0.61      0.61      0.61      1990
   macro avg       0.55      0.38      0.41      1990
weighted avg       0.60      0.61      0.60      1990



In [97]:
print(metrics.accuracy_score(y_test,predsvc))

0.614070351758794


In [98]:
cross_val_score(text_clf_lsvc, X_test, y_test, cv=10)



array([0.53960396, 0.56930693, 0.58415842, 0.61691542, 0.55721393,
       0.585     , 0.59090909, 0.60512821, 0.58461538, 0.54639175])

## Logistic Regression

In [99]:
#run Naive Bayes first
text_clf_lr.fit(X_train, y_train)

#run predictions
predlr = text_clf_lr.predict(X_test)




In [100]:
df5 = pd.DataFrame(metrics.confusion_matrix(y_test,predlr), 
      index=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'], 
     columns=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'])
df5

Unnamed: 0,James,John,Jude,Luke,Mark,Matthew,Paul,Peter,Unknown
James,0,1,0,5,0,2,15,0,0
John,0,249,0,62,4,11,44,0,0
Jude,0,0,0,2,0,0,4,0,0
Luke,0,36,0,403,7,28,65,0,0
Mark,0,18,0,84,14,26,15,0,0
Matthew,0,38,0,104,14,74,36,0,0
Paul,0,15,0,41,2,6,443,0,0
Peter,0,1,0,8,0,1,35,0,0
Unknown,0,9,0,24,0,0,37,0,7


In [101]:
print(metrics.classification_report(y_test,predlr))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       James       0.00      0.00      0.00        23
        John       0.68      0.67      0.68       370
        Jude       0.00      0.00      0.00         6
        Luke       0.55      0.75      0.63       539
        Mark       0.34      0.09      0.14       157
     Matthew       0.50      0.28      0.36       266
        Paul       0.64      0.87      0.74       507
       Peter       0.00      0.00      0.00        45
     Unknown       1.00      0.09      0.17        77

   micro avg       0.60      0.60      0.60      1990
   macro avg       0.41      0.31      0.30      1990
weighted avg       0.57      0.60      0.55      1990



In [112]:
print(metrics.accuracy_score(y_test,predlr))

0.5979899497487438


In [103]:
cross_val_score(text_clf_lr, X_test, y_test, cv=10)



array([0.47524752, 0.54455446, 0.54950495, 0.53233831, 0.48756219,
       0.505     , 0.57575758, 0.54358974, 0.55897436, 0.54639175])

## Random Forest

In [122]:
text_clf_rfc.fit(X_train, y_train)

predrfc = text_clf_rfc.predict(X_test)

In [123]:
dfrfc = pd.DataFrame(metrics.confusion_matrix(y_test,predrfc), 
      index=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'], 
     columns=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'])
dfrfc

Unnamed: 0,James,John,Jude,Luke,Mark,Matthew,Paul,Peter,Unknown
James,0,0,0,5,0,0,18,0,0
John,0,0,0,273,0,0,97,0,0
Jude,0,0,0,2,0,0,4,0,0
Luke,0,0,0,450,0,0,89,0,0
Mark,0,0,0,137,0,0,20,0,0
Matthew,0,0,0,208,0,0,58,0,0
Paul,0,0,0,86,0,0,421,0,0
Peter,0,0,0,13,0,0,32,0,0
Unknown,0,0,0,36,0,0,41,0,0


In [124]:
print(metrics.classification_report(y_test,predrfc))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       James       0.00      0.00      0.00        23
        John       0.00      0.00      0.00       370
        Jude       0.00      0.00      0.00         6
        Luke       0.37      0.83      0.51       539
        Mark       0.00      0.00      0.00       157
     Matthew       0.00      0.00      0.00       266
        Paul       0.54      0.83      0.65       507
       Peter       0.00      0.00      0.00        45
     Unknown       0.00      0.00      0.00        77

   micro avg       0.44      0.44      0.44      1990
   macro avg       0.10      0.19      0.13      1990
weighted avg       0.24      0.44      0.31      1990



In [125]:
print(metrics.accuracy_score(y_test,predrfc))

0.4376884422110553


In [126]:
cross_val_score(text_clf_rfc, X_test, y_test, cv=10)



array([0.38118812, 0.42079208, 0.42574257, 0.4278607 , 0.39303483,
       0.41      , 0.45959596, 0.45128205, 0.44102564, 0.45876289])

# Bag Of Words

In [21]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df



In [30]:
ntdoc = nlp(''.join(asv["Text"]))

In [31]:
# Set up the bags.
ntwords = bag_of_words(ntdoc)

In [34]:
# Create our data frame with features.
word_counts = bow_features(df2, ntwords)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450
Processing row 500
Processing row 550
Processing row 600
Processing row 650
Processing row 700
Processing row 750
Processing row 800
Processing row 850
Processing row 900
Processing row 950
Processing row 1000
Processing row 1050
Processing row 1100
Processing row 1150
Processing row 1200
Processing row 1250
Processing row 1300
Processing row 1350
Processing row 1400
Processing row 1450
Processing row 1500
Processing row 1550
Processing row 1600
Processing row 1650
Processing row 1700
Processing row 1750
Processing row 1800
Processing row 1850
Processing row 1900
Processing row 1950
Processing row 2000
Processing row 2050
Processing row 2100
Processing row 2150
Processing row 2200
Processing row 2250
Processing row 2300
Processing row 2350
Processing row 2400
Processing row 2450
Processing row 2500
Pro

Unnamed: 0,-PRON-,and,unto,ye,say,`,shall,god,man,come,...,zebulun,naphtali,decapolis,bushel,divorcement,perform,black,smiteth,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, book, of, the, generation, of, Jesus, Ch...",Matthew
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Abraham, begat, Isaac, ;, and, Isaac, begat, ...",Matthew
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Jesse, begat)",Matthew
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(David, the, king, .)",Matthew
4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, David, begat, Solomon, of, her, `, that,...",Matthew


In [127]:
Y = word_counts['text_source']
X = normalize(np.array(word_counts.drop(['text_sentence','text_source'], 1)))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.25
                                                    )


#print('Training set score:', rfc.score(X_train, y_train))
#print('\nTest set score:', rfc.score(X_test, y_test))

## Naive Bayes

In [56]:
#run Naive Bayes first
nb = MultinomialNB()

nb.fit(X_train, y_train)

#run predictions
prednb = nb.predict(X_test)

In [57]:
dfbownb = pd.DataFrame(metrics.confusion_matrix(y_test,prednb), 
      index=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'], 
     columns=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'])
dfbownb

Unnamed: 0,James,John,Jude,Luke,Mark,Matthew,Paul,Peter,Unknown
James,0,0,0,6,0,0,20,0,0
John,0,215,0,99,1,0,50,0,0
Jude,0,1,0,1,0,0,5,0,0
Luke,0,22,0,430,0,5,27,0,0
Mark,0,23,0,131,1,3,15,0,0
Matthew,0,44,0,190,3,17,27,0,0
Paul,0,17,0,55,0,1,325,0,0
Peter,0,3,0,5,0,0,26,0,0
Unknown,0,11,0,17,0,1,37,0,0


In [58]:
print(metrics.classification_report(y_test,prednb))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       James       0.00      0.00      0.00        26
        John       0.64      0.59      0.61       365
        Jude       0.00      0.00      0.00         7
        Luke       0.46      0.89      0.61       484
        Mark       0.20      0.01      0.01       173
     Matthew       0.63      0.06      0.11       281
        Paul       0.61      0.82      0.70       398
       Peter       0.00      0.00      0.00        34
     Unknown       0.00      0.00      0.00        66

   micro avg       0.54      0.54      0.54      1834
   macro avg       0.28      0.26      0.23      1834
weighted avg       0.50      0.54      0.45      1834



In [59]:
print(metrics.accuracy_score(y_test,prednb))

0.5387131952017448


In [60]:
cross_val_score(nb, X_test, y_test, cv=10)



array([0.48404255, 0.55614973, 0.47593583, 0.47311828, 0.50543478,
       0.4863388 , 0.48066298, 0.48333333, 0.46927374, 0.4972067 ])

## Linear SVC

In [61]:
#run Naive Bayes first
svc = LinearSVC()

svc.fit(X_train, y_train)

#run predictions
predsvc = svc.predict(X_test)

In [62]:
dfbowsvc = pd.DataFrame(metrics.confusion_matrix(y_test,predsvc), 
      index=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'], 
     columns=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'])
dfbowsvc

Unnamed: 0,James,John,Jude,Luke,Mark,Matthew,Paul,Peter,Unknown
James,1,4,0,5,0,1,15,0,0
John,0,267,0,41,7,10,38,1,1
Jude,0,1,0,1,0,0,3,2,0
Luke,2,42,0,334,18,59,28,0,1
Mark,0,24,0,64,31,36,18,0,0
Matthew,0,42,0,82,41,89,26,0,1
Paul,0,22,0,32,3,5,333,1,2
Peter,1,3,0,3,1,2,23,1,0
Unknown,0,13,0,10,1,3,22,0,17


In [63]:
print(metrics.classification_report(y_test,predsvc))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       James       0.25      0.04      0.07        26
        John       0.64      0.73      0.68       365
        Jude       0.00      0.00      0.00         7
        Luke       0.58      0.69      0.63       484
        Mark       0.30      0.18      0.23       173
     Matthew       0.43      0.32      0.37       281
        Paul       0.66      0.84      0.74       398
       Peter       0.20      0.03      0.05        34
     Unknown       0.77      0.26      0.39        66

   micro avg       0.59      0.59      0.59      1834
   macro avg       0.43      0.34      0.35      1834
weighted avg       0.55      0.59      0.56      1834



In [64]:
print(metrics.accuracy_score(y_test,predsvc))

0.5850599781897492


In [65]:
cross_val_score(svc, X_test, y_test, cv=10)



array([0.55319149, 0.52941176, 0.49197861, 0.51612903, 0.54347826,
       0.53005464, 0.54696133, 0.55555556, 0.51396648, 0.55865922])

## Logistic Regression

In [66]:
#run Naive Bayes first
lr = LogisticRegression()

lr.fit(X_train, y_train)

#run predictions
predlr = lr.predict(X_test)



In [67]:
dfbowlr = pd.DataFrame(metrics.confusion_matrix(y_test,predlr), 
      index=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'], 
     columns=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'])
dfbowlr

Unnamed: 0,James,John,Jude,Luke,Mark,Matthew,Paul,Peter,Unknown
James,0,3,0,4,0,1,18,0,0
John,0,239,0,60,3,7,56,0,0
Jude,0,1,0,2,0,0,4,0,0
Luke,0,31,0,368,5,36,44,0,0
Mark,0,25,0,84,19,24,21,0,0
Matthew,0,46,0,124,15,63,32,0,1
Paul,0,26,0,34,0,2,336,0,0
Peter,0,3,0,6,0,1,24,0,0
Unknown,0,12,0,13,0,2,33,0,6


In [68]:
print(metrics.classification_report(y_test,predlr))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       James       0.00      0.00      0.00        26
        John       0.62      0.65      0.64       365
        Jude       0.00      0.00      0.00         7
        Luke       0.53      0.76      0.62       484
        Mark       0.45      0.11      0.18       173
     Matthew       0.46      0.22      0.30       281
        Paul       0.59      0.84      0.70       398
       Peter       0.00      0.00      0.00        34
     Unknown       0.86      0.09      0.16        66

   micro avg       0.56      0.56      0.56      1834
   macro avg       0.39      0.30      0.29      1834
weighted avg       0.54      0.56      0.51      1834



In [69]:
print(metrics.accuracy_score(y_test,predlr))

0.5621592148309705


In [70]:
cross_val_score(lr, X_test, y_test, cv=10)



array([0.5212766 , 0.55614973, 0.53475936, 0.51612903, 0.55434783,
       0.46448087, 0.53038674, 0.53333333, 0.5027933 , 0.53072626])

## Random Forest

In [133]:
rfc = ensemble.RandomForestClassifier(n_estimators=500, max_depth=6)

rfc.fit(X_train, y_train)

predrfc2 = rfc.predict(X_test)

In [134]:
dfbowrfc = pd.DataFrame(metrics.confusion_matrix(y_test,predrfc2), 
      index=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'], 
     columns=['James','John','Jude',
            'Luke','Mark','Matthew','Paul',
            'Peter','Unknown'])
dfbowrfc

Unnamed: 0,James,John,Jude,Luke,Mark,Matthew,Paul,Peter,Unknown
James,0,0,0,14,0,0,20,0,0
John,0,25,0,312,0,0,41,0,0
Jude,0,0,0,6,0,0,0,0,0
Luke,0,1,0,465,0,0,28,0,0
Mark,0,0,0,159,0,0,5,0,0
Matthew,0,0,0,235,0,0,17,0,0
Paul,0,2,0,187,0,0,224,0,0
Peter,0,0,0,15,0,0,15,0,0
Unknown,0,0,0,41,0,0,22,0,0


In [135]:
print(metrics.classification_report(y_test,predrfc2))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       James       0.00      0.00      0.00        34
        John       0.89      0.07      0.12       378
        Jude       0.00      0.00      0.00         6
        Luke       0.32      0.94      0.48       494
        Mark       0.00      0.00      0.00       164
     Matthew       0.00      0.00      0.00       252
        Paul       0.60      0.54      0.57       413
       Peter       0.00      0.00      0.00        30
     Unknown       0.00      0.00      0.00        63

   micro avg       0.39      0.39      0.39      1834
   macro avg       0.20      0.17      0.13      1834
weighted avg       0.41      0.39      0.28      1834



In [136]:
print(metrics.accuracy_score(y_test,predrfc2))

0.3893129770992366


In [141]:
cross_val_score(rfc, X_test, y_test, cv=10)



array([0.40957447, 0.36170213, 0.40641711, 0.37297297, 0.43956044,
       0.39010989, 0.44198895, 0.42541436, 0.41666667, 0.41666667])