# Author Attribution
Joshua Durana rcd180001

In [1]:
import pandas as pd

import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

import nltk
from nltk.corpus import stopwords

## Load Data

In [2]:
# Read the Federalist dataset from disk (one row per paper)
federalistCSV = pd.read_csv('Data/federalist.csv')

# Store the author labels as a pandas categorical column
federalistCSV['author'] = federalistCSV['author'].astype('category')

# Preview the first five rows
print(federalistCSV.head(5))

# Count how many papers carry each author label
authorCounts = federalistCSV.groupby(['author']).size()
print(authorCounts)

     author                                               text
0  HAMILTON  FEDERALIST. No. 1 General Introduction For the...
1       JAY  FEDERALIST No. 2 Concerning Dangers from Forei...
2       JAY  FEDERALIST No. 3 The Same Subject Continued (C...
3       JAY  FEDERALIST No. 4 The Same Subject Continued (C...
4       JAY  FEDERALIST No. 5 The Same Subject Continued (C...
author
HAMILTON                49
HAMILTON AND MADISON     3
HAMILTON OR MADISON     11
JAY                      5
MADISON                 15
dtype: int64


## Divide into Train and Test

In [3]:
# Pull the label column and the document column out of the frame
authors = federalistCSV['author']
fedText = federalistCSV['text']

# Hold out 20% of the papers for testing; the fixed seed keeps the split repeatable
splitParts = train_test_split(
    authors,
    fedText,
    train_size=.8,
    test_size=.2,
    random_state=1234,
)
authorsTrain, authorsTest, textTrain, textTest = splitParts

# Report how many documents landed in each partition
print("Train Dimensions:", textTrain.shape)
print("Test Dimensions:", textTest.shape)

Train Dimensions: (66,)
Test Dimensions: (17,)


## Process Text

In [12]:
#Create Vectorizer
# stopwords.words('english') already returns a list, which is one of the forms
# scikit-learn documents for stop_words ('english', a list, or None). Wrapping it
# in set() is rejected by the parameter validation in scikit-learn 1.2 and later.
vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'))

#Learn the vocabulary and IDF weights from the training text only,
#then apply that same fitted vectorizer to the test text
textTrainV = vectorizer.fit_transform(textTrain)
textTestV = vectorizer.transform(textTest)


#Shape
print("Train Shape: ", textTrainV.shape)
print("Test Shape: ", textTestV.shape)


Train Shape:  (66, 7876)
Test Shape:  (17, 7876)


## Multinomial Naive Bayes

In [13]:
#Create Model
fedNN = MultinomialNB()
fedNN.fit(textTrainV, authorsTrain)

#Test Model
fedNNPredict = fedNN.predict(textTestV)

#Metrics
# When a class is never predicted its precision is 0/0. zero_division = 0 reports
# that score as 0 explicitly instead of emitting an UndefinedMetricWarning each time.
print(classification_report(authorsTest, fedNNPredict, zero_division = 0))

#Confusion Matrix (rows are the true authors, columns are the predicted authors)
print(confusion_matrix(authorsTest, fedNNPredict))

                     precision    recall  f1-score   support

           HAMILTON       0.59      1.00      0.74        10
HAMILTON OR MADISON       0.00      0.00      0.00         3
                JAY       0.00      0.00      0.00         2
            MADISON       0.00      0.00      0.00         2

           accuracy                           0.59        17
          macro avg       0.15      0.25      0.19        17
       weighted avg       0.35      0.59      0.44        17

[[10  0  0  0]
 [ 3  0  0  0]
 [ 2  0  0  0]
 [ 2  0  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Edit Training and Test Vectors


In [40]:
#Edit Vectorizer
# Cap the vocabulary at 1000 features and use both unigrams and bigrams.
# stop_words is passed as the list returned by stopwords.words('english'):
# scikit-learn documents 'english', a list, or None for this parameter, and
# scikit-learn 1.2 and later reject a set during parameter validation.
vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'), max_features = 1000, ngram_range = (1,2))

#Vectorize Text (fit on the training text only, then transform the test text)
textTrainMF = vectorizer.fit_transform(textTrain)
textTestMF = vectorizer.transform(textTest)

#Shape
print("Train Shape: ", textTrainMF.shape)
print("Test Shape: ", textTestMF.shape)

Train Shape:  (66, 1000)
Test Shape:  (17, 1000)


## Naive Bayes

In [37]:
#Create Model
# Build a fresh classifier here rather than refitting the instance left over from
# an earlier cell, so this cell does not depend on that earlier model object.
fedNN = MultinomialNB()
fedNN.fit(textTrainMF, authorsTrain)

#Predict
fedNNPredict = fedNN.predict(textTestMF)

#Metrics
# When a class is never predicted its precision is 0/0. zero_division = 0 reports
# that score as 0 explicitly instead of emitting an UndefinedMetricWarning each time.
print(classification_report(authorsTest, fedNNPredict, zero_division = 0))

                     precision    recall  f1-score   support

           HAMILTON       0.59      1.00      0.74        10
HAMILTON OR MADISON       0.00      0.00      0.00         3
                JAY       0.00      0.00      0.00         2
            MADISON       0.00      0.00      0.00         2

           accuracy                           0.59        17
          macro avg       0.15      0.25      0.19        17
       weighted avg       0.35      0.59      0.44        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
