## Author Identification Algorithm: Decision Trees vs Random Forest
### BoW Approach: Word and Character based Bag Of Words

Notebook Programmers: <br>
Rommel Urbano Jr. <br>
Joshua Paulino <br>
Jeffrey Ajero <br>

In [23]:
import base64
import numpy as np
import pandas as pd

# Plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import re

# Other imports
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from matplotlib import pyplot as plt
%matplotlib inline
%matplotlib notebook

In [24]:
train=pd.read_csv('C:/Users/Asus/Desktop/train.csv')
train=train.iloc[:,[1,2]]

for i in range(0,len(train['text'])):
    train['text'][i]=re.sub('([^A-Za-z\'\ ])','',train['text'][i])

In [25]:
print(train.shape)

(19579, 2)


In [26]:
train.head()

Unnamed: 0,text,author
0,This process however afforded me no means of a...,EAP
1,It never once occurred to me that the fumbling...,HPL
2,In his left hand was a gold snuff box from whi...,EAP
3,How lovely is spring As we looked from Windsor...,MWS
4,Finding nothing else not even gold the Superin...,HPL


In [27]:
z = {'EAP': 'Edgar Allen Poe', 'MWS': 'Mary Shelley', 'HPL': 'HP Lovecraft'}
data = [go.Bar(
            x = train.author.map(z).unique(),
            y = train.author.value_counts().values,
            marker= dict(colorscale='Jet',
                         color = train.author.value_counts().values
                        ),
            text='Text entries attributed to Author'
    )]

layout = go.Layout(
    title='Target variable distribution'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

In [28]:
all_words = train['text'].str.split(expand=True).unstack().value_counts()
data = [go.Bar(
            x = all_words.index.values[2:50],
            y = all_words.values[2:50],
            marker= dict(colorscale='Jet',
                         color = all_words.values[2:100]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 (Uncleaned) Word frequencies in the training dataset'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

In [29]:
def lowercase(value):
    value = str(value)
    return value.lower()

In [30]:
def classification_accuracy(X,y):
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    from sklearn.metrics import log_loss
    results = confusion_matrix(X, y)
    print ('Confusion Matrix :')
    print(results)
    print ('Accuracy Score is',accuracy_score(X, y))
    print ('Classification Report : ')
    print (classification_report(X, y))

In [31]:
def convert_label(v):
    if v == 'EAP':
        return 0
    elif v == 'HPL':
        return 1
    elif v == 'MWS':
        return 2

In [32]:
train['author'] = train.apply(lambda x: convert_label(x['author']), axis=1)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# train['text'] = train.apply(lambda x: lowercase(x), axis=1)

count_vect = CountVectorizer(min_df=3,ngram_range=(1,3), stop_words='english')
features = count_vect.fit_transform(train.text).toarray()
labels = train.author
features.shape

X_train_ngram, X_test_ngram, y_train, y_test = train_test_split(features, labels, random_state = 0)

mnb_ngram = MultinomialNB().fit(X_train_ngram, y_train)
y_pred=mnb_ngram.predict(X_test_ngram)

print(y_train.shape)
print(X_train_ngram.shape)
print(y_test.shape)
print(X_test_ngram.shape)

print(mnb_ngram.score(X_train_ngram,y_train))
print(mnb_ngram.score(X_test_ngram,y_test))

classification_accuracy(y_test,y_pred)

(14684,)
(14684, 14762)
(4895,)
(4895, 14762)
0.9014573685644238
0.8224719101123595
Confusion Matrix :
[[1599  148  252]
 [ 159 1113  116]
 [ 115   79 1314]]
Accuracy Score is 0.8224719101123595
Classification Report : 
              precision    recall  f1-score   support

           0       0.85      0.80      0.83      1999
           1       0.83      0.80      0.82      1388
           2       0.78      0.87      0.82      1508

    accuracy                           0.82      4895
   macro avg       0.82      0.82      0.82      4895
weighted avg       0.82      0.82      0.82      4895



In [34]:
count_vect = CountVectorizer(stop_words='english')
train_counts = count_vect.fit_transform(train['text'])

X_train, X_test, y_train, y_test = train_test_split(train_counts, train['author'], random_state = 0)

mnb= MultinomialNB().fit(X_train, y_train)
y_pred=mnb.predict(X_test)

print(y_train.shape)
print(X_train.shape)
print(y_test.shape)
print(X_test.shape)

print(mnb.score(X_train,y_train))
print(mnb.score(X_test,y_test))

classification_accuracy(y_test,y_pred)

(14684,)
(14684, 24773)
(4895,)
(4895, 24773)
0.9160310542086625
0.827170582226762
Confusion Matrix :
[[1622  136  241]
 [ 166 1101  121]
 [ 117   65 1326]]
Accuracy Score is 0.827170582226762
Classification Report : 
              precision    recall  f1-score   support

           0       0.85      0.81      0.83      1999
           1       0.85      0.79      0.82      1388
           2       0.79      0.88      0.83      1508

    accuracy                           0.83      4895
   macro avg       0.83      0.83      0.83      4895
weighted avg       0.83      0.83      0.83      4895



In [35]:
from sklearn.ensemble import RandomForestClassifier
rand_forest = RandomForestClassifier(max_depth=10)
rand_forest.fit(X_train, y_train)
print(rand_forest.score(X_train,y_train))
print(rand_forest.score(X_test,y_test))
y_pred=rand_forest.predict(train_counts)
classification_accuracy(train['author'],y_pred)

print('With N-Grams')
rand_forest.fit(X_train_ngram, y_train)
print(rand_forest.score(X_train_ngram,y_train))
print(rand_forest.score(X_test_ngram,y_test))
y_pred=rand_forest.predict(features)
classification_accuracy(labels,y_pred)

0.4513075456278943
0.4465781409601634
Confusion Matrix :
[[7891    1    8]
 [5383  247    5]
 [5369    0  675]]
Accuracy Score is 0.4501251340722202
Classification Report : 
              precision    recall  f1-score   support

           0       0.42      1.00      0.59      7900
           1       1.00      0.04      0.08      5635
           2       0.98      0.11      0.20      6044

    accuracy                           0.45     19579
   macro avg       0.80      0.38      0.29     19579
weighted avg       0.76      0.45      0.33     19579

With N-Grams
0.4844728956687551
0.4764044943820225
Confusion Matrix :
[[7856    5   39]
 [5158  462   15]
 [4915    1 1128]]
Accuracy Score is 0.48245569232340774
Classification Report : 
              precision    recall  f1-score   support

           0       0.44      0.99      0.61      7900
           1       0.99      0.08      0.15      5635
           2       0.95      0.19      0.31      6044

    accuracy                           

In [36]:
from sklearn.tree import DecisionTreeClassifier
tree=DecisionTreeClassifier(max_depth=6)
tree.fit(X_train,y_train)
print(tree.score(X_train,y_train))
print(tree.score(X_test,y_test))
y_pred=tree.predict(X_test)
classification_accuracy(y_test,y_pred)
print('With N-Grams')
tree.fit(X_train_ngram, y_train)
print(tree.score(X_train_ngram,y_train))
print(tree.score(X_test_ngram,y_test))

y_pred=tree.predict(X_test_ngram)
classification_accuracy(y_test,y_pred)

0.46199945518932173
0.46189989785495406
Confusion Matrix :
[[1954   24   21]
 [1292   86   10]
 [1265   22  221]]
Accuracy Score is 0.46189989785495406
Classification Report : 
              precision    recall  f1-score   support

           0       0.43      0.98      0.60      1999
           1       0.65      0.06      0.11      1388
           2       0.88      0.15      0.25      1508

    accuracy                           0.46      4895
   macro avg       0.65      0.40      0.32      4895
weighted avg       0.63      0.46      0.35      4895

With N-Grams
0.46199945518932173
0.46251276813074566
Confusion Matrix :
[[1953   23   23]
 [1288   90   10]
 [1264   23  221]]
Accuracy Score is 0.46251276813074566
Classification Report : 
              precision    recall  f1-score   support

           0       0.43      0.98      0.60      1999
           1       0.66      0.06      0.12      1388
           2       0.87      0.15      0.25      1508

    accuracy                      

In [51]:
from sklearn.svm import LinearSVC
lin_svm = LinearSVC(C=0.1)
lin_svm.fit(X_train,y_train)
print(lin_svm.score(X_train,y_train))
print(lin_svm.score(X_test,y_test))
y_pred=lin_svm.predict(X_test)
classification_accuracy(y_test,y_pred)

print('With N-Grams')
lin_svm.fit(X_train_ngram, y_train)
print(lin_svm.score(X_train_ngram,y_train))
print(lin_svm.score(X_test_ngram,y_test))

y_pred=lin_svm.predict(X_test_ngram)
classification_accuracy(y_test,y_pred)

0.9669708526287115
0.802247191011236
Confusion Matrix :
[[1729   98  172]
 [ 267 1027   94]
 [ 252   85 1171]]
Accuracy Score is 0.802247191011236
Classification Report : 
              precision    recall  f1-score   support

           0       0.77      0.86      0.81      1999
           1       0.85      0.74      0.79      1388
           2       0.81      0.78      0.80      1508

    accuracy                           0.80      4895
   macro avg       0.81      0.79      0.80      4895
weighted avg       0.81      0.80      0.80      4895

With N-Grams
0.9644511032416235
0.8040858018386108
Confusion Matrix :
[[1719  101  179]
 [ 262 1033   93]
 [ 241   83 1184]]
Accuracy Score is 0.8040858018386108
Classification Report : 
              precision    recall  f1-score   support

           0       0.77      0.86      0.81      1999
           1       0.85      0.74      0.79      1388
           2       0.81      0.79      0.80      1508

    accuracy                           0.8

In [38]:
# from sklearn.svm import SVC
# svclassifier = SVC(kernel='poly', degree=3)
# svclassifier.fit(X_train, y_train)
# print(svclassifier.score(X_train,y_train))
# print(svclassifier.score(X_test,y_test))
# print('With N-Grams')
# model.fit(X_train_ngram, y_train)
# print(svclassifier.score(X_train_ngram,y_train))
# print(svclassifier.score(X_test_ngram,y_test))

In [39]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=215)
neigh.fit(X_train, y_train)
print(neigh.score(X_train,y_train))
print(neigh.score(X_test,y_test))
y_pred=neigh.predict(X_test)
classification_accuracy(y_test,y_pred)

# print('With N-Grams')
# neigh.fit(X_train_ngram, y_train)
# print(neigh.score(X_train_ngram,y_train))
# print(neigh.score(X_test_ngram,y_test))
# y_pred=neigh.predict(X_test_ngram)
# classification_accuracy(y_test,y_pred)

0.4024788885862163
0.4083758937691522
Confusion Matrix :
[[1998    0    1]
 [1388    0    0]
 [1507    0    1]]
Accuracy Score is 0.4083758937691522
Classification Report : 
              precision    recall  f1-score   support

           0       0.41      1.00      0.58      1999
           1       0.00      0.00      0.00      1388
           2       0.50      0.00      0.00      1508

    accuracy                           0.41      4895
   macro avg       0.30      0.33      0.19      4895
weighted avg       0.32      0.41      0.24      4895




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [40]:
from sklearn.ensemble import StackingClassifier
estimators = [
     ('lin_svc', LinearSVC()),
     ('mnb', MultinomialNB())
]
stack = StackingClassifier(
     estimators=estimators, final_estimator=LinearSVC()
)

stack.fit(X_train,y_train)
print(stack.score(X_train,y_train))
print(stack.score(X_test,y_test))
y_pred=stack.predict(X_test)
classification_accuracy(y_test,y_pred)

print('With N-Gram')
stack.fit(X_train_ngram,y_train)
print(stack.score(X_train_ngram,y_train))
print(stack.score(X_test_ngram,y_test))
y_pred=stack.predict(X_test_ngram)
classification_accuracy(y_test,y_pred)



Liblinear failed to converge, increase the number of iterations.



0.9266548624353037
0.8343207354443309
Confusion Matrix :
[[1669  133  197]
 [ 181 1106  101]
 [ 134   65 1309]]
Accuracy Score is 0.8343207354443309
Classification Report : 
              precision    recall  f1-score   support

           0       0.84      0.83      0.84      1999
           1       0.85      0.80      0.82      1388
           2       0.81      0.87      0.84      1508

    accuracy                           0.83      4895
   macro avg       0.83      0.83      0.83      4895
weighted avg       0.83      0.83      0.83      4895

With N-Gram



Liblinear failed to converge, increase the number of iterations.



0.9136475074911469
0.8318692543411644
Confusion Matrix :
[[1657  137  205]
 [ 167 1117  104]
 [ 138   72 1298]]
Accuracy Score is 0.8318692543411644
Classification Report : 
              precision    recall  f1-score   support

           0       0.84      0.83      0.84      1999
           1       0.84      0.80      0.82      1388
           2       0.81      0.86      0.83      1508

    accuracy                           0.83      4895
   macro avg       0.83      0.83      0.83      4895
weighted avg       0.83      0.83      0.83      4895

