In [68]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [2]:
# Import the data we just downloaded and installed.
from nltk.corpus import gutenberg, stopwords

# Grab and process the raw data.
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [22]:
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')
# Print the first 100 characters of shakespeare-macbeth.
print('\nRaw:\n', macbeth[0:100])


Raw:
 [The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lig


### Data cleaning / Processing

In [23]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
macbeth = text_cleaner(macbeth)
persuasion = text_cleaner(persuasion)

In [24]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
macbeth_doc = nlp(macbeth)
persuasion_doc = nlp(persuasion)

In [25]:
# Group into sentences.
macbeth_sents = [[sent, "shakespeare"] for sent in macbeth_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(macbeth_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Actus, Primus, .)",shakespeare
1,"(Scoena, Prima, .)",shakespeare
2,"(Thunder, and, Lightning, .)",shakespeare
3,"(Enter, three, Witches, .)",shakespeare
4,"(1, .)",shakespeare


In [26]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [63]:
macbeth_corpus = []
macbeth_corpus = sentences[0]
macbeth_corpus = np.array(macbeth_corpus)

macbeth_new_corpus = []
for item in macbeth_corpus:
    macbeth_new_corpus.append(str(item))
    
macbeth_norm_corpus = []

for doc in macbeth_new_corpus:
    macbeth_norm_corpus.append(normalize_corpus(doc))
    
macbeth_norm_new_corpus = []
for item in macbeth_norm_corpus:
    macbeth_norm_new_corpus.append(item.tolist())

## Features creation using NLP methods
### 1. BoW (Bag of Words)

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

cv =  CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(macbeth_norm_new_corpus)
cv_matrix = cv_matrix.toarray()
vocab = cv.get_feature_names()
# show document feature vectors
macbeth_cv_matrix = pd.DataFrame(cv_matrix, columns=vocab)

In [29]:
macbeth_cv_matrix
macbeth_cv_matrix['text_sentence'] = sentences[0]
macbeth_cv_matrix['text_source'] = sentences[1]

In [None]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

Y = macbeth_cv_matrix['text_source']
X = np.array(macbeth_cv_matrix.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

### BoW with RandomForest

In [30]:
rfc = ensemble.RandomForestClassifier()
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9907352062163778

Test set score: 0.9094982078853047


In [31]:
cross_val_score(rfc, X_train, y_train, cv=5)

array([0.90597015, 0.89387145, 0.89088191, 0.89536622, 0.89835575])

### BoW with Logistic Regression

In [32]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3346, 8054) (3346,)
Training set score: 0.9856545128511656

Test set score: 0.9413082437275986


In [33]:
cross_val_score(lr, X_train, y_train, cv=5)

array([0.94029851, 0.94020927, 0.94170404, 0.92376682, 0.92376682])

### BoW with GradientBoosting

In [34]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.8550508069336521

Test set score: 0.842741935483871


In [35]:
cross_val_score(clf, X_train, y_train, cv=5)

array([0.81044776, 0.83408072, 0.84304933, 0.82511211, 0.82959641])

### BoW with LinearSVC

In [36]:
model = LinearSVC()
train = model.fit(X_train, y_train)

print('Training set score:', model.score(X_train, y_train))
print('\nTest set score:', model.score(X_test, y_test))

Training set score: 0.9925283921099821

Test set score: 0.9341397849462365


In [37]:
cross_val_score(model, X_train, y_train, cv=5)

array([0.92537313, 0.93124066, 0.93273543, 0.92077728, 0.91330344])

### 2. tfidf

In [41]:
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
macbeth_norm_new_corpus_tfidf=vectorizer.fit_transform(macbeth_norm_new_corpus)
print("Number of features: %d" % macbeth_norm_new_corpus_tfidf.get_shape()[1])


Number of features: 3872


In [51]:
tfid_matrix = macbeth_norm_new_corpus_tfidf.toarray()
tfid_vocab = vectorizer.get_feature_names()
# show document feature vectors
macbeth_tfid_matrix = pd.DataFrame(tfid_matrix, columns=tfid_vocab)

In [54]:
macbeth_tfid_matrix['text_sentence'] = sentences[0]
macbeth_tfid_matrix['text_source'] = sentences[1]

In [75]:
Y = macbeth_tfid_matrix['text_source']
X = np.array(macbeth_tfid_matrix.drop(['text_sentence','text_source'], 1))

X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

### tfidf with RandomForest

In [55]:
rfc = ensemble.RandomForestClassifier()
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train_tfidf, y_train_tfidf))
print('\nTest set score:', rfc.score(X_test_tfidf, y_test_tfidf))

Training set score: 0.982068141063957

Test set score: 0.8987455197132617


In [56]:
cross_val_score(rfc, X_train_tfidf, y_train_tfidf, cv=5)

array([0.90149254, 0.877429  , 0.89536622, 0.88938714, 0.86995516])

### tfidf with Logistic Regression

In [57]:
lr = LogisticRegression()
train = lr.fit(X_train_tfidf, y_train_tfidf)
print(X_train_tfidf.shape, y_train_tfidf.shape)
print('Training set score:', lr.score(X_train_tfidf, y_train_tfidf))
print('\nTest set score:', lr.score(X_test_tfidf, y_test_tfidf))

(3346, 3872) (3346,)
Training set score: 0.9497907949790795

Test set score: 0.9081541218637993


In [58]:
cross_val_score(lr, X_train_tfidf, y_train_tfidf, cv=5)

array([0.9       , 0.89088191, 0.89686099, 0.86248132, 0.89835575])

### tfidf with GradientBoosting

In [59]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train_tfidf, y_train_tfidf))
print('\nTest set score:', clf.score(X_test_tfidf, y_test_tfidf))

Training set score: 0.8475791990436342

Test set score: 0.8445340501792115


In [60]:
cross_val_score(clf, X_train_tfidf, y_train_tfidf, cv=5)

array([0.81940299, 0.82959641, 0.82810164, 0.8161435 , 0.8161435 ])

### tfidf with LinearSVC

In [61]:
model = LinearSVC()
train = model.fit(X_train_tfidf, y_train_tfidf)

print('Training set score:', model.score(X_train_tfidf, y_train_tfidf))
print('\nTest set score:', model.score(X_test_tfidf, y_test_tfidf))

Training set score: 0.9748953974895398

Test set score: 0.9332437275985663


In [62]:
cross_val_score(model, X_train_tfidf, y_train_tfidf, cv=5)

array([0.91791045, 0.92376682, 0.92077728, 0.90284006, 0.92227205])

so we have observed that BoW with Logistic Regression performed better compared to others while tfid with LinearSVC performed better compared to other models

### Model Tunning

Lets pick BoW with Logistic Regression as it gives best performance compared to other models after assessing cross validation score. Lets do some tunning to check if we can get some improved accuracy 

In [69]:
# A parameter grid for LogisticRegression
params = {'C':[1,10], 'penalty':['l1','l2']}
lr = LogisticRegression()

# Initialize LogisticRegression and GridSearch
grid = GridSearchCV(lr, params, cv=3)
grid.fit(X_train_tfidf, y_train_tfidf)
best_lr = grid.best_estimator_
score = accuracy_score(y_true=y_test_tfidf, y_pred=best_lr.predict(X_test_tfidf))
print('Accuracy on testset:\t{:.4f}\n'.format(score))

Accuracy on testset:	0.9346



In [70]:
grid.best_params_

{'C': 10, 'penalty': 'l2'}

In [71]:
# cross_val_score
cross_val_score(best_lr, X_train_tfidf, y_train_tfidf, cv=5)

array([0.92537313, 0.92526158, 0.92227205, 0.90433483, 0.92526158])

### LSA

In [81]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
Data_lsa= lsa.fit_transform(X)
# Split training testing data
X_train_lsa, X_test_lsa, y_train_lsa, y_test_lsa = train_test_split(Data_lsa, Y, random_state=3)

In [83]:
lr = LogisticRegression()
train = lr.fit(X_train_lsa, y_train_lsa)
print(X_train_lsa.shape, y_train_lsa.shape)
print('Training set score:', lr.score(X_train_lsa, y_train_lsa))
print('\nTest set score:', lr.score(X_test_lsa, y_test_lsa))

(4183, 130) (4183,)
Training set score: 0.9335405211570643

Test set score: 0.9247311827956989


In [85]:
cross_val_score(lr, X_train_lsa, y_train_lsa, cv=5)

array([0.91288783, 0.93548387, 0.9222488 , 0.91985646, 0.93301435])

- As we have analyse the data above Bow with Logistic Regression worked very well and gave much better performance compared to applying LSA on the data.
- As this shows that we are loosing information after performing LSA 
- Tfidf also worked well and LinearSVC is giving best permormance compared to other model with Tfidf data
- So Logistic Regression is the better performing model with good cross validation scores