In [1]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
from nltk.corpus import gutenberg
import nltk
import warnings
warnings.filterwarnings("ignore")

# Fetch the Project Gutenberg corpus that `nltk.corpus.gutenberg` reads from.
nltk.download('gutenberg')
# Install spaCy's English model through the `en` shortcut (runs as a shell
# command from the notebook).
!python -m spacy download en

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


symbolic link created for C:\ProgramData\Anaconda3\lib\site-packages\spacy\data\en <<===>> C:\ProgramData\Anaconda3\lib\site-packages\en_core_web_sm
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
[+] Linking successful
C:\ProgramData\Anaconda3\lib\site-packages\en_core_web_sm -->
C:\ProgramData\Anaconda3\lib\site-packages\spacy\data\en
You can now load the model via spacy.load('en')


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

### 1) Converting words or sentences into numeric vectors is fundamental when working with text data. To make sure you are solid on how these vectors work, please generate the tf-idf vectors for the last three sentences of the example we gave at the beginning of this checkpoint.

In [3]:
# The three example sentences to vectorize, one string per sentence.
last_sentences = [
    "The Lumberjack Song is the funniest Monty Python bit: I can't think of it without laughing.",
    "I would rather put strawberries on my ice cream for dessert, they have the best taste.",
    "The taste of caramel is a fantastic accompaniment to tasty mint ice cream.",
]

In [4]:
# Load the English pipeline by its package name. The bare 'en' shortcut link
# only exists in spaCy 2.x (shortcut links were removed in spaCy 3), whereas
# 'en_core_web_sm' is the package that shortcut points to and loads on both.
nlp = spacy.load('en_core_web_sm')

# Parse the three sentences as one document so that spaCy's sentence
# segmenter can hand them back through `.sents`.
last_sentences_doc = nlp(' '.join(last_sentences))

In [5]:
# Wrap each detected sentence span in its own one-element row.
last_sents = [[span] for span in last_sentences_doc.sents]
sentences = pd.DataFrame(last_sents, columns=['text'])

# Replace every span with a space-joined string of its lemmas, leaving out
# punctuation tokens and stop words.
sentences['text'] = [
    ' '.join(tok.lemma_ for tok in span if not (tok.is_punct or tok.is_stop))
    for span in sentences['text']
]

In [6]:
# TF-IDF with smoothed idf and L2-normalised rows; default (unigram) features.
vectorizer = TfidfVectorizer(use_idf=True, norm='l2', smooth_idf=True)

# Learn the vocabulary from the lemmatized sentences and vectorize them.
X = vectorizer.fit_transform(sentences["text"])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per sentence, one column per vocabulary term.
tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)
tfidf_df.head()

Unnamed: 0,accompaniment,bit,caramel,cream,dessert,fantastic,funniest,good,ice,laugh,lumberjack,mint,monty,python,song,strawberry,taste,tasty,think
0,0.0,0.353553,0.0,0.0,0.0,0.0,0.353553,0.0,0.0,0.353553,0.353553,0.0,0.353553,0.353553,0.353553,0.0,0.0,0.0,0.353553
1,0.0,0.0,0.0,0.349498,0.459548,0.0,0.0,0.459548,0.349498,0.0,0.0,0.0,0.0,0.0,0.0,0.459548,0.349498,0.0,0.0
2,0.385323,0.0,0.385323,0.293048,0.0,0.385323,0.0,0.0,0.293048,0.0,0.0,0.385323,0.0,0.0,0.0,0.0,0.293048,0.385323,0.0


### 2) In the 2-grams example, we only used 2-grams as our features. This time, use both 1-grams and 2-grams together as your feature set. Run the same models in the example and compare the results.

In [7]:
def text_cleaner(text):
    """Remove markup noise from a raw text and normalise its whitespace.

    The steps, in order:
      1. replace every double dash with a single space;
      2. delete square-bracketed spans (only when both brackets sit on the
         same line, because no DOTALL flag is used);
      3. replace standalone numbers (integers and decimals) with a space;
      4. collapse every run of whitespace, newlines included, into one space
         and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, single-spaced string.
    """
    # visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--', ' ', text)
    # Raw string on purpose: written as a plain literal this pattern contains
    # invalid escape sequences, which raise a DeprecationWarning (SyntaxWarning
    # from Python 3.12). The compiled regex is unchanged.
    text = re.sub(r"[\[].*?[\]]", "", text)
    # Drop numbers together with the whitespace / optional minus sign in front.
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    return ' '.join(text.split())

In [8]:
# Pull both novels from the NLTK Gutenberg corpus as raw strings.
persuasion_raw = gutenberg.raw('austen-persuasion.txt')
alice_raw = gutenberg.raw('carroll-alice.txt')

# Strip the chapter headers with a regular expression, then run the result
# through the shared cleaning function.
persuasion = text_cleaner(re.sub(r'Chapter \d+', '', persuasion_raw))
alice = text_cleaner(re.sub(r'CHAPTER .*', '', alice_raw))

In [9]:
# Parse the cleaned novels.
# The model is loaded by package name: the bare 'en' shortcut link only exists
# in spaCy 2.x (shortcut links were removed in spaCy 3), while
# 'en_core_web_sm' is the package that shortcut points to and loads on both.
nlp = spacy.load('en_core_web_sm')
persuasion_doc = nlp(persuasion)
alice_doc = nlp(alice)

In [10]:
# Pair every sentence span with the name of the novel's author.
persuasion_sents = [[span, "Austen"] for span in persuasion_doc.sents]
alice_sents = [[span, "Carroll"] for span in alice_doc.sents]

# Stack both novels into a single two-column frame.
labelled_sents = persuasion_sents + alice_sents
sentences = pd.DataFrame(labelled_sents, columns=['text', 'author'])
sentences.head()

Unnamed: 0,text,author
0,"(Sir, Walter, Elliot, ,, of, Kellynch, Hall, ,...",Austen
1,"(This, was, the, page, at, which, the, favouri...",Austen
2,"("")",Austen
3,"(Walter, Elliot, ,, born, March, ,, ,, married...",Austen
4,"(Anne, ,, born, August, ,, ;, a, still, -, bor...",Austen


In [11]:
# Turn each sentence span into a space-joined string of its lemmas,
# leaving out punctuation tokens and stop words.
sentences['text'] = [
    ' '.join(tok.lemma_ for tok in span if not (tok.is_punct or tok.is_stop))
    for span in sentences['text']
]

In [12]:
# Unigrams and bigrams together as features; a term must occur in at least
# two sentences (min_df=2) to enter the vocabulary.
vectorizer = TfidfVectorizer(
    min_df=2, use_idf=True, norm='l2', smooth_idf=True, ngram_range=(1, 2))

# applying the vectorizer
X = vectorizer.fit_transform(sentences["text"])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Put the tf-idf columns next to the lemmatized text and the author label.
tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)
sentences = pd.concat([tfidf_df, sentences[["text", "author"]]], axis=1)

# A tf-idf score of 0 means the term does not occur in that sentence.
# With smooth_idf=True the idf is ln((1 + n) / (1 + df)) + 1, which is never
# below 1, so every term that is present gets a strictly positive weight.
sentences.head()

Unnamed: 0,abide,ability,able,able bear,able persuade,abominate,abroad,absence,absence home,absent,...,young people,young person,young sister,young woman,youth,youth say,zeal,zealous,text,author
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sir Walter Elliot Kellynch Hall Somersetshire ...,Austen
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,page favourite volume open ELLIOT KELLYNCH HALL,Austen
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Austen
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Walter Elliot bear March marry July Elizabeth ...,Austen
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Anne bear August bear son November Mary bear N...,Austen


In [17]:
# Target labels, and the dense feature matrix made of every column except the
# two non-feature ones. `columns=` is used because passing the axis
# positionally (`drop(labels, 1)`) was removed in pandas 2.0.
Y = sentences['author']
X = np.array(sentences.drop(columns=['text', 'author']))

# One seed for the split and for the stochastic models, so that the scores
# are reproducible from run to run.
SEED = 123

# We split the dataset into train and test sets
# NOTE(review): the tf-idf vocabulary and idf weights were fitted on all
# sentences before this split, so test-set statistics leak into the features;
# fitting the vectorizer on the training split only would be cleaner.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, random_state=SEED)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier(random_state=SEED)
gbc = GradientBoostingClassifier(random_state=SEED)

# Display name -> estimator, in the order they are fitted and reported.
models = [
    ("Logistic Regression", lr),
    ("Random Forest", rfc),
    ("Gradient Boosting", gbc),
]

for _, model in models:
    model.fit(X_train, y_train)

# Mean accuracy on the training and on the held-out sentences, per model.
for model_name, model in models:
    print("----------------------" + model_name + " Scores----------------------")
    print('Training set score:', model.score(X_train, y_train))
    print('\nTest set score:', model.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9028047764509859

Test set score: 0.8642232403165347
----------------------Random Forest Scores----------------------
Training set score: 0.973340738683699

Test set score: 0.8779675135360266
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8611496806442654

Test set score: 0.8429820907955019
