In [1]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
import nltk
from nltk.corpus import gutenberg
import gensim
import warnings
warnings.filterwarnings("ignore")

# Fetch the NLTK Gutenberg corpus; the novels are read from it later via gutenberg.raw().
nltk.download('gutenberg')
# Install spaCy's English model under the 'en' shortcut (shell command run through IPython's `!`),
# which is the name later passed to spacy.load().
!python -m spacy download en

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


symbolic link created for C:\ProgramData\Anaconda3\lib\site-packages\spacy\data\en <<===>> C:\ProgramData\Anaconda3\lib\site-packages\en_core_web_sm
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
[+] Linking successful
C:\ProgramData\Anaconda3\lib\site-packages\en_core_web_sm -->
C:\ProgramData\Anaconda3\lib\site-packages\spacy\data\en
You can now load the model via spacy.load('en')


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [2]:
# utility function for standard text cleaning
def text_cleaner(text):
    """Strip dashes, bracketed notes and standalone numbers, then normalise whitespace.

    The steps run in this order:

    1. Every double dash ``--`` becomes a single space (spaCy does not treat
       it as punctuation, so it is removed up front).
    2. Bracketed spans such as ``[Illustration]`` are deleted. The match is
       non-greedy and, because ``.`` does not match newlines, a bracket pair
       must open and close on the same line to be removed.
    3. Standalone numbers (integers or decimals, optionally preceded by a
       minus sign) are replaced with a space. Digits glued to letters, as in
       ``x1``, are left alone.
    4. All runs of whitespace, including newlines, collapse to one space and
       leading/trailing whitespace is dropped.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text as a single-spaced string.
    """
    # visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--', ' ', text)
    # Raw string: the pattern text is unchanged, but '\[' and '\]' are no longer
    # invalid escape sequences in a plain string literal.
    text = re.sub(r"[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    # split() with no arguments splits on any whitespace run and drops empties.
    text = ' '.join(text.split())
    return text

In [3]:
# Read both novels from the Gutenberg corpus, strip their chapter headings
# (each book marks chapters in its own way), then run the shared cleaner.
alice = gutenberg.raw('carroll-alice.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')

alice = text_cleaner(re.sub(r'CHAPTER .*', '', alice))
persuasion = text_cleaner(re.sub(r'Chapter \d+', '', persuasion))

In [4]:
# Run the spaCy pipeline over each cleaned novel. This can take a while.
nlp = spacy.load('en')
alice_doc, persuasion_doc = (nlp(novel) for novel in (alice, persuasion))

In [5]:
def _label_sentences(doc, author):
    """Pair every sentence span of a parsed document with its author label."""
    return [[span, author] for span in doc.sents]


# One [sentence, author] record per sentence, per novel.
alice_sents = _label_sentences(alice_doc, "Carroll")
persuasion_sents = _label_sentences(persuasion_doc, "Austen")

# Stack Carroll's rows on top of Austen's in a single data frame.
sentences = pd.DataFrame(
    alice_sents + persuasion_sents,
    columns=["text", "author"],
)
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


In [6]:
# get rid of stop words and punctuation
# and lemmatize the tokens
def _content_lemmas(sentence):
    """Return the lemmas of the tokens in `sentence` that are neither punctuation nor stop words."""
    return [token.lemma_ for token in sentence if not token.is_punct and not token.is_stop]


# Transform the whole column in one pass instead of writing list values cell by
# cell through .loc while iterating over the same column: it does not depend on
# the index being 0..n-1 and avoids list-valued scalar assignment.
# NOTE: this replaces the parsed sentences with plain lists of strings, so the
# cell cannot be re-run without first rebuilding `sentences`.
sentences["text"] = sentences["text"].map(_content_lemmas)

### 1) Train your own word2vec representations as we did in the first example in the checkpoint, but this time experiment with the hyperparameters of the vectorization step. Modify the hyperparameters and run the classification models again. Can you wrangle any improvements?

In [16]:
# train word2vec on the sentences
model = gensim.models.Word2Vec(
    sentences["text"],   # one list of lemmas per sentence
    workers=8,           # training threads
    min_count=1,         # keep every lemma, even those seen only once
    window=5,            # context words considered on each side
    sg=0,                # 0 = CBOW, 1 = skip-gram
    sample=1e-2,         # down-sampling threshold for frequent words
    size=75,             # dimensionality of the word vectors
    hs=1,                # hierarchical softmax on; documented values are 0 or 1
                         # (the previous value 2 was only treated as "truthy")
    seed=123,            # fix the initial vectors; results can still vary from
                         # run to run unless workers=1, because of thread scheduling
)

In [17]:
# Sanity-check the learned vectors with a few similarity queries.
# The query methods live on the KeyedVectors object (model.wv); calling them on
# the model itself is deprecated and no longer works in newer gensim releases.
word_vectors = model.wv
print(word_vectors.most_similar(positive=['lady', 'man'], negative=['woman'], topn=5))
print(word_vectors.doesnt_match("dad dinner mom aunt uncle".split()))
print(word_vectors.similarity('woman', 'man'))
print(word_vectors.similarity('horse', 'cat'))

[('near', 0.9816954135894775), ('weather', 0.9809759855270386), ('spread', 0.977932333946228), ('declare', 0.9776087999343872), ('health', 0.9772489070892334)]
dinner
0.98542815
0.7942478


In [18]:
# Represent each sentence by the mean of its word vectors.
# Take the width from the trained model rather than repeating the literal used
# at training time, so changing the vector size only has to happen in one place.
n_dims = model.vector_size

# Start from NaN so that sentences with no usable lemma (e.g. they consisted
# only of stop words and punctuation) are explicitly marked and dropped below,
# instead of relying on np.mean([]) silently returning NaN.
word2vec_arr = np.full((sentences.shape[0], n_dims), np.nan)

for i, sentence in enumerate(sentences["text"]):
    # model.wv[...] replaces the deprecated model[...] lookup; the membership
    # test keeps this working if min_count is raised and some lemmas are pruned.
    vectors = [model.wv[lemma] for lemma in sentence if lemma in model.wv]
    if vectors:
        word2vec_arr[i, :] = np.mean(vectors, axis=0)

# Align the feature rows with the sentence rows explicitly via the shared index.
word2vec_arr = pd.DataFrame(word2vec_arr, index=sentences.index)
sentences = pd.concat([sentences[["author", "text"]], word2vec_arr], axis=1).dropna()

sentences.head()

Unnamed: 0,author,text,0,1,2,3,4,5,6,7,...,65,66,67,68,69,70,71,72,73,74
0,Carroll,"[Alice, begin, tired, sit, sister, bank, have,...",0.550017,0.148974,-0.279875,0.187857,0.433405,-0.171449,-0.061303,0.290085,...,0.434081,0.487064,-0.499568,0.192414,0.054144,-0.235941,-0.274399,0.102695,0.273472,0.000306
1,Carroll,"[consider, mind, hot, day, feel, sleepy, stupi...",0.376171,0.068341,-0.218908,0.205227,0.354012,-0.088969,-0.018588,0.208249,...,0.291432,0.365212,-0.395642,0.137223,0.028208,-0.265569,-0.243105,0.028983,0.208997,0.008521
2,Carroll,"[remarkable, Alice, think, way, hear, Rabbit]",0.790599,0.343724,-0.473375,0.167352,0.609409,-0.322803,-0.098904,0.462249,...,0.682774,0.632418,-0.702982,0.252615,0.029027,-0.188953,-0.362009,0.159754,0.327205,0.016503
3,Carroll,"[oh, dear]",0.548018,0.171755,-0.305487,0.223364,0.494166,-0.073598,-0.064427,0.305403,...,0.491512,0.531609,-0.47029,0.20131,0.03628,-0.327955,-0.173333,-0.025737,0.214861,0.006197
4,Carroll,"[oh, dear]",0.548018,0.171755,-0.305487,0.223364,0.494166,-0.073598,-0.064427,0.305403,...,0.491512,0.531609,-0.47029,0.20131,0.03628,-0.327955,-0.173333,-0.025737,0.214861,0.006197


In [21]:
# Target: the author label. Features: the averaged word2vec columns.
Y = sentences['author']
# Name the axis explicitly: the positional form drop(labels, 1) is deprecated
# and rejected by newer pandas releases.
X = np.array(sentences.drop(columns=['text', 'author']))

# We split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models. The tree ensembles are randomised, so they are seeded to keep the
# scores comparable between runs when the word2vec hyperparameters change.
lr = LogisticRegression()
rfc = RandomForestClassifier(random_state=123)
gbc = GradientBoostingClassifier(random_state=123)

classifiers = [
    ("Logistic Regression", lr),
    ("Random Forest", rfc),
    ("Gradient Boosting", gbc),
]

for _, clf in classifiers:
    clf.fit(X_train, y_train)

# Report mean accuracy on the training and held-out sets for each model.
for title, clf in classifiers:
    print("----------------------" + title + " Scores----------------------")
    print('Training set score:', clf.score(X_train, y_train))
    print('\nTest set score:', clf.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.836489439184268

Test set score: 0.8115783724740578
----------------------Random Forest Scores----------------------
Training set score: 0.9956300072833212

Test set score: 0.8241398143091206
----------------------Gradient Boosting Scores----------------------
Training set score: 0.9122359796067007

Test set score: 0.823593664664118
