#Cleaning

In [1]:
from collections import Counter
import nltk
import spacy
import re
import pandas as pd
import numpy as np

!python -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 4.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [2]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [3]:
# Import the data that you just downloaded
from nltk.corpus import gutenberg

In [4]:
# Grab and process the raw data
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [5]:
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

In [6]:
# Utility function for standard text cleaning
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation that spaCy doesn't
    # recognize: the double dash '--'. Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text


In [7]:
# The chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)

In [8]:
persuasion=text_cleaner(persuasion)
alice=text_cleaner(alice)

In [9]:
#Tokenizaton
nlp=spacy.load('en')
persuasion_doc=nlp(persuasion)
alice_doc=nlp(alice)

In [10]:
persuasion_sents=[[sent,'Austen']for sent in persuasion_doc.sents]
alice_sents=[[sent,'Carroll']for sent in alice_doc.sents]
alice_sents[:2]

[[Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?',
  'Carroll'],
 [So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.,
  'Carroll']]

In [11]:
sentences=pd.DataFrame(alice_sents+persuasion_sents,columns=['text','author'])
sentences.head(2)

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll


In [12]:
for i, sentens in enumerate(sentences['text']):
  sentences.loc[i,'text']=' '.join([token.lemma_ for token in sentens if not token.is_punct and not token.is_stop])
  
sentences.head(2)

Unnamed: 0,text,author
0,Alice begin tired sit sister bank have twice p...,Carroll
1,consider mind hot day feel sleepy stupid pleas...,Carroll


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer(analyzer='word')
X=vectorizer.fit_transform(sentences['text'])

In [14]:
bow_df=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
bow_df.head(1)



Unnamed: 0,1st,29th,abbreviation,abdication,abide,ability,able,abode,abominable,abominate,abroad,absence,absent,absolute,absolutely,abstraction,absurd,absurdity,abundance,abuse,abydos,accent,accept,acceptable,acceptance,accession,accident,accidental,accidentally,accommodate,accommodation,accompany,accomplish,accomplished,accomplishment,accord,accordingly,accost,account,accounting,...,wow,wrap,wrapt,wreck,wretched,wretchedly,wretchedness,wriggle,wrinkle,wrist,write,writhing,writing,wrong,wrought,yard,yarmouth,yawn,ye,year,yearly,yell,yelp,yeoman,yer,yes,yesterday,yestermorn,yield,yielding,you,young,younker,youth,youthful,zeal,zealand,zealous,zealously,zigzag
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
sentences=pd.concat([bow_df,sentences[['text','author']]],axis=1)
sentences.head(1)

Unnamed: 0,1st,29th,abbreviation,abdication,abide,ability,able,abode,abominable,abominate,abroad,absence,absent,absolute,absolutely,abstraction,absurd,absurdity,abundance,abuse,abydos,accent,accept,acceptable,acceptance,accession,accident,accidental,accidentally,accommodate,accommodation,accompany,accomplish,accomplished,accomplishment,accord,accordingly,accost,account,accounting,...,wrapt,wreck,wretched,wretchedly,wretchedness,wriggle,wrinkle,wrist,write,writhing,writing,wrong,wrought,yard,yarmouth,yawn,ye,year,yearly,yell,yelp,yeoman,yer,yes,yesterday,yestermorn,yield,yielding,you,young,younker,youth,youthful,zeal,zealand,zealous,zealously,zigzag,text,author
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Alice begin tired sit sister bank have twice p...,Carroll


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split,GridSearchCV

Y = sentences['author']
X = np.array(sentences.drop(['text','author'], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9261687571265679

Test set score: 0.8641025641025641
----------------------Random Forest Scores----------------------
Training set score: 0.9723489167616876

Test set score: 0.8294871794871795
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8275370581527937

Test set score: 0.8128205128205128


#Assignments
1. Your task is to increase the performance of the models that you implemented in the bank-of-words example. 

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split,GridSearchCV

Y = sentences['author']
X = np.array(sentences.drop(['text','author'], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

In [20]:
# We can tune hyperparameters to improve the out come.
lr_params = {"penalty": ["l1", "l2"]}
lr = LogisticRegression()

rfc_params = {"n_estimators": [3, 5, 10, 15],
              "max_depth": [2, 3, 4, 5],
              "min_samples_split": [3, 5, 7, 9]}
rfc = RandomForestClassifier()

gbc_params = {"n_estimators": [3, 5, 10, 15],
              "max_depth": [2, 3, 4, 5],
              "min_samples_split": [3, 5, 7, 9]}
gbc = GradientBoostingClassifier()

clf_lr = GridSearchCV(lr, lr_params, cv=5)
clf_lr.fit(X_train, y_train)

clf_rfc = GridSearchCV(rfc, rfc_params, cv=5)
clf_rfc.fit(X_train, y_train)

clf_gbc = GridSearchCV(gbc, gbc_params, cv=5)
clf_gbc.fit(X_train, y_train)


print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', clf_lr.score(X_train, y_train))
print('\nTest set score:', clf_lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', clf_rfc.score(X_train, y_train))
print('\nTest set score:', clf_rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', clf_gbc.score(X_train, y_train))
print('\nTest set score:', clf_gbc.score(X_test, y_test))

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



----------------------Logistic Regression Scores----------------------
Training set score: 0.9261687571265679

Test set score: 0.8641025641025641
----------------------Random Forest Scores----------------------
Training set score: 0.669897377423033

Test set score: 0.6735042735042736
----------------------Gradient Boosting Scores----------------------
Training set score: 0.7887685290763968

Test set score: 0.7893162393162393


The only improvement is that in Random Forest overfitting is solved.

2. In the 2-gram example above, you only used 2-gram as your features. This time, use both 1-gram and 2-gram features together as your feature set. Run the same models as in the example and compare the results.

In [21]:
# Use 2-grams
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,2))
X = vectorizer.fit_transform(sentences["text"])
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
sentences = pd.concat([bow_df, sentences[["text", "author"]]], axis=1)
sentences.head()



Unnamed: 0,1st,29th,29th september,abbreviation,abbreviation living,abdication,abdication neighbour,abide,abide consequence,abide figure,ability,ability affection,ability awkwardness,ability difficulty,able,able attempt,able avail,able avoid,able bear,able convince,able devise,able eat,able far,able feign,able join,able judge,able leave,able letter,able live,able marry,able persuade,able regard,able remain,able return,able ring,able rise,able set,able shew,able speak,able tell,...,young young,younker,youth,youth beauty,youth bloom,youth early,youth father,youth fine,youth hardly,youth hope,youth jaw,youth kill,youth learn,youth like,youth mention,youth possibly,youth restore,youth say,youth spring,youth value,youth vigour,youthful,youthful infatuation,zeal,zeal business,zeal common,zeal dwell,zeal sport,zeal think,zealand,zealand australia,zealous,zealous officer,zealous subject,zealously,zealously discharge,zigzag,zigzag go,text,author
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Alice begin tired sit sister bank have twice p...,Carroll
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,consider mind hot day feel sleepy stupid pleas...,Carroll
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,remarkable Alice think way hear Rabbit oh dear,Carroll
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,oh dear,Carroll
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,shall late,Carroll


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
Y = sentences['author']
X = np.array(sentences.drop(['text','author'], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

useing both 1-gram and 2-gram features together as feature set have better results comparing to only 2-gram. but not better than 1-gram.