In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import Counter
stop_words = stopwords.words('english')
lem = WordNetLemmatizer()
import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [2]:
f = open('data.txt', 'r',encoding='ISO-8859-1')
text = f.read()
text[:100]

'The Project Gutenberg EBook of Man to Man, by Jackson Gregory\n\nThis eBook is for the use of anyone a'

In [3]:
a = text.split()
len(a)

78078

In [4]:
b = text.split("\n")
len(b)

9770

In [5]:
sen_tokens = sent_tokenize(text)
len(sen_tokens)

5560

In [6]:
df = pd.DataFrame({'sentences':sen_tokens})
df.head()

Unnamed: 0,sentences
0,"The Project Gutenberg EBook of Man to Man, by ..."
1,"You may copy it, give it away or\nre-use it un..."
2,MAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nA...
3,ILLUSTRATED BY\n\nJ. G. SHEPHERD\n\n\n\n\n\nGR...
4,MISS BLUE CLOAK KNOWS WHEN SHE'S BEAT\n III.


In [7]:
def cleaning(data):
    
    stop_words = stopwords.words('english')
    lem = WordNetLemmatizer()
    
    #1. Tokenize
    #text_tokens = word_tokenize(data.lower())
    text_tokens = word_tokenize(data) # removed lower for tagging

    
    #2.Remove puncs
    text_tokens = [t for t in text_tokens if t.isalpha()]
    
    #3. stop words
    text_tokens = [t for t in text_tokens if not t in stop_words]
    
    #4. Lemma
    text_tokens = [lem.lemmatize(t) for t in text_tokens]
    
    #5 Join
    return " ".join(text_tokens)

In [8]:
df["sentences_2"] = df['sentences'].apply(cleaning)
df.head()

Unnamed: 0,sentences,sentences_2
0,"The Project Gutenberg EBook of Man to Man, by ...",The Project Gutenberg EBook Man Man Jackson Gr...
1,"You may copy it, give it away or\nre-use it un...",You may copy give away term Project Gutenberg ...
2,MAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nA...,MAN TO MAN BY JACKSON GREGORY AUTHOR OF JUDITH...
3,ILLUSTRATED BY\n\nJ. G. SHEPHERD\n\n\n\n\n\nGR...,ILLUSTRATED BY SHEPHERD GROSSET DUNLAP PUBLISH...
4,MISS BLUE CLOAK KNOWS WHEN SHE'S BEAT\n III.,MISS BLUE CLOAK KNOWS WHEN SHE BEAT III


In [9]:
print(df.sentences[0])


The Project Gutenberg EBook of Man to Man, by Jackson Gregory

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.


In [10]:
print(df.sentences_2[0])


The Project Gutenberg EBook Man Man Jackson Gregory This eBook use anyone anywhere cost almost restriction whatsoever


# PoST

In [11]:
df['sentences_3'] = df['sentences_2'].apply(lambda x : x.split())
df.head()

Unnamed: 0,sentences,sentences_2,sentences_3
0,"The Project Gutenberg EBook of Man to Man, by ...",The Project Gutenberg EBook Man Man Jackson Gr...,"[The, Project, Gutenberg, EBook, Man, Man, Jac..."
1,"You may copy it, give it away or\nre-use it un...",You may copy give away term Project Gutenberg ...,"[You, may, copy, give, away, term, Project, Gu..."
2,MAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nA...,MAN TO MAN BY JACKSON GREGORY AUTHOR OF JUDITH...,"[MAN, TO, MAN, BY, JACKSON, GREGORY, AUTHOR, O..."
3,ILLUSTRATED BY\n\nJ. G. SHEPHERD\n\n\n\n\n\nGR...,ILLUSTRATED BY SHEPHERD GROSSET DUNLAP PUBLISH...,"[ILLUSTRATED, BY, SHEPHERD, GROSSET, DUNLAP, P..."
4,MISS BLUE CLOAK KNOWS WHEN SHE'S BEAT\n III.,MISS BLUE CLOAK KNOWS WHEN SHE BEAT III,"[MISS, BLUE, CLOAK, KNOWS, WHEN, SHE, BEAT, III]"


In [12]:
df['sentences_4'] = df['sentences_3'].apply(lambda x : nltk.pos_tag(x))
df.head()

Unnamed: 0,sentences,sentences_2,sentences_3,sentences_4
0,"The Project Gutenberg EBook of Man to Man, by ...",The Project Gutenberg EBook Man Man Jackson Gr...,"[The, Project, Gutenberg, EBook, Man, Man, Jac...","[(The, DT), (Project, NNP), (Gutenberg, NNP), ..."
1,"You may copy it, give it away or\nre-use it un...",You may copy give away term Project Gutenberg ...,"[You, may, copy, give, away, term, Project, Gu...","[(You, PRP), (may, MD), (copy, VB), (give, VB)..."
2,MAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nA...,MAN TO MAN BY JACKSON GREGORY AUTHOR OF JUDITH...,"[MAN, TO, MAN, BY, JACKSON, GREGORY, AUTHOR, O...","[(MAN, NN), (TO, NNP), (MAN, NNP), (BY, NNP), ..."
3,ILLUSTRATED BY\n\nJ. G. SHEPHERD\n\n\n\n\n\nGR...,ILLUSTRATED BY SHEPHERD GROSSET DUNLAP PUBLISH...,"[ILLUSTRATED, BY, SHEPHERD, GROSSET, DUNLAP, P...","[(ILLUSTRATED, VBN), (BY, NNP), (SHEPHERD, NNP..."
4,MISS BLUE CLOAK KNOWS WHEN SHE'S BEAT\n III.,MISS BLUE CLOAK KNOWS WHEN SHE BEAT III,"[MISS, BLUE, CLOAK, KNOWS, WHEN, SHE, BEAT, III]","[(MISS, NNP), (BLUE, NNP), (CLOAK, NNP), (KNOW..."


# Vectorization

**Count Vectorization**

In [13]:
#Apply to series not to df itself.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [16]:
X_train = df["sentences_2"]
vectorizer = CountVectorizer().fit(X_train)

In [19]:
vectorizer.get_feature_names()[:5]

['aback', 'abandoned', 'abide', 'abiding', 'ability']

In [21]:
len(vectorizer.get_feature_names()) # corpus unique words, number of features, colons.

6130

In [24]:
X_train_count = vectorizer.transform(X_train)
a = pd.DataFrame( X_train_count.toarray() )
a.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6120,6121,6122,6123,6124,6125,6126,6127,6128,6129
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
X_train_count2 = vectorizer.transform(X_train)
b = pd.DataFrame( X_train_count2.toarray() ,columns = vectorizer.get_feature_names())
b.head()

Unnamed: 0,aback,abandoned,abide,abiding,ability,able,aboard,about,abreast,abrupt,...,york,you,young,youngest,your,yours,yourse,youth,youthful,zest
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# tf-idf

In [30]:
X_train = df["sentences_2"]
tf_idf_vectorizer = TfidfVectorizer().fit(X_train)
X_train_tf_idf = tf_idf_vectorizer.transform(X_train)

In [31]:
c = pd.DataFrame(X_train_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names())
c.head()

Unnamed: 0,aback,abandoned,abide,abiding,ability,able,aboard,about,abreast,abrupt,...,york,you,young,youngest,your,yours,yourse,youth,youthful,zest
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.081525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.214756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
c.max()

aback        0.436327
abandoned    0.367419
abide        0.320646
abiding      0.340290
ability      0.322452
               ...   
yours        0.926886
yourse       0.681416
youth        0.411763
youthful     0.339579
zest         0.383600
Length: 6130, dtype: float64

In [40]:
c.sum().sort_values(ascending = False).head(8)

blenham    121.693580
steve      119.792142
packard    114.215786
terry      102.879166
said        91.987103
he          90.786182
man         88.285371
and         81.219270
dtype: float64

In [41]:
b.sum().sort_values(ascending = False).head(8)

steve      542
packard    541
blenham    524
man        445
terry      409
he         360
and        310
said       307
dtype: int64