In [21]:
import nltk

In [22]:
from nltk.corpus import brown

## Steps
- Data collection
- Tokenization
- Stopword removal
- Stemming/Lemmatization
- Build a common vocabulary
- Vectorize the sentences
- Machine learning modeling

In [23]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [143]:
# Take the first five sentences of the Brown corpus 'adventure' category.
# Each sentence is a list of token strings (see the printed output below).
data = brown.sents(categories=['adventure'])[:5]

In [144]:
print(data)

[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ['He', 'certainly', "didn't", 'want', 'a', 'wife', 'who', 'was', 'fickle', 'as', 'Ann', '.'], ['If', 'he', 'had', 'married', 'her', ',', "he'd", 'have', 'been', 'asking', 'for', 'trouble', '.'], ['But', 'all', 'of', 'this', 'was', 'rationalization', '.']]


In [26]:
# Flatten the tokenised sentences back into a single string.
# Tokens inside a sentence AND consecutive sentences are separated by one
# space, so sentence boundaries no longer run together ("Turner . He" rather
# than "Turner .He"). Iterating over `data` itself avoids hard-coding the
# number of sentences.
document = ' '.join(' '.join(sentence) for sentence in data)
print(document)

Dan Morgan told himself he would forget Ann Turner .He was well rid of her .He certainly didn't want a wife who was fickle as Ann .If he had married her , he'd have been asking for trouble .But all of this was rationalization .


In [192]:
# Hand-written paragraph containing the same five sentences, with sentence-final
# periods followed by a space; it is the input to sent_tokenize in a later cell.
test = "Dan Morgan told himself he would forget Ann Turner. He was well rid of her. He certainly didn't want a wife who was fickle as Ann. If he had married her , he'd have been asking for trouble. But all of this was rationalization ."


In [193]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [194]:
# Split the paragraph in `test` into a list of sentence strings; this list is
# the corpus passed to the vectorizers further down.
corpus = sent_tokenize(test)
print(corpus)

['Dan Morgan told himself he would forget Ann Turner.', 'He was well rid of her.', "He certainly didn't want a wife who was fickle as Ann.", "If he had married her , he'd have been asking for trouble.", 'But all of this was rationalization .']


In [195]:
from nltk.tokenize import RegexpTokenizer

In [196]:
# Regex-based tokenisation demo: keep only runs of ASCII letters, so digits,
# commas and the '@' / '.' of the e-mail address act as separators
# (see the printed token list below).
text = "Email me all the10 class learning assignments of the class 1,3,4 at xyz@gmail.com"
reg_exp = "[a-zA-Z]+"
# NOTE(review): `all_word_except` is not used by any cell visible here.
# Because the literal is not a raw string, "\b" is a backspace character rather
# than a regex word boundary, and the surrounding [...] makes the pattern a
# single character class. Confirm the intent before relying on it.
all_word_except = "[\b (?:the) \b]"
tokenizer = RegexpTokenizer(reg_exp)
# `text` is rebound from the raw string to its list of tokens; the stopword
# removal cell below consumes that list.
text = tokenizer.tokenize(text)
print(text)

['Email', 'me', 'all', 'the', 'class', 'learning', 'assignments', 'of', 'the', 'class', 'at', 'xyz', 'gmail', 'com']


In [197]:
from nltk.corpus import stopwords

In [198]:
# NLTK's English stopword list, stored as a set for membership tests.
stop = set(stopwords.words('english'))

In [199]:
def remove_stopwords(sent,stopwords):
    """Return the tokens of ``sent`` that are not stopwords, preserving order.

    A token is dropped when either the token itself or its lowercase form is
    contained in ``stopwords``. The lowercase check matters because stopword
    lists such as NLTK's are all lowercase, so capitalised tokens like
    "The" or "He" would otherwise slip through.

    Args:
        sent: Iterable of token strings.
        stopwords: Container of stopwords supporting ``in`` (a set is the
            natural choice). Note that inside this function the parameter
            shadows any module-level name ``stopwords``.

    Returns:
        A new list with the surviving tokens in their original casing.
    """
    return [
        w for w in sent
        if w not in stopwords and w.lower() not in stopwords
    ]

In [200]:
# Filter the regex tokens produced above; `text` is rebound to the filtered list.
text = remove_stopwords(text,stop)

In [201]:
print(text)

['Email', 'class', 'learning', 'assignments', 'class', 'xyz', 'gmail', 'com']


## Stemming

In [202]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [203]:
# Stemmer instance used in the examples below. Note that this PorterStemmer is
# the class imported from nltk.stem.snowball in the import cell above.
ps = PorterStemmer()

In [204]:
ps.stem("running")

'run'

In [205]:
ps.stem("generous")

'gener'

## Lemmatization

In [206]:
from nltk.stem import WordNetLemmatizer

In [207]:
# NOTE(review): the lemmatizer is instantiated here but is not called in any
# cell visible in this notebook section.
wn = WordNetLemmatizer()

In [208]:
from sklearn.feature_extraction.text import CountVectorizer

In [209]:
print(corpus)

['Dan Morgan told himself he would forget Ann Turner.', 'He was well rid of her.', "He certainly didn't want a wife who was fickle as Ann.", "If he had married her , he'd have been asking for trouble.", 'But all of this was rationalization .']


In [210]:
# Bag-of-words vectorizer with default settings. As the vocabulary printed
# below shows, the defaults lowercase the text and drop one-character tokens
# and apostrophes ("didn't" -> "didn", "a" is absent).
cv = CountVectorizer()

In [213]:
# Learn the vocabulary from `corpus` and build the count matrix in one step:
# one row per sentence, one column per vocabulary term. .toarray() turns the
# result into a dense NumPy array so it can be displayed.
vectorized_words = cv.fit_transform(corpus).toarray()
vectorized_words

array([[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 2, 1, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]], dtype=int64)

In [212]:
cv.vocabulary_

{'dan': 7,
 'morgan': 19,
 'told': 24,
 'himself': 16,
 'he': 14,
 'would': 32,
 'forget': 11,
 'ann': 1,
 'turner': 26,
 'was': 28,
 'well': 29,
 'rid': 22,
 'of': 20,
 'her': 15,
 'certainly': 6,
 'didn': 8,
 'want': 27,
 'wife': 31,
 'who': 30,
 'fickle': 9,
 'as': 2,
 'if': 17,
 'had': 12,
 'married': 18,
 'have': 13,
 'been': 4,
 'asking': 3,
 'for': 10,
 'trouble': 25,
 'but': 5,
 'all': 0,
 'this': 23,
 'rationalization': 21}

## TF-IDF normalization

In [214]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [215]:
import numpy as np

In [216]:
# Same corpus as the count example, but weighted with TF-IDF so that terms
# occurring in many sentences (e.g. "he", "was") get smaller weights than
# terms unique to one sentence. With the default settings each row is
# L2-normalised (per the scikit-learn documentation).
tf = TfidfVectorizer()
vd = tf.fit_transform(corpus).toarray()
print(vd)

[[0.         0.28581119 0.         0.         0.         0.
  0.         0.3542556  0.         0.         0.         0.3542556
  0.         0.         0.19958143 0.         0.3542556  0.
  0.         0.3542556  0.         0.         0.         0.
  0.3542556  0.         0.3542556  0.         0.         0.
  0.         0.         0.3542556 ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.27933573 0.40002359 0.         0.
  0.         0.         0.40002359 0.         0.49581892 0.
  0.         0.         0.         0.         0.33205571 0.49581892
  0.         0.         0.        ]
 [0.         0.27809183 0.34468766 0.         0.         0.
  0.34468766 0.         0.34468766 0.34468766 0.         0.
  0.         0.         0.19419101 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.34468766 0.23084134 0.
  0.34468766 

## Assignment

In [217]:
from sklearn.datasets import fetch_20newsgroups

In [228]:
# Load the 20 Newsgroups dataset with scikit-learn's default arguments.
# Per the scikit-learn docs this is the training subset, and the data is
# downloaded and cached on first use -- TODO confirm for the installed version.
newsgroups = fetch_20newsgroups()

In [229]:
# Feature side: the dataset's `data` attribute wrapped in a NumPy array
# (presumably one raw post text per entry -- verify against the loader docs).
X = np.array(newsgroups.data)
# Newsgroup names; the 20 entries are listed in the next cell's output.
Y_names = newsgroups.target_names
# Label side: presumably integer class ids that index into Y_names -- TODO confirm.
Y = np.array(newsgroups.target)


In [230]:
Y_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

()