# Frequency Filtering

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Fetch the 'train' subset of the 20 newsgroups dataset.
newsgroups = fetch_20newsgroups(subset='train')

In [3]:
# Inspect which fields the returned dataset object exposes.
newsgroups.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
# Print the raw text of the first post, including its header lines.
print(newsgroups.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [5]:
# Names of the newsgroup categories in the dataset.
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
# Baseline: vectorize with default settings (no document-frequency limits) so the
# filtered vocabularies in the following cells can be compared against it.
count_vect = CountVectorizer()
transformed_vector = count_vect.fit_transform(newsgroups.data)
# shape[1] is the number of columns of the document-term matrix.
print(f'Transformed vectors have {transformed_vector.shape[1]} words in its vocabulary')

Transformed vectors have 130107 words in its vocabulary


In [9]:
# Specify the minimum and maximum thresholds of document frequency.
# Float values for min_df and max_df are interpreted as a ratio of documents:
# here, terms that appear in more than 70% of the documents are dropped.
count_vec = CountVectorizer(min_df=0.0, max_df=0.7)
transformed_vector = count_vec.fit_transform(newsgroups.data)
print(f'Transformed vectors have {transformed_vector.shape[1]} words in its vocabulary when frequency limits are set')

Transformed vectors have 130094 words in its vocabulary when frequency limits are set


In [10]:
# Integer values for min_df and max_df are absolute document counts: keep terms
# that occur in at least 1 and at most 100 documents.
# min_df is 1 rather than 0: recent scikit-learn releases reject an integer
# min_df below 1 during parameter validation, and every vocabulary term occurs
# in at least one document anyway, so the resulting vocabulary is the same.
count_vec = CountVectorizer(min_df=1, max_df=100)
transformed_vector = count_vec.fit_transform(newsgroups.data)
print(f'Transformed vectors have {transformed_vector.shape[1]} words in its vocabulary when frequency limits are set')

Transformed vectors have 127701 words in its vocabulary when frequency limits are set


In [11]:
# Integer values for min_df and max_df are absolute document counts.
# Raising min_df to 2 additionally drops every term that occurs in only one document.
count_vec = CountVectorizer(min_df=2, max_df=100)
transformed_vector = count_vec.fit_transform(newsgroups.data)
print(f'Transformed vectors have {transformed_vector.shape[1]} words in its vocabulary when frequency limits are set')

Transformed vectors have 54030 words in its vocabulary when frequency limits are set


# Stemming

In [12]:
from nltk import word_tokenize, stem
import pandas as pd

In [13]:
# Groups of related word forms used to compare the stemmers and the lemmatizer.
# NOTE(review): 'motivational' appears twice in the last group (first and last
# entry) -- confirm whether a different form was intended.
input_tokens = [
    *('overwhelming', 'overwhelmingly'),
    *('hushed', 'hush'),
    *('functional', 'functions'),
    *('lying', 'lied'),
    'fairly',
    *('destabilize', 'stability'),
    *('friendship', 'friendships', 'friendly', 'friendless'),
    *('connections', 'connection', 'connect', 'connected', 'connectionless'),
    *('the', 'these', 'those'),
    *('motivational', 'motivate', 'motivated', 'motivation', 'motivational'),
]

In [14]:
# Porter stemmer instance; reused by later cells.
ps = stem.PorterStemmer()

In [15]:
# Apply the Porter stemmer to every input token, preserving order.
ps_stemmed_tokens = list(map(ps.stem, input_tokens))
ps_stemmed_tokens

['overwhelm',
 'overwhelmingli',
 'hush',
 'hush',
 'function',
 'function',
 'lie',
 'lie',
 'fairli',
 'destabil',
 'stabil',
 'friendship',
 'friendship',
 'friendli',
 'friendless',
 'connect',
 'connect',
 'connect',
 'connect',
 'connectionless',
 'the',
 'these',
 'those',
 'motiv',
 'motiv',
 'motiv',
 'motiv',
 'motiv']

In [16]:
# Lancaster stemmer instance, for comparison with the Porter stemmer.
ls = stem.LancasterStemmer()

In [17]:
# Apply the Lancaster stemmer to every input token, preserving order.
ls_stemmed_tokens = list(map(ls.stem, input_tokens))
ls_stemmed_tokens

['overwhelm',
 'overwhelm',
 'hush',
 'hush',
 'funct',
 'funct',
 'lying',
 'lied',
 'fair',
 'dest',
 'stabl',
 'friend',
 'friend',
 'friend',
 'friendless',
 'connect',
 'connect',
 'connect',
 'connect',
 'connectionless',
 'the',
 'thes',
 'thos',
 'mot',
 'mot',
 'mot',
 'mot',
 'mot']

In [18]:
# Side-by-side comparison table: one row per input token, one column per stemmer.
stems_df = pd.DataFrame({
    'words': input_tokens,
    'PorterStemmer': ps_stemmed_tokens,
    'LancasterStemmer': ls_stemmed_tokens,
})
stems_df

Unnamed: 0,words,PorterStemmer,LancasterStemmer
0,overwhelming,overwhelm,overwhelm
1,overwhelmingly,overwhelmingli,overwhelm
2,hushed,hush,hush
3,hush,hush,hush
4,functional,function,funct
5,functions,function,funct
6,lying,lie,lying
7,lied,lie,lied
8,fairly,fairli,fair
9,destabilize,destabil,dest


The Porter and Lancaster stemmers are designed for English. The Snowball stemmer also supports a number of non-English languages.

In [19]:
# Languages for which a Snowball stemmer is available.
stem.SnowballStemmer.languages

('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [20]:
# The Snowball stemmer is created for a specific language; use English here
# and stem every input token, preserving order.
ss = stem.SnowballStemmer('english')
ss_stemmed_tokens = list(map(ss.stem, input_tokens))
ss_stemmed_tokens

['overwhelm',
 'overwhelm',
 'hush',
 'hush',
 'function',
 'function',
 'lie',
 'lie',
 'fair',
 'destabil',
 'stabil',
 'friendship',
 'friendship',
 'friend',
 'friendless',
 'connect',
 'connect',
 'connect',
 'connect',
 'connectionless',
 'the',
 'these',
 'those',
 'motiv',
 'motiv',
 'motiv',
 'motiv',
 'motiv']

In [21]:
# Add the Snowball results as a new column (this mutates stems_df in place).
stems_df['SnowballStemmer'] = ss_stemmed_tokens
stems_df

Unnamed: 0,words,PorterStemmer,LancasterStemmer,SnowballStemmer
0,overwhelming,overwhelm,overwhelm,overwhelm
1,overwhelmingly,overwhelmingli,overwhelm,overwhelm
2,hushed,hush,hush,hush
3,hush,hush,hush,hush
4,functional,function,funct,function
5,functions,function,funct,function
6,lying,lie,lying,lie
7,lied,lie,lied,lie
8,fairly,fairli,fair,fair
9,destabilize,destabil,dest,destabil


# Lemmatization

Lemmatization reduces words to their dictionary base form (lemma). Unlike a stem, a lemma is a real word.

In [26]:
# A stem is not guaranteed to be a real word (here: 'definit').
ps.stem('definitions')

'definit'

In [24]:
import nltk
# Download the WordNet corpus before using the WordNet lemmatizer below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Safiuddin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
# WordNet-based lemmatizer; reused by later cells.
wnl = stem.WordNetLemmatizer()
# For 'definitions' the lemma is the real word 'definition'.
print(wnl.lemmatize('definitions'))

definition


In [28]:
# The lemma depends on the part of speech passed in: of the four readings,
# only the verb one reduces 'running' to 'run'.
print(f'Adjective: {wnl.lemmatize("running", pos="a")}')
print(f'Adverb: {wnl.lemmatize("running", pos="r")}')
print(f'Verb: {wnl.lemmatize("running", pos="v")}')
print(f'Noun: {wnl.lemmatize("running", pos="n")}')

Adjective: running
Adverb: running
Verb: run
Noun: running


In [32]:
# Every token is lemmatized as a verb (pos='v'), so tokens that are not verb
# forms (e.g. 'connections', 'friendships') come back unchanged.
wnl_lemmatized_tokens = [wnl.lemmatize(token, pos='v') for token in input_tokens]
wnl_lemmatized_tokens

['overwhelm',
 'overwhelmingly',
 'hush',
 'hush',
 'functional',
 'function',
 'lie',
 'lie',
 'fairly',
 'destabilize',
 'stability',
 'friendship',
 'friendships',
 'friendly',
 'friendless',
 'connections',
 'connection',
 'connect',
 'connect',
 'connectionless',
 'the',
 'these',
 'those',
 'motivational',
 'motivate',
 'motivate',
 'motivation',
 'motivational']

In [33]:
# Add the lemmatizer output as another column (mutates stems_df in place).
stems_df['WordNetLemmatizer'] = wnl_lemmatized_tokens

In [34]:
# Display the comparison table of stemmers and lemmatizer.
stems_df

Unnamed: 0,words,PorterStemmer,LancasterStemmer,SnowballStemmer,WordNetLemmatizer
0,overwhelming,overwhelm,overwhelm,overwhelm,overwhelm
1,overwhelmingly,overwhelmingli,overwhelm,overwhelm,overwhelmingly
2,hushed,hush,hush,hush,hush
3,hush,hush,hush,hush,hush
4,functional,function,funct,function,functional
5,functions,function,funct,function,function
6,lying,lie,lying,lie,lie
7,lied,lie,lied,lie,lie
8,fairly,fairli,fair,fair,fairly
9,destabilize,destabil,dest,destabil,destabilize


# Parts-of-Speech (POS) Tagging

In [35]:
# Download the 'tagsets' package (presumably the tag documentation shown by nltk.help below).
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Safiuddin\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [36]:
# Download the tagger model (presumably the one nltk.pos_tag uses below).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Safiuddin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [37]:
# Print every part-of-speech tag code with its description and example words.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [38]:
# Print the description of a single tag: 'NN' (noun, common, singular or mass).
nltk.help.upenn_tagset('NN')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [39]:
# Example sentence in which 'refuse' occurs twice; the tagger output in the
# next cell labels one occurrence as a verb and the other as a noun.
text = 'I refuse to let this refuse get me down'

In [40]:
# word_tokenize relies on the Punkt tokenizer models, which no visible cell
# downloads; fetch them here (quietly) so the cell also runs on a fresh environment.
nltk.download('punkt', quiet=True)
# Split the sentence into tokens, then label each token with its part-of-speech tag.
tokens = nltk.word_tokenize(text)
tagged_words = nltk.pos_tag(tokens)
tagged_words

[('I', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('let', 'VB'),
 ('this', 'DT'),
 ('refuse', 'NN'),
 ('get', 'VB'),
 ('me', 'PRP'),
 ('down', 'RP')]

In [41]:
from nltk.probability import FreqDist
# Frequency of each (word, tag) pair; not used again in the visible cells.
fdist = FreqDist(tagged_words)
# Frequency of each tag on its own, ignoring the word it was attached to.
fd_tagged = FreqDist(pair[1] for pair in tagged_words)
fd_tagged.most_common(10) # Get most common parts of speech

[('PRP', 2), ('VB', 2), ('VBP', 1), ('TO', 1), ('DT', 1), ('NN', 1), ('RP', 1)]

In [42]:
# Download the Brown corpus used in the following cells.
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Safiuddin\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [43]:
# Peek at the first ten tokens of the Brown corpus.
nltk.corpus.brown.words()[:10]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of']

In [44]:
# Wrap the lower-cased Brown corpus words in an nltk.Text.
# Note: this rebinds `text`, which previously held the example sentence string.
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())

In [45]:
# Confirm that `text` is now an nltk.Text object rather than a plain string.
type(text)

nltk.text.Text

In [46]:
# Prints words that occur in the same contexts as 'boy' in the corpus
# (distributional similarity; the words are not looked up by POS tag).
text.similar('boy')

man time day way girl year house people world city family state room
country car woman program church government job


In [47]:
# Same context-based lookup for 'run'.
text.similar('run')

get be do in see work go have take make put and find time look day say
use come show
