In [0]:
import urllib.request
from nltk.tokenize import word_tokenize, sent_tokenize, wordpunct_tokenize
from bs4 import BeautifulSoup

# Reading from URL
wikiurl = "https://en.wikipedia.org/wiki/Google"
openurl = urllib.request.urlopen(wikiurl)
soup = BeautifulSoup(openurl.read(), "lxml")
# print(soup)

# get text
text = soup.body.get_text()

# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# drop blank lines
text = ' '.join(chunk for chunk in chunks if chunk)


# Saving to a Text File
with open('Input', 'w') as text_file:
    text_file.write(str(text.encode("utf-8")))

# Reading from a Text File
with open('Input', 'r') as text_file:
    read_data = text_file.read()

In [0]:
import nltk
nltk.download('punkt')

with open('Input', 'r') as text_file:
    read_data = text_file.read()
#Tokenization is the process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens.

stokens=nltk.sent_tokenize(read_data)
print("sentence tokenization", stokens)

wtoken=nltk.word_tokenize(read_data)
print("word tokenization", wtoken)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
sentence tokenization ["b'Google From Wikipedia, the free encyclopedia Jump to navigation Jump to search This article is about the company.", 'For the search engine, see Google Search.', 'For other uses, see Google (disambiguation).', 'Not to be confused with Googol or Goggles.', "American multinational Internet and technology corporation Google LLCGoogle\\'s logo since 2015[update]Google\\'s headquarters, the GoogleplexFormerlyGoogle Inc. (1998\\xe2\\x80\\x932017)TypeSubsidiaryIndustryInternetCloud computingComputer softwareComputer hardwareArtificial intelligenceAdvertisingFoundedSeptember\\xc2\\xa04, 1998; 21 years ago\\xc2\\xa0(1998-09-04)[a] in Menlo Park, California, U.S.FoundersLarry PageSergey BrinHeadquarters1600 Amphitheatre Parkway, Mountain View, California, U.S.Area servedWorldwideKey peopleSundar Pichai (CEO)Ruth Porat (CFO)ProductsList of Google productsRevenue6

In [0]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

with open('Input', 'r') as text_file:
    read_data = text_file.read()
text=nltk.word_tokenize(read_data)

tagged=nltk.pos_tag(text)
 # POS-tagger, processes a sequence of words, and attaches a part of speech tag to each word.
print(tagged)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[("b'Google", 'NN'), ('From', 'IN'), ('Wikipedia', 'NNP'), (',', ','), ('the', 'DT'), ('free', 'JJ'), ('encyclopedia', 'NN'), ('Jump', 'NNP'), ('to', 'TO'), ('navigation', 'VB'), ('Jump', 'NNP'), ('to', 'TO'), ('search', 'VB'), ('This', 'DT'), ('article', 'NN'), ('is', 'VBZ'), ('about', 'IN'), ('the', 'DT'), ('company', 'NN'), ('.', '.'), ('For', 'IN'), ('the', 'DT'), ('search', 'NN'), ('engine', 'NN'), (',', ','), ('see', 'VBP'), ('Google', 'NNP'), ('Search', 'NNP'), ('.', '.'), ('For', 'IN'), ('other', 'JJ'), ('uses', 'NNS'), (',', ','), ('see', 'VBP'), ('Google', 'NNP'), ('(', '('), ('disambiguation', 'NN'), (')', ')'), ('.', '.'), ('Not', 'RB'), ('to', 'TO'), ('be', 'VB'), ('confused', 'VBN'), ('with', 'IN'), ('Goo

In [0]:
import nltk
from nltk.stem import PorterStemmer
pStemmer=PorterStemmer()
print(pStemmer.stem(read_data))

#Stemming  is the process  for reducing  injected words to their stem,  base root form


b'google from wikipedia, the free encyclopedia jump to navigation jump to search this article is about the company. for the search engine, see google search. for other uses, see google (disambiguation). not to be confused with googol or goggles. american multinational internet and technology corporation google llcgoogle\'s logo since 2015[update]google\'s headquarters, the googleplexformerlygoogle inc. (1998\xe2\x80\x932017)typesubsidiaryindustryinternetcloud computingcomputer softwarecomputer hardwareartificial intelligenceadvertisingfoundedseptember\xc2\xa04, 1998; 21 years ago\xc2\xa0(1998-09-04)[a] in menlo park, california, u.s.founderslarry pagesergey brinheadquarters1600 amphitheatre parkway, mountain view, california, u.s.area servedworldwidekey peoplesundar pichai (ceo)ruth porat (cfo)productslist of google productsrevenue66,001,000,000 us dollar[5] (2014)\xc2\xa0operating income16,496,000,000 us dollar[5] (2014)\xc2\xa0net income14,444,000,000 united states dollar[5] (2014)\x

In [0]:
import nltk
from nltk.stem import LancasterStemmer
lStemmer=LancasterStemmer()
print(lStemmer.stem(read_data))

b'google from wikipedia, the free encyclopedia jump to navigation jump to search this article is about the company. for the search engine, see google search. for other uses, see google (disambiguation). not to be confused with googol or goggles. american multinational internet and technology corporation google llcgoogle\'s logo since 2015[update]google\'s headquarters, the googleplexformerlygoogle inc. (1998\xe2\x80\x932017)typesubsidiaryindustryinternetcloud computingcomputer softwarecomputer hardwareartificial intelligenceadvertisingfoundedseptember\xc2\xa04, 1998; 21 years ago\xc2\xa0(1998-09-04)[a] in menlo park, california, u.s.founderslarry pagesergey brinheadquarters1600 amphitheatre parkway, mountain view, california, u.s.area servedworldwidekey peoplesundar pichai (ceo)ruth porat (cfo)productslist of google productsrevenue66,001,000,000 us dollar[5] (2014)\xc2\xa0operating income16,496,000,000 us dollar[5] (2014)\xc2\xa0net income14,444,000,000 united states dollar[5] (2014)\x

In [0]:
import nltk
from nltk.stem import SnowballStemmer
sStemmer=SnowballStemmer('english')
print(sStemmer.stem(read_data))

b'google from wikipedia, the free encyclopedia jump to navigation jump to search this article is about the company. for the search engine, see google search. for other uses, see google (disambiguation). not to be confused with googol or goggles. american multinational internet and technology corporation google llcgoogle\'s logo since 2015[update]google\'s headquarters, the googleplexformerlygoogle inc. (1998\xe2\x80\x932017)typesubsidiaryindustryinternetcloud computingcomputer softwarecomputer hardwareartificial intelligenceadvertisingfoundedseptember\xc2\xa04, 1998; 21 years ago\xc2\xa0(1998-09-04)[a] in menlo park, california, u.s.founderslarry pagesergey brinheadquarters1600 amphitheatre parkway, mountain view, california, u.s.area servedworldwidekey peoplesundar pichai (ceo)ruth porat (cfo)productslist of google productsrevenue66,001,000,000 us dollar[5] (2014)\xc2\xa0operating income16,496,000,000 us dollar[5] (2014)\xc2\xa0net income14,444,000,000 united states dollar[5] (2014)\x

In [0]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize(read_data))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
b'Google From Wikipedia, the free encyclopedia Jump to navigation Jump to search This article is about the company. For the search engine, see Google Search. For other uses, see Google (disambiguation). Not to be confused with Googol or Goggles. American multinational Internet and technology corporation Google LLCGoogle\'s logo since 2015[update]Google\'s headquarters, the GoogleplexFormerlyGoogle Inc. (1998\xe2\x80\x932017)TypeSubsidiaryIndustryInternetCloud computingComputer softwareComputer hardwareArtificial intelligenceAdvertisingFoundedSeptember\xc2\xa04, 1998; 21 years ago\xc2\xa0(1998-09-04)[a] in Menlo Park, California, U.S.FoundersLarry PageSergey BrinHeadquarters1600 Amphitheatre Parkway, Mountain View, California, U.S.Area servedWorldwideKey peopleSundar Pichai (CEO)Ruth Porat (CFO)ProductsList of Google productsRevenue66,001,000,000 US dollar[5] (2014)\xc2\xa0Operatin

In [0]:
import nltk
from nltk import wordpunct_tokenize, pos_tag, ne_chunk
nltk.download('maxent_ne_chunker')
nltk.download('words')
with open('Input', 'r') as text_file:
    read_data = text_file.read()

print(ne_chunk(pos_tag(wordpunct_tokenize(read_data))))

x=ne_chunk(pos_tag(wordpunct_tokenize(read_data)))
 
print(type(x))
for t in x:
  print(t)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
('13', 'CD')
(',', ',')
('2014', 'CD')
(').', 'NN')
('"', 'NNP')
('Google', 'NNP')
('challenges', 'VBZ')
('record', 'VB')
('$', '$')
('5', 'CD')
('billion', 'CD')
(GPE EU/NNP)
('antitrust', 'JJ')
('fine', 'NN')
('".', 'NN')
('Reuters', 'NNP')
('.', '.')
('Archived', 'VBN')
('from', 'IN')
('the', 'DT')
('original', 'JJ')
('on', 'IN')
('December', 'NNP')
('22', 'CD')
(',', ',')
('2018', 'CD')
('.', '.')
('^', 'VB')
(PERSON Fox/NNP)
(',', ',')
(GPE Chris/NNP)
('(', '(')
('January', 'NNP')
('21', 'CD')
(',', ',')
('2019', 'CD')
(').', 'NN')
('"', 'NNP')
(PERSON Google/NNP)
('hit', 'VBD')
('with', 'IN')
('\\', 'NNP')
('xc2', 'NNP')
('\\', 'NNP')
('xa344m', 'NNP')
('GDPR', 'NNP')
('fine', 'JJ')
('".', 'NNP')
('BBC', 'NNP')
('.', '.')
('Archived', 'VBN')
('from', 'IN')
('the', 'DT')
('original', 'JJ')
('on', 'IN')
('January', 'NNP')
('21', 'CD')
(',', ',')
('2019', 'CD')
('.', '.')
('Retrieved', 'VBN')
('January', 'NNP')
('22', 

In [0]:
import nltk
from nltk.util import ngrams

trigrams=list(ngrams(wtoken,3))
print("trigrams output is:", trigrams)


In [2]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

# tfidf_Vect = TfidfVectorizer()
tfidf_Vect = TfidfVectorizer(ngram_range=(1,2), stop_words='english')
X_train_tfidf = tfidf_Vect.fit_transform(twenty_train.data)
# print(tfidf_Vect.vocabulary_)
# clf = MultinomialNB()
clf = KNN(n_neighbors=3)
clf.fit(X_train_tfidf, twenty_train.target)

twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
X_test_tfidf = tfidf_Vect.transform(twenty_test.data)
predicted = clf.predict(X_test_tfidf)

score = metrics.accuracy_score(twenty_test.target, predicted)
print(score)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


0.630509824747743
