# WebScraping&NLTK.py
# Store the URL of the e-book's HTML page on Project Gutenberg
url = 'http://www.gutenberg.org/files/28885/28885-h/28885-h.htm'
# Import the `requests` package
import requests
# Make the request and check object type
r = requests.get(url)
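# Optional check: raise an exception if the request returned an HTTP error (4xx/5xx)
#r.raise_for_status()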
#print(type(r))
# Extract HTML from Response object and print
html = r.text
#print(html)
# Import BeautifulSoup from bs4
from bs4 import BeautifulSoup
# Create a BeautifulSoup object from the HTML
soup = BeautifulSoup(html, 'html5lib')
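# Note: 'html5lib' is a third-party parser (pip install html5lib);
# the standard library's 'html.parser' also works here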
# Sanity checks: inspect the soup's type and title, and preview the first few links
'''
print(type(soup))
print(soup.title.string)
print(soup.title)
print(soup.find_all('a')[:8])
'''
# Get the text out of the soup and print it
text = soup.get_text()
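# get_text() strips all markup, leaving only the visible text
# (this still includes the Project Gutenberg header/footer boilerplate)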
#print(text)
# Import the Natural Language Toolkit (NLTK) package
# and the RegexpTokenizer from nltk.tokenize
import nltk
from nltk.tokenize import RegexpTokenizer
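# The stopwords corpus used below must be downloaded once per environment:
#nltk.download('stopwords')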
# Create the tokenizer (a raw string keeps '\w' from being read as an escape sequence)
tokenizer = RegexpTokenizer(r'\w+')
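# r'\w+' matches runs of letters, digits, and underscores,
# so punctuation and whitespace never appear in the tokens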
#Create the tokens
tokens = tokenizer.tokenize(text)
#print(tokens[:14])
# Clean the data: lowercase every token so capitalized duplicates collapse into one entry
words = []
# Loop through the tokens and convert each one to lowercase
for word in tokens:
    words.append(word.lower())
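# Equivalent one-liner: words = [word.lower() for word in tokens]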
# Load the English stopwords so we can filter them out
sw = nltk.corpus.stopwords.words('english')
# Initialize a list and keep only the words that are not stopwords
words_ns = []
for word in words:
    if word not in sw:
        words_ns.append(word)
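# Tip: converting sw to a set (sw = set(sw)) makes each membership test O(1)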
# Just a final sanity check
#print(words_ns[:5])
# Plot a frequency distribution to visualize the most common words in the novel
import matplotlib.pyplot as plt
# Create a frequency distribution using NLTK's FreqDist
freqdist1 = nltk.FreqDist(words_ns)
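# plot(25) draws the 25 most frequent words; freqdist1.most_common(25)
# returns the same counts as (word, count) pairs for inspection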
freqdist1.plot(25)
plt.show()