# -*- coding: utf-8 -*-
"""
Created on Tue Dec 20 12:32:25 2022
WEB SCRAPING
@author: Soura
"""
import pandas as pd
import requests
from bs4 import BeautifulSoup
## Two ways to scrape a website:
# Use the site's API
# Scrape the HTML directly with a tool like bs4 (BeautifulSoup)
url = "https://insights.blackcoffer.com/future-of-work-how-ai-has-entered-the-workplace/"
## STEP 1:
# Get the HTML
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0"}
r = requests.get(url, headers=headers)  # GET request to fetch the page
html_content = r.content  # raw HTML of the page
#print(html_content)
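## Optional check (a small sketch): requests does not raise on HTTP errors by
## default, so raise_for_status() stops the script early on a 4xx/5xx response.
r.raise_for_status()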
## STEP 2:
# Parse the HTML
soup = BeautifulSoup(html_content, 'html.parser')
#print(soup)
## STEP 3:
# HTML tree traversal
## Commonly used object types (see the live check below):
# Tag              >> type(soup.title)
# NavigableString  >> type(soup.title.string)
# BeautifulSoup    >> type(soup)
# Comment
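## A minimal live check of those types (a sketch; uses the 'soup' parsed above):
title = soup.title
print(type(title))          # <class 'bs4.element.Tag'>
print(type(title.string))   # <class 'bs4.element.NavigableString'>
print(type(soup))           # <class 'bs4.BeautifulSoup'>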
#title = soup.title                        ## Title tag of the page
#paras = soup.find_all('p')                ## All <p> tags on the page
#print(paras)
#print(soup.find('p'))                     # First <p> tag
#print(soup.find('p')['class'])            # Classes of the first <p> tag
#print(soup.find_all("p", class_='lead'))  # All <p> tags with class 'lead'
## Get text from a tag or from the whole soup
#print(soup.find('p').get_text())
#print(soup.get_text())
#textfile = soup.get_text()
txt = soup.find(attrs={"class": "td-post-content"}).text
print(txt)
with open("title.txt", 'w', encoding="utf-8") as file:
    file.write(txt)
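## A more defensive version of the lookup above (a sketch): find() returns None
## when nothing matches, so guard before touching .text to avoid an AttributeError.
# content = soup.find(attrs={"class": "td-post-content"})
# if content is not None:
#     txt = content.text
# else:
#     print("No element with class 'td-post-content' found")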
'''
## Get all anchor tags from the page
anchors = soup.find_all('a')
#print(anchors)
all_links = set()
for link in anchors:
    if link.get('href') != "#":
        linktext = url + link.get('href')
        all_links.add(linktext)
        print(linktext)
print(all_links)
'''
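## A working variant of the commented block above (a sketch): urljoin resolves
## relative hrefs against the page URL instead of naively concatenating strings.
from urllib.parse import urljoin
all_links = set()
for link in soup.find_all('a'):
    href = link.get('href')
    if href and href != "#":
        all_links.add(urljoin(url, href))
print(all_links)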
# Import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
# Run Chrome headless (no visible browser window)
options = Options()
options.add_argument('--headless')
options.add_argument('--window-size=1920,1080')
# Multiple URLs: read them from the input CSV
input_file = pd.read_csv("C:\\Users\\Soura\\Downloads\\Input.csv")
urls = list(input_file["URL"])
print(urls)
time.sleep(2)
# Pass each URL to the Chrome driver in a 'for' loop
import os
os.makedirs('data', exist_ok=True)  # make sure the output directory exists
path = r"C:\Users\Soura\Downloads\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(service=Service(path), options=options)  # one browser reused for all URLs
for i in range(len(urls)):
    driver.get(urls[i])
    #driver.maximize_window()
    # Get 'article_title' and 'article_paragraph' using XPath
    article_title = driver.find_element(By.XPATH, '//h1').text
    article_paragraph = driver.find_element(By.XPATH, "//div[contains(@class,'td-post-content')]").text
    with open(f'data/{i}.txt', 'w', encoding='utf-8') as out_file:
        out_file.write(article_title + "\n")
        out_file.write(article_paragraph)
# Quit driver
driver.quit()
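## Optional sketch: if an article's body loads dynamically, an explicit wait is
## more reliable than reading the DOM right after get(). Something like:
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# WebDriverWait(driver, 10).until(
#     EC.presence_of_element_located((By.XPATH, "//div[contains(@class,'td-post-content')]")))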
import nltk
# nltk.download() with no argument opens the interactive downloader window;
# fetching just the 'punkt' tokenizer models is enough for tokenization.
nltk.download('punkt')
with open("text1.txt", encoding="utf-8") as f:
    f1 = f.read()
#text = f1.replace('\n', '')
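## A short continuation sketch: tokenize the text into words with NLTK, the
## usual first step before analysis (uses the 'punkt' data downloaded above;
## newer NLTK releases may also need nltk.download('punkt_tab')).
from nltk.tokenize import word_tokenize
tokens = word_tokenize(f1)
print(tokens[:20])  # first 20 word tokens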