# Webscraping using Selenium and Beautiful Soup
## Newspaper: [Samakal](https://en.samakal.com/)



#### Task Summary:
1. Collected all article links from [here](https://en.samakal.com/search?q=road+accident) using [Instant Data Scraper](https://en.samakal.com/search?q=road+accident) and saved them as a .csv file.
2. Scraped the information from each link using Selenium and BeautifulSoup, since the website renders its content with JavaScript.
3. Stored the data in a .csv file.

In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd

In [None]:
def getPage(url, chrome_driver_path=r"C:\Development\chromedriver.exe", wait_seconds=3):
    """Load *url* in headless Chrome and return the rendered page as soup.

    Selenium is used instead of a plain HTTP request because the target
    pages build their content with JavaScript.

    Args:
        url: Address of the page to load.
        chrome_driver_path: Location of the chromedriver executable. The
            default is a raw string so the backslashes are taken literally.
        wait_seconds: Fixed pause after navigation, giving the page's
            scripts time to run before the HTML is captured.

    Returns:
        A BeautifulSoup object built from the rendered page source using
        the 'html.parser' backend.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # a Service object -- confirm against the installed Selenium version.
    driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
    try:
        driver.get(url)
        time.sleep(wait_seconds)
        page_source = driver.page_source
    finally:
        # quit() ends the whole browser session (close() only closes the
        # current window), and `finally` guarantees it runs even when the
        # page load fails, so no browser is left running per failed link.
        driver.quit()
    return BeautifulSoup(page_source, 'html.parser')

In [None]:
# Load the previously collected article links; later cells read the `links`
# column of this frame and add the scraped fields to it.
df=pd.read_csv('links.csv')

In [None]:
links = df.links


def _text_or_empty(soup, selector):
    """Return the text of the first element matching *selector*, or "" if none."""
    node = soup.select_one(selector)
    return node.get_text() if node is not None else ""


headings = []
published_time = []
full_text = []
for i, link in enumerate(links, start=1):
    soup = getPage(link)

    # select_one() returns None when a page lacks the element; fall back to
    # an empty string so one odd page does not abort the whole scraping run.
    heading = _text_or_empty(soup, 'h1.detail-headline')
    date = _text_or_empty(soup, 'span.detail-time')
    text = _text_or_empty(soup, 'div.description')

    headings.append(heading)
    published_time.append(date)
    full_text.append(text)

    # Progress indicator: number of links processed so far.
    print(i)
    if not (heading and date and text):
        print(f"{i}: missing or empty field(s) for {link}")

# One entry per link was appended above, so the lists line up with df's rows.
# NOTE: the 'description_text' column holds the article headline; the column
# name is kept unchanged so the output CSV schema stays the same.
df['description_text'] = headings
df['published_time'] = published_time
df['full_text'] = full_text

In [None]:
# Preview the first rows to check the newly added scraped columns.
df.head()

In [None]:
def clean_date(row):
    """Return a copy of *row* with 'published_time' reduced to its date part.

    The value is split on single spaces and only the 3rd to 5th tokens
    (indices 2, 3 and 4) are kept, re-joined with single spaces. Fewer than
    three tokens therefore yields an empty string.

    The input row is not modified: functions passed to ``DataFrame.apply``
    should not mutate the object they receive, so the change is made on a
    copy. Non-string values (for example a missing value) are left as-is
    instead of raising.

    Args:
        row: Mapping-like record (e.g. a pandas Series) with a
            'published_time' entry and a ``copy()`` method.

    Returns:
        The copied row with the cleaned 'published_time'.
    """
    cleaned = row.copy()
    raw = cleaned['published_time']
    if isinstance(raw, str):
        cleaned['published_time'] = " ".join(raw.split(" ")[2:5])
    return cleaned

In [None]:
# Work on a copy so the raw scraped frame `df` stays untouched, then
# normalise the publication date of every row (axis=1 -> row-wise apply).
df_2 = df.copy().apply(clean_date, axis=1)

In [None]:
# Preview the first rows to check the cleaned 'published_time' values.
df_2.head()

In [None]:
def clean_description(row):
    """Return a copy of *row* with 'full_text' flattened into one paragraph.

    Leading and trailing newline and byte-order-mark (U+FEFF) characters are
    stripped, then every blank-line separator ("\\n\\n") is replaced by a
    single space. Single newlines inside the text are kept.

    The input row is not modified: functions passed to ``DataFrame.apply``
    should not mutate the object they receive, so the change is made on a
    copy. Non-string values (for example a missing value) are left as-is
    instead of raising.

    Args:
        row: Mapping-like record (e.g. a pandas Series) with a 'full_text'
            entry and a ``copy()`` method.

    Returns:
        The copied row with the cleaned 'full_text'.
    """
    cleaned = row.copy()
    raw = cleaned['full_text']
    if isinstance(raw, str):
        trimmed = raw.strip("\n\ufeff")
        cleaned['full_text'] = " ".join(trimmed.split("\n\n"))
    return cleaned

In [None]:
# Clean the article body of every row (axis=1 -> row-wise apply).
df_2=df_2.apply(clean_description,axis=1)


In [None]:
# Preview the first rows to check the cleaned 'full_text' values.
df_2.head()

In [None]:
# Store the cleaned data; index=False keeps the row index out of the file.
df_2.to_csv('samakal.csv', index = False)