## Data Extracting

In [1]:
# importing necessary libraries
import requests
from lxml import html
from selenium import webdriver
import re
import time
from bs4 import BeautifulSoup

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the input spreadsheet from the working directory.
# The cells below use its URL_ID and URL columns.
urls_dataset = pd.read_excel('input.xlsx')

In [4]:
# Order the rows by URL_ID (ascending), modifying the frame in place.
# NOTE(review): the index is not reset here, so index labels no longer match
# row positions until reset_index is called further down.
urls_dataset.sort_values(by = 'URL_ID', ascending = True, inplace = True)

In [5]:
# Preview the first rows of the sorted frame.
urls_dataset.head()

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
5,2893.8,https://insights.blackcoffer.com/rise-of-chatb...


In [14]:
# creating a function to extract text from a url with given xpath
from lxml import html
def extract_text(url, xpath, timeout=30):
    """Download ``url`` and return the text of every element matching ``xpath``.

    Parameters
    ----------
    url : str
        Address of the page to download.
    xpath : str
        XPath expression selecting the elements whose text is wanted.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        ``text_content()`` of each matching element. The list is empty when
        nothing matches, when the request fails, or when the response status
        is not 200 (a message is printed in the last two cases).
    """
    # Imported locally under an alias: another cell of this notebook rebinds the
    # global name `html` to a plain string, which would break `html.fromstring`.
    from lxml import html as lxml_html

    try:
        # Without a timeout a stalled server would block the notebook forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f'ehh: {exc}')
        return []

    if response.status_code != 200:
        print(f'ehh: {response.status_code}')
        return []

    # errors='replace' keeps one stray non-UTF-8 byte from aborting the extraction.
    page_source = response.content.decode('utf-8', errors='replace')
    tree = lxml_html.fromstring(page_source)
    return [element.text_content() for element in tree.xpath(xpath)]

In [7]:
# function for cleaning & saving the text in .txt files
def save_to_txt(filename, text):
    """Clean ``text`` and write it to ``filename`` encoded as UTF-8.

    Cleaning steps, in order:

    1. Every run of whitespace (newlines, tabs, non-breaking spaces, repeated
       spaces) is collapsed to a single space, so blank lines leave no gaps.
    2. Everything from the first occurrence of 'Blackcoffer Insights' to the
       end of the text is removed.
    3. The result is lower-cased and stripped of surrounding whitespace.

    Parameters
    ----------
    filename : str
        Path of the file to create or overwrite.
    text : str
        Raw text extracted from the page.
    """
    # The previous r'^\s*$' substitution ran after the newlines had already been
    # replaced and without re.MULTILINE, so it never removed a blank line.
    # In str patterns \s also matches '\xa0', so one substitution covers
    # newlines, non-breaking spaces and repeated blanks.
    text = re.sub(r'\s+', ' ', text)
    # Matched before lower-casing on purpose: the marker is case-sensitive.
    text = re.sub(r'Blackcoffer Insights.*', '', text)
    text = text.lower().strip()
    with open(filename, 'w', encoding = 'utf-8') as file:
        file.write(text)

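**It was observed that the XPath of the article body differs between URLs: each one contains a post-specific number, which appears in the `href` of a `<link rel="alternate">` tag on the page. That number therefore has to be extracted for each URL first.**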

In [8]:
# Fetch every page and keep the single HTML line that the next cell parses for
# a <link rel="alternate"> tag.
# NOTE(review): the fixed line index presumably reflects the layout of these
# particular pages - confirm it still holds if the site template changes.
LINK_LINE_INDEX = 65     # 0-based index of the line kept from each page
REQUEST_TIMEOUT = 30     # seconds; without it a stalled server blocks the cell forever

html_code = []           # one entry per successfully processed URL (failures are skipped)
failed_positions = []    # 0-based positions of the URLs that produced no entry
for idx, url in enumerate(urls_dataset.URL):
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        print(f"ehh in {idx+1}: {response.status_code}")
        failed_positions.append(idx)
        continue

    lines = response.text.split('\n')
    if len(lines) > LINK_LINE_INDEX:
        html_code.append(lines[LINK_LINE_INDEX])
    else:
        print("lines error")
        failed_positions.append(idx)

ehh in 25: 404
ehh in 38: 404


While extracting the href numbers, a 404 error was returned for rows 25 and 38, which means no page is available at those URLs. We will remove these two URLs later.

In [9]:
# Pull the post number out of each saved HTML line: it is the last path segment
# of the href on the line's <link rel="alternate"> tag.
numbers = []

# The loop variable is deliberately not called `html`: that name is the lxml
# module imported at the top of the notebook and used by extract_text, and
# rebinding it here breaks extraction on a fresh Restart & Run All.
for position, line_markup in enumerate(html_code):
    soup = BeautifulSoup(line_markup, 'html.parser')
    link_element = soup.find('link', rel='alternate')
    href = link_element.get('href') if link_element is not None else None
    if not href:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f'no <link rel="alternate"> href found in saved line {position}')
    numbers.append(href.split('/')[-1])

In [10]:
# Build one XPath per extracted post number; the number selects the element
# whose id is "post-<number>".
xpaths = []
for post_id in numbers:
    xpaths.append(f'//*[@id="post-{post_id}"]/div[2]/div/div[1]')

In [11]:
# removing the rows with incorrect urls
# The failed requests were reported as links 25 and 38, which are 1-based
# positions from enumerate, i.e. 0-based positions 24 and 37.
# Drop by position rather than by label: the frame was sorted without resetting
# its index, so labels 24 and 37 are not guaranteed to be those rows.
FAILED_POSITIONS = [24, 37]
urls_dataset.drop(urls_dataset.index[FAILED_POSITIONS], inplace = True)
urls_dataset.reset_index(drop=True, inplace=True)

In [12]:
# Keep the URL column and the URL_ID column (used for the output file names)
# under short names, then print both lengths as a quick check.
urls, file_names = urls_dataset.URL, urls_dataset.URL_ID
for column in (urls, file_names):
    print(len(column))

112
112


In [15]:
# extracting
# Folder that receives one "<URL_ID>.txt" file per article. The folder must
# already exist; the Windows-style location is unchanged from earlier runs.
OUTPUT_DIR = "C:\\Users\\rohit\\Python\\drive-download-20230909T190744Z-001\\extracted_articles"

# URLs, XPaths and file names are paired purely by position, so fail fast if the
# three sequences do not line up instead of pairing a page with the wrong XPath.
assert len(xpaths) == len(urls) == len(file_names), (
    f'misaligned inputs: {len(urls)} urls, {len(xpaths)} xpaths, {len(file_names)} file names'
)

for idx, link in enumerate(urls):
    extracted_text = ''.join(extract_text(link, xpaths[idx]))

    if extracted_text:
        # .iloc makes the positional lookup explicit instead of relying on index labels.
        output_file = f"{OUTPUT_DIR}\\{file_names.iloc[idx]}.txt"
        save_to_txt(output_file, extracted_text)
        print(f'extraction completed for link {idx + 1}. Text saved to {output_file}')
    else:
        print(f'No text was extracted for link {idx + 1}.')

extraction completed for link 1. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\123.0.txt
extraction completed for link 2. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\321.0.txt
extraction completed for link 3. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\432.0.txt
No text was extracted for link 4.
extraction completed for link 5. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\2893.8.txt
extraction completed for link 6. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\3355.6.txt
extraction completed for link 7. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\3817.4.txt
No text was extracted for link 8.
extraction completed for link 9. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\4

extraction completed for link 64. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\30601.8.txt
extraction completed for link 65. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\31063.6.txt
extraction completed for link 66. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\31525.4.txt
extraction completed for link 67. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\31987.2.txt
extraction completed for link 68. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\32449.0.txt
extraction completed for link 69. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\extracted_articles\32910.8.txt
No text was extracted for link 70.
No text was extracted for link 71.
extraction completed for link 72. Text saved to C:\Users\rohit\Python\drive-download-20230909T190744Z-001\ex