In [1]:
# Install third-party packages with pip, one shell call per package
# (beautifulsoup4, numpy, requests, spacy, trafilatura).
# NOTE(review): no versions are pinned here, and pandas is imported by the next
# cell without being installed — presumably it ships with the runtime; confirm.
!pip install beautifulsoup4
!pip install numpy
!pip install requests
!pip install spacy
!pip install trafilatura

Collecting trafilatura
[?25l  Downloading https://files.pythonhosted.org/packages/a0/eb/3725c95c55cf4f40c842f393e4310d569ca7efa7066f8ca7bcda757e27c7/trafilatura-0.8.2-py3-none-any.whl (165kB)
[K     |████████████████████████████████| 174kB 6.4MB/s 
[?25hCollecting urllib3<2,>=1.25
[?25l  Downloading https://files.pythonhosted.org/packages/0c/cd/1e2ec680ec7b09846dc6e605f5a7709dfb9d7128e51a026e7154e18a234e/urllib3-1.26.5-py2.py3-none-any.whl (138kB)
[K     |████████████████████████████████| 143kB 8.7MB/s 
[?25hCollecting lxml>=4.6.2; python_version > "3.4"
[?25l  Downloading https://files.pythonhosted.org/packages/30/c0/d0526314971fc661b083ab135747dc68446a3022686da8c16d25fcf6ef07/lxml-4.6.3-cp37-cp37m-manylinux2014_x86_64.whl (6.3MB)
[K     |████████████████████████████████| 6.3MB 7.2MB/s 
Collecting readability-lxml>=0.8.1
  Downloading https://files.pythonhosted.org/packages/39/a6/cfe22aaa19ac69b97d127043a76a5bbcb0ef24f3a0b22793c46608190caa/readability_lxml-0.8.1-py3-none-any.w

In [2]:
from bs4 import BeautifulSoup
import json
import numpy as np
import pandas as pd
import glob
import os
import requests
from requests.models import MissingSchema
import spacy
import trafilatura
import time



## **Extracting Text**

In [3]:
def beautifulsoup_extract_text_fallback(response_content):
    """Return the text nodes of an HTML document as one whitespace-joined string.

    The markup is parsed with BeautifulSoup's built-in 'html.parser'. Every
    text node is kept unless its direct parent tag is one of the skipped tag
    names below. Each kept node is followed by a single space, tab characters
    are removed, and the final string is stripped at both ends.

    Args:
        response_content: HTML markup (str or bytes) accepted by BeautifulSoup.

    Returns:
        str: the concatenated text of the page.
    """
    # Parent tag names whose text nodes are discarded.
    skipped_parents = (
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'input',
        'script',
        'style',
    )

    parsed_page = BeautifulSoup(response_content, 'html.parser')
    text_nodes = parsed_page.find_all(text=True)

    # Keep only nodes whose immediate parent is not in the skip list.
    kept_pieces = [
        '{} '.format(node)
        for node in text_nodes
        if node.parent.name not in skipped_parents
    ]

    return ''.join(kept_pieces).replace('\t', '').strip()
    

def extract_text_from_single_web_page(url, timeout=30):
    """Download a web page and return its main text.

    The page is first fetched and parsed with trafilatura. If that produces
    nothing (download failed, extraction returned nothing, or extraction
    raised AttributeError), the URL is requested again with `requests` and
    the text is pulled out with `beautifulsoup_extract_text_fallback`.

    Args:
        url: address of the page to crawl.
        timeout: seconds to wait for the fallback `requests.get` call before
            giving up. Without a timeout a stalled server blocks the caller
            indefinitely.

    Returns:
        str: the extracted text on success.
        np.nan: when the fallback request fails or does not answer with
            HTTP 200. Callers test the result with `text is np.nan`, so the
            np.nan object itself is returned.
    """
    downloaded_url = trafilatura.fetch_url(url)

    a = None
    # Only run the extractor when the download produced something.
    if downloaded_url:
        try:
            a = trafilatura.extract(
                downloaded_url,
                output_format='json',
                with_metadata=False,
                include_comments=False,
                include_images=False,
                include_tables=False,
                include_links=False,
                date_extraction_params={'extensive_search': True, 'original_date': True},
            )
        except AttributeError:
            # Retrying the identical call would only raise the same error again,
            # so treat it as "nothing extracted" and use the fallback below.
            a = None

    if a:
        json_output = json.loads(a)
        return json_output['text']

    try:
        resp = requests.get(url, timeout=timeout)
    # Covers malformed URLs (e.g. missing scheme), connection errors and timeouts.
    except requests.exceptions.RequestException as e:
        print(e)
        return np.nan

    # We will only extract the text from successful requests:
    if resp.status_code == 200:
        return beautifulsoup_extract_text_fallback(resp.content)

    # Both trafilatura and the BeautifulSoup fallback failed for this URL.
    print('None')
    return np.nan
        

## **CSV Column Dropping**

In [None]:
# Input folder that is scanned for *.csv files, and output folder for the trimmed copies.
real_path = r'/content/drive/MyDrive/Try CSV/Real'
edited_path = r'/content/drive/MyDrive/Try CSV/Edited/'
all_files = glob.glob(real_path + "/*.csv")

# Columns removed from every input file before it is written back out.
columnsToDelete = [
    "User Name",
    "Facebook Id",
    "Page Created",
    "Likes at Posting",
    "Followers at Posting",
    "Total Interactions",
    "Likes",
    "Comments",
    "Shares",
    "Love",
    "Wow",
    "Haha",
    "Sad",
    "Angry",
    "Care",
    "Is Video Owner?",
    "Post Views",
    "Total Views",
    "Total Views For All Crossposts",
    "Video Length",
    "Sponsor Id",
    "Sponsor Name",
    "Sponsor Category",
    "Total Interactions (weighted  —  Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x )",
    "Overperforming Score",
]

# File names without directory or extension, in the same order as all_files.
file_name = [os.path.splitext(os.path.basename(csv_path))[0] for csv_path in all_files]

# Drop the unwanted columns, add an empty 'Label' column and save each result
# as '<name>_edited.csv' in the output folder.
for csv_path, stem in zip(all_files, file_name):
  target_path = edited_path + stem + '_edited.csv'
  trimmed = pd.read_csv(csv_path, encoding='utf-8').drop(columns=columnsToDelete)
  trimmed['Label'] = np.nan
  trimmed.to_csv(target_path, index=False, encoding='utf-8')
  print(target_path + ' created')

## **Writing Description in CSV**

In [None]:
def replace_descriptions_with_page_text(df, extract_text, unreachable_sites=()):
  """Return a copy of `df` whose 'Description' holds the crawled page text.

  Row handling, based on the 'Type' and 'Link' columns:
    * 'Status' rows and rows whose link points to Facebook are kept untouched.
    * Rows of any type other than 'Link' are dropped.
    * 'Link' rows with a missing link, or a link containing one of
      `unreachable_sites`, are dropped.
    * Remaining 'Link' rows are crawled with `extract_text(link)`; the row is
      dropped when no text comes back, otherwise only that row's
      'Description' cell is overwritten.

  Args:
    df: frame with at least 'Type', 'Link' and 'Description' columns.
    extract_text: callable taking a URL and returning the page text as str;
      any non-string result (np.nan, None) means the page was unreachable.
    unreachable_sites: URL fragments that should be skipped without crawling.

  Returns:
    A new DataFrame; `df` itself is not modified.
  """
  result = df.copy()
  # A Description column that is entirely empty is not object-typed; make it
  # object so page text can be stored in it.
  result['Description'] = result['Description'].astype(object)
  rows_to_drop = []

  for i in result.index:
    print(i)
    post_type = result.loc[i, 'Type']
    link = result.loc[i, 'Link']
    # Empty CSV cells are read as NaN (a float); the substring tests below need a str.
    has_link = isinstance(link, str)

    # If Status or Link is of Facebook, don't do anything.
    if post_type == 'Status' or (has_link and 'https://www.facebook.com' in link):
      print('Status Type')
      print('Continuing')
      continue

    if post_type != 'Link':
      rows_to_drop.append(i)
      print('Other type')
      print('dropped')
      continue

    print('Link Type')
    URL = link
    print(URL)

    if not has_link or any(site in URL for site in unreachable_sites):
      rows_to_drop.append(i)
      print('dropped as unreachable')
      continue

    try:
      text = extract_text(URL)
    except KeyboardInterrupt:
      # Deliberate manual escape hatch: interrupting the kernel skips the URL
      # currently being crawled instead of aborting the whole run.
      print('Key pressed')
      rows_to_drop.append(i)
      print('dropped as key pressed')
      continue

    # If the url is unreachable, drop it.
    if not isinstance(text, str):
      rows_to_drop.append(i)
      print('dropped')
      continue

    # Overwrite only this row's Description. A frame-wide replace() on the old
    # value would also rewrite every other cell holding the same value — and
    # every NaN cell (e.g. the empty 'Label' column) when the old value is NaN.
    result.loc[i, 'Description'] = text
    print("Description Updated")

  return result.drop(index=rows_to_drop)


filePath = '/content/drive/MyDrive/Try CSV/Edited/'
editedPath = '/content/drive/MyDrive/Try CSV/Crawled/'

all_files = glob.glob(filePath + "/*.csv")

# File names without directory or extension, in the same order as all_files.
file_name = [os.path.splitext(os.path.basename(files))[0] for files in all_files]

# Sites that are skipped without crawling; matched as substrings of the link.
unreachableSites = ['http://cumillabarta.com', 'https://www.analysisbd.net', 'https://bit.ly', 'https://www.hasivalobashi.club',
                    'https://www.bengalbreakingnews.com', 'https://dailymorning24.com', 'https://www.sangbad24x7.com/',
                    'http://www.naturalhealthtips.us/', 'https://kalerdarpan24.com', 'https://notunalo.press/',
                    'https://www.timeofkushtia.com/', 'https://somoybd24.info/', 'https://www.sarakhon.com/']

for j in range(len(all_files)):
  df = pd.read_csv(all_files[j], encoding='utf-8')
  df = replace_descriptions_with_page_text(df, extract_text_from_single_web_page, unreachableSites)

  df.to_csv(editedPath + file_name[j] + 'crawled.csv', index=False, encoding='utf-8')
  print(editedPath + file_name[j] + 'crawled.csv created')

## **Testing**

In [None]:
# Smoke test: run the extractor on one article URL and show what comes back.
single_url = 'https://www.analysisbd.net/archives/15992'
text = extract_text_from_single_web_page(single_url)

print(text)

# Identity check against the np.nan object, which the extractor returns on failure.
page_unreachable = text is np.nan
if page_unreachable:
  print('NAN')