## Bibliotecas Necessárias

In [1]:
import re
import os
import sqlite3
import requests
import pandas as pd

from bs4 import BeautifulSoup
from time import time
from datetime import datetime

## Conexão com DB SQLite3

In [2]:
def create_connection(db_file):
    """Open a connection to the SQLite database stored at *db_file*.

    Args:
        db_file: Path of the SQLite database file (``':memory:'`` also works).

    Returns:
        An open ``sqlite3.Connection``, or ``None`` when the database could
        not be opened. In the failure case the error is printed, so callers
        must check for ``None`` before using the result.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    # The exception class lives in the sqlite3 namespace; a bare ``Error``
    # is undefined here and would turn every failure into a NameError.
    except sqlite3.Error as e:
        print(e)
    return conn

## Scraper > Dados no DataFrame > Exportando CSV > Inserindo no DB SQLite3

In [3]:
# Scrape the per-language Pastebin archives. For every listed paste, record
# its link, total word count, distinct word count, syntax label and pickup
# time; then append the rows for that language to data_scraping.csv and to
# the `pastebin` table of scraping.db.

base_url = 'https://pastebin.com' # main page
url_all = 'https://pastebin.com/archive' # list all pastes

list_lang = ['python','java','javascript','c','html4strict','php','lua']

# Column order must match the rows appended to url_list below.
columns = ['page_link', 'word_count', 'word_diff_count', 'syntax', 'pickup_time']

# Runs of non-alphanumeric characters are collapsed to a single space before
# the text is split into words. Compiled once instead of on every paste.
non_word_pattern = re.compile(r'[^A-Za-z0-9]+')

# Seconds to wait on any single HTTP request, so that a stalled connection
# raises instead of hanging the whole run.
request_timeout = 30

for lang in list_lang:
    url_lang = f'https://pastebin.com/archive/{lang}' # specific search lang
    response = requests.get(url_lang, timeout=request_timeout)

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_="maintable")

    url_list = []
    print('Extraindo dados: ', url_lang)

    # No archive table in the response (blocked request, rate limiting or a
    # layout change): nothing to extract for this language.
    if table is None:
        continue

    for link in table.select('tbody tr td span ~ a'):
        href = link.get('href')
        if not href:
            continue

        item_link = base_url + href
        # The words must be counted on the paste text only, so fetch the raw
        # endpoint (/raw/<key>) rather than the HTML page, whose markup and
        # navigation would inflate both counts.
        raw_link = base_url + '/raw' + href

        raw_response = requests.get(raw_link, timeout=request_timeout)
        page_response = requests.get(item_link, timeout=request_timeout)

        # A paste can disappear between listing and fetching; do not count
        # the words of an error page as if they were paste content.
        if not (raw_response.ok and page_response.ok):
            continue

        text_splitted = non_word_pattern.sub(' ', raw_response.text).split()
        word_count = len(text_splitted)
        word_diff_count = len(set(text_splitted))

        # The HTML page is still needed for the syntax label shown above the
        # highlighted code.
        page_soup = BeautifulSoup(page_response.content, 'html.parser')
        search = page_soup.select_one('div.highlighted-code > div.top-buttons > div.left > a')
        # Store a missing label as NULL instead of crashing on `None.text`.
        syntax = search.text if search is not None else None

        now = time()
        now_time = datetime.utcfromtimestamp(now).strftime('%Y-%m-%d %H:%M:%S')

        url_list.append([item_link, word_count, word_diff_count, syntax, now_time])

    # Skip persistence when nothing was collected: an empty frame would only
    # write a bare CSV header and create the table with a guessed schema.
    if not url_list:
        continue

    df = pd.DataFrame(data = url_list, columns = columns)

    # Write the header only when the file is being created by this call.
    df.to_csv('data_scraping.csv', index = False, mode = 'a', header = not os.path.exists('data_scraping.csv'))

    conn = create_connection('scraping.db')
    if conn is None:
        continue

    try:
        df.to_sql(name = 'pastebin', index = False, con = conn, if_exists = 'append')
    finally:
        # Release the database handle even when the insert fails.
        conn.close()

Extraindo dados:  https://pastebin.com/archive/python
Extraindo dados:  https://pastebin.com/archive/java
Extraindo dados:  https://pastebin.com/archive/javascript
Extraindo dados:  https://pastebin.com/archive/c
Extraindo dados:  https://pastebin.com/archive/html4strict
Extraindo dados:  https://pastebin.com/archive/php
Extraindo dados:  https://pastebin.com/archive/lua
