In [4]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
import json
import os
from kaggle.api.kaggle_api_extended import KaggleApi
import pymongo
# connect to mongodb client


In [3]:
!pip install kaggle
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (603 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m603.6/603.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.4.2-py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.4/300.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.4.2 pymongo-4.4.1


Crawler

In [5]:
class Crawler:
    """Base class for crawlers that collect dataset metadata into MongoDB.

    Attributes:
        save_path: File path that ``save()`` writes ``dt`` to.
        url: Source URL; empty here, each subclass sets its own.
        dt: Crawled data (JSON-serialisable); starts as an empty dict.
        collection: The ``metadata`` collection of the ``Crawl-Data`` database.
    """

    # Environment variable consulted for the MongoDB connection string when
    # none is passed explicitly, so credentials never live in the notebook.
    MONGO_URI_ENV = "MONGODB_URI"

    def __init__(self, save_path, mongo_uri=None):
        """Store the output path and open the MongoDB collection.

        Args:
            save_path: Destination file used by ``save()``.
            mongo_uri: Optional MongoDB connection string. When omitted, the
                value of the ``MONGODB_URI`` environment variable is used,
                falling back to an empty string if that is unset.
        """
        self.save_path = save_path  # storage path
        self.url = ""  # differs for every crawler
        self.dt = {}
        # The connection string is deliberately not written in the source.
        if mongo_uri is None:
            mongo_uri = os.environ.get(self.MONGO_URI_ENV, "")
        client = pymongo.MongoClient(mongo_uri)
        db = client['Crawl-Data']
        self.collection = db['metadata']

    def save(self):
        """Serialise ``self.dt`` as JSON and write it to ``self.save_path``."""
        with open(self.save_path, 'w', encoding='utf-8') as f:
            json.dump(self.dt, f)

UCI

In [6]:
class UCI(Crawler):
    """Crawler for the dataset listing of the UCI Machine Learning Repository."""

    BASE_URL = "https://archive.ics.uci.edu"
    # take=1000 assumes the UCI archive holds no more than 1k datasets.
    LISTING_URL = BASE_URL + "/datasets?skip=0&take=1000&sort=desc&orderBy=NumHits&search="
    # Seconds to wait on each HTTP request; without it a stalled connection
    # would hang the crawl indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, save_path):
        """Initialise the base crawler and record the UCI source URL."""
        super().__init__(save_path)
        self.url="https://archive.ics.uci.edu/ml/datasets.php"

    def process_data(self):
        """Scrape every dataset page linked from the listing into ``self.dt``.

        ``self.dt`` becomes a list of records (dicts) with the keys ``id``,
        ``url``, ``Title``, ``Description`` plus up to six further keys named
        after the 2nd-7th ``<h1>`` headings of the first dataset page that was
        fetched successfully. ``Title`` is the first ``<h1>`` of a page; the
        remaining values are the leading ``<p>`` texts of that page, in order
        (that the headings and paragraphs line up is assumed from the page
        layout, not verified here). Missing values are stored as ``None``.
        If the listing yields no dataset links, ``self.dt`` is an empty list.

        Raises:
            requests.RequestException: If the listing page cannot be fetched.
        """
        listing = requests.get(self.LISTING_URL, timeout=self.REQUEST_TIMEOUT)
        listing.raise_for_status()
        listing_soup = bs(listing.content, "html.parser")

        hrefs = set()
        for h2 in listing_soup.find_all('h2'):
            link = h2.find('a')
            # Not every <h2> is guaranteed to wrap a link; skip those that do not.
            if link is not None and link.get('href'):
                hrefs.add(link['href'])

        headers = None  # extra column names, taken from the first good page
        rows = []
        # Sorted so the crawl order (and the page supplying the column names)
        # does not depend on set iteration order.
        for href in tqdm(sorted(hrefs)):
            full_url = self.BASE_URL + href
            try:
                page = requests.get(full_url, timeout=self.REQUEST_TIMEOUT)
                page.raise_for_status()
            except requests.RequestException as exc:
                # One unreachable page should not abort a multi-minute crawl.
                tqdm.write("Skipping {}: {}".format(full_url, exc))
                continue
            page_soup = bs(page.text, "html.parser")

            if headers is None:
                headers = [h.text for h in page_soup.find_all('h1')][1:7]

            title_tag = page_soup.find('h1')
            title = title_tag.text if title_tag is not None else None

            # One paragraph for "Description" plus one per extra header; pad
            # short pages so every row has the same width as the column list.
            n_fields = 1 + len(headers)
            fields = [p.text for p in page_soup.find_all('p')][:n_fields]
            fields += [None] * (n_fields - len(fields))

            rows.append([href[href.rfind("/")+1:], full_url, title] + fields)

        if not rows:
            self.dt = []
            return

        columns = ["id", "url", "Title", "Description"] + headers
        table = pd.DataFrame(rows, columns=columns)
        self.dt = json.loads(table.to_json(orient = "records"))

    def upload_data(self):
        """Insert every record of ``self.dt`` whose ``url`` is not yet stored.

        Prints how many records were inserted.
        """
        count = 0
        for data in self.dt:
            # Names can repeat across different datasets, so the URL is used
            # as the identity of a record.
            existing_data = self.collection.find_one({"url": data["url"]})
            if existing_data is None:
                # Data is not in the collection, so insert it
                self.collection.insert_one(data)
                count += 1
        print('Inserted', count, "number of data")

Kaggle

In [None]:
class Kaggle(Crawler):
    """Crawler that copies Kaggle dataset metadata into the MongoDB collection."""

    # File name looked up inside the ``save_metadata`` directory after each
    # ``dataset_metadata`` call.
    METADATA_FILENAME = 'dataset-metadata.json'

    def __init__(self, save_path, save_metadata='/content/metadata.json'):
        """Initialise the base crawler and remember where metadata is downloaded.

        Args:
            save_path: Destination file used by ``save()``.
            save_metadata: Directory passed to ``KaggleApi.dataset_metadata``
                as the download location. Defaults to the path this crawler
                has always used, so a one-argument construction works.
        """
        super().__init__(save_path)
        self.save_metadata = save_metadata
        self.url="https://www.kaggle.com/datasets"

    def upload_data(self):
        """Walk the Kaggle dataset listing and insert unseen datasets.

        For each listed dataset whose ``url`` is not already in the collection,
        its metadata file is downloaded, read back, tagged with ``name`` (the
        dataset ref) and ``url``, and inserted. Datasets whose metadata
        download fails are skipped. Prints how many records were inserted.
        """
        api = KaggleApi()

        #os.environ['KAGGLE_CONFIG_DIR'] = '/content'
        api.authenticate()
        metadata_path = os.path.join(self.save_metadata, self.METADATA_FILENAME)
        count = 0
        # Walk listing pages 1..500 (page size is decided by the Kaggle API).
        for page in range(1,501):
            datasets = api.dataset_list(page=page)
            if not datasets:
                # An empty page means the listing is exhausted.
                break
            for dataset in datasets:
                # Check for an existing record first so no metadata download
                # is spent on datasets that are already stored.
                if self.collection.find_one({"url": dataset.url}) is not None:
                    continue
                # The metadata file has to be downloaded and read back each
                # time to obtain the dataset information.
                try:
                    api.dataset_metadata(dataset.ref, path = self.save_metadata)
                except Exception:
                    # Best effort: skip datasets whose metadata cannot be
                    # fetched, but let KeyboardInterrupt/SystemExit through.
                    continue
                with open(metadata_path, encoding='utf-8') as metadata_file:
                    metadata = json.load(metadata_file)
                metadata['name'] = dataset.ref
                metadata['url'] = dataset.url
                # Add Kaggle data to MongoDB.
                self.collection.insert_one(metadata)
                count += 1
        print('Inserted', count, "number of data")

Test

In [7]:
# Crawl the UCI listing, then insert any records not yet stored in MongoDB.
# The path argument is only consumed by save(), which this cell never calls.
P=UCI("/Referene.txt")
P.process_data()
P.upload_data()

100%|██████████| 644/644 [06:10<00:00,  1.74it/s]


Inserted 0 number of data


In [None]:
# Crawl Kaggle dataset metadata into MongoDB. save_metadata is passed
# explicitly alongside save_path; it names the directory the metadata
# files are downloaded to.
Kaggleset=Kaggle("/Referene.txt", "/content/metadata.json")
Kaggleset.upload_data()