# Use Common Crawl Data
> How do you use Common Crawl data? There are two ways to do it: by dump ID, or from a folder of pre-downloaded WET files.

## Dump-ID
> The Common Crawl dump ID identifies a crawl by its date, e.g. `2023-23`.

In [1]:
from omegaconf import OmegaConf

# Build the configuration from plain Python pieces, then wrap it with OmegaConf.

# Spark section: application name and driver memory.
spark_settings = {
    'appname': 'CommonCrawl',
    'driver': {'memory': '16g'},
}

# Ingestion step keyed by Common Crawl dump ID.
# NOTE(review): `segment_n` is presumably the number of WET segments to fetch;
# the recorded run below downloaded a single WET file -- confirm.
dump_ingestion_step = {
    'name': 'data_ingestion___common_crawl___dump2raw',
    'args': {
        'dump': "2023-23",
        'segment_n': 1,
    },
}

# Language filter step; `whitelist` and `threshold` are handed over as its args.
language_filter_step = {
    'name': 'quality___language___fasttext_filter',
    'args': {
        'whitelist': ['ko'],
        'threshold': 0.5,
    },
}

# Steps in the order they are listed in the config.
etl_steps = [
    dump_ingestion_step,
    {'name': 'data_ingestion___common_crawl___raw2ufl'},
    {'name': 'cleaning___normalization___number'},
    {'name': 'deduplication___common_crawl___exact_line'},
    language_filter_step,
    {'name': 'data_load___huggingface___ufl2hf_obj'},
]

ETL_config = OmegaConf.create({'spark': spark_settings, 'etl': etl_steps})

# Show the resulting configuration as YAML.
print(OmegaConf.to_yaml(ETL_config))

spark:
  appname: CommonCrawl
  driver:
    memory: 16g
etl:
- name: data_ingestion___common_crawl___dump2raw
  args:
    dump: 2023-23
    segment_n: 1
- name: data_ingestion___common_crawl___raw2ufl
- name: cleaning___normalization___number
- name: deduplication___common_crawl___exact_line
- name: quality___language___fasttext_filter
  args:
    whitelist:
    - ko
    threshold: 0.5
- name: data_load___huggingface___ufl2hf_obj



In [2]:
from dataverse.etl import ETLPipeline

# Create the pipeline runner (ETLPipeline is imported from dataverse.etl above).
etl_pipeline = ETLPipeline()

# Execute the configured pipeline: raw -> hf_obj.
# NOTE(review): the recorded output shows a Hugging Face `Dataset` repr, presumably
# produced by the final `ufl2hf_obj` step -- confirm against ETLPipeline.run.
dataset = etl_pipeline.run(ETL_config)
# Bare last expression so the notebook displays the returned object.
dataset

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/02 09:33:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Starting download of https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-23/wet.paths.gz




Downloaded https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-23/wet.paths.gz [200] took 22s (7.1kB/s)


Starting download of https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-23/segments/1685224650409.64/wet/CC-MAIN-20230604225057-20230605015057-00644.warc.wet.gz
Downloaded https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-23/segments/1685224650409.64/wet/CC-MAIN-20230604225057-20230605015057-00644.warc.wet.gz [200] took 7s (16403.0kB/s)
                                                                                

Downloading and preparing dataset spark/-1902204008 to /root/.cache/huggingface/datasets/spark/-1902204008/0.0.0...


                                                                                

Dataset spark downloaded and prepared to /root/.cache/huggingface/datasets/spark/-1902204008/0.0.0. Subsequent calls will reuse this data.


Dataset({
    features: ['id', 'meta', 'name', 'text'],
    num_rows: 292
})

In [3]:
# Inspect the first record of the dataset returned by the pipeline.
dataset[0]

{'id': '7ed7ff60791711ee892062800acfdc4f',
 'meta': '{"title": "[\\ud3ec\\ud1a0] \\ub098\\ub098, \'\\ub9ac\\uc5bc \\ubc14\\ube44\\uc778\\ud615\' - \\uc804\\uc790\\uc2e0\\ubb38", "url": "https://www.etnews.com/20220929000212?mc=ev_002_00003", "date_download": "2023-06-05T00:07:10Z", "digest": "sha1:5RWOEBD2HWDDE3XCZZPFIAOW3AHCKN47", "length": 3179, "nlines": 103, "source_domain": "www.etnews.com", "cc_segment": "crawl-data/CC-MAIN-2023-23/segments/1685224650409.64/wet/CC-MAIN-20230604225057-20230605015057-00644.warc.wet.gz"}',
 'name': 'common_crawl',
 'text': "allshowTV\nETstudio\n속보\n경제·금융\n전자·모빌리티\n통신·미디어·게임\n소재·부품\nSW·보안\n산업·에너지·환경\n플랫폼·유통\n벤처·바이오\n정치\n국제\n골프\n화제의뉴스\n인사·부음\n오피니언\n특집\n연재\n비주얼IT\n스페셜리포트\n뷰포인트\n인포그래픽\n라이프\n공연전시\n생활문화\n부가서비스\nIT교육지원캠페인\n콘퍼런스\nIT 전시 컨벤션\nET프리미엄\n시사용어\nPDF 서비스\n서비스 안내\n신문구독신청\n온라인광고안내\n콘텐츠구매\n초판서비스\n번역센터\n회원 서비스\n패밀리미디어\n서울신문\nRPM0\nEBN 산업경제신문\n날씨\n[포토] 나나, '리얼 바비인형'\n발행일 : 0000-00-00 00:00\n00일 오전 서울 용산구 한강로 용산 아이파크몰 용산 CGV에서 넷플릭스 시리즈 '글리치' 제작발표회가 열렸다.\n

## WET folder
> Use pre-downloaded WET files.

We are going to reuse the Common Crawl files that were cached when we ran the dump-ID ETL example above. Time to use them!

In [4]:
from dataverse.utils.setting import SystemSetting
from pathlib import Path

In [5]:
# Folder holding the WET files cached by the dump-ID example above; it sits under
# the cache directory reported by dataverse's SystemSetting.
wet_path = Path(SystemSetting().CACHE_DIR).joinpath(
    '.cache', 'dataverse', 'dataset', 'common_crawl_2023-23'
)

In [6]:
from omegaconf import OmegaConf

# Build the configuration from plain Python pieces, then wrap it with OmegaConf.

# Spark section: application name and driver memory.
spark_settings = {
    'appname': 'CommonCrawl',
    'driver': {'memory': '16g'},
}

# Ingestion step that reads WET files from a local folder instead of a dump ID.
# The path is converted to a string before it goes into the config.
wet_ingestion_step = {
    'name': 'data_ingestion___common_crawl___wet2raw',
    'args': {
        'wet_path': str(wet_path),
    },
}

# Language filter step; `whitelist` and `threshold` are handed over as its args.
language_filter_step = {
    'name': 'quality___language___fasttext_filter',
    'args': {
        'whitelist': ['ko'],
        'threshold': 0.5,
    },
}

# Steps in the order they are listed in the config.
etl_steps = [
    wet_ingestion_step,
    {'name': 'data_ingestion___common_crawl___raw2ufl'},
    {'name': 'cleaning___normalization___number'},
    {'name': 'deduplication___common_crawl___exact_line'},
    language_filter_step,
    {'name': 'data_load___huggingface___ufl2hf_obj'},
]

ETL_config = OmegaConf.create({'spark': spark_settings, 'etl': etl_steps})

# Show the resulting configuration as YAML.
print(OmegaConf.to_yaml(ETL_config))

spark:
  appname: CommonCrawl
  driver:
    memory: 16g
etl:
- name: data_ingestion___common_crawl___wet2raw
  args:
    wet_path: /root/.cache/dataverse/dataset/common_crawl_2023-23
- name: data_ingestion___common_crawl___raw2ufl
- name: cleaning___normalization___number
- name: deduplication___common_crawl___exact_line
- name: quality___language___fasttext_filter
  args:
    whitelist:
    - ko
    threshold: 0.5
- name: data_load___huggingface___ufl2hf_obj



In [7]:
from dataverse.etl import ETLPipeline

# Create a fresh pipeline runner for the WET-folder configuration.
etl_pipeline = ETLPipeline()

# Execute the configured pipeline: raw -> hf_obj.
# Here ingestion reads from the local `wet_path` folder set in ETL_config.
dataset = etl_pipeline.run(ETL_config)
# Bare last expression so the notebook displays the returned object.
dataset

                                                                                

Downloading and preparing dataset spark/-1300538864 to /root/.cache/huggingface/datasets/spark/-1300538864/0.0.0...


                                                                                

Dataset spark downloaded and prepared to /root/.cache/huggingface/datasets/spark/-1300538864/0.0.0. Subsequent calls will reuse this data.


Dataset({
    features: ['id', 'meta', 'name', 'text'],
    num_rows: 292
})

In [None]:
# Inspect the first record of the dataset built from the WET folder.
dataset[0]

{'id': '006dad56791611ee872a62800acfdc4f',
 'meta': '{"title": "\\u0413\\u043e\\u043d\\u0449\\u0438\\u043a\\u0438 Racing Point \\u043e \\u0442\\u0440\\u0430\\u0441\\u0441\\u0435 \\u0432 \\u041c\\u043e\\u043d\\u0430\\u043a\\u043e \\u2014 \\u0410\\u0432\\u0442\\u043e\\u043c\\u043e\\u0431\\u0438\\u043b\\u044c\\u043d\\u044b\\u0439 \\u043f\\u043e\\u0440\\u0442\\u0430\\u043b", "url": "http://barclay-auto.ru/gonshhiki-racing-point-o-trasse-v-monako/", "date_download": "2023-06-05T00:19:25Z", "digest": "sha1:QHIP3XNLR4MN276MCDG5B7OYV6TGOVTW", "length": 4285, "nlines": 42, "source_domain": "barclay-auto.ru", "cc_segment": "crawl-data/CC-MAIN-2023-23/segments/1685224650409.64/wet/CC-MAIN-20230604225057-20230605015057-00644.warc.wet.gz"}',
 'name': 'common_crawl',
 'text': 'Гонщики Racing Point о трассе в Монако\nВ этот уик-энд должен был пройти седьмой этап сезона – Гран При Монако, однако из-за пандемии коронавируса гонку пришлось отменить. Тем не менее, пресс-служба Racing Point попросила Лэнс

## WET folder - Add MinhashLSH fuzzy deduplication
> Same as the WET folder example above, but with more preprocessing!


In [23]:
from omegaconf import OmegaConf

# Build the configuration from plain Python pieces, then wrap it with OmegaConf.

# Spark section: application name and driver memory.
spark_settings = {
    'appname': 'CommonCrawl',
    'driver': {'memory': '16g'},
}

# Ingestion step that reads WET files from a local folder.
# The path is converted to a string before it goes into the config.
wet_ingestion_step = {
    'name': 'data_ingestion___common_crawl___wet2raw',
    'args': {
        'wet_path': str(wet_path),
    },
}

# Language filter step; `whitelist` and `threshold` are handed over as its args.
language_filter_step = {
    'name': 'quality___language___fasttext_filter',
    'args': {
        'whitelist': ['ko'],
        'threshold': 0.5,
    },
}

# Steps in the order they are listed in the config. Compared with the plain
# WET-folder setup, the MinHash LSH step is listed before the exact-line
# deduplication step.
etl_steps = [
    wet_ingestion_step,
    {'name': 'data_ingestion___common_crawl___raw2ufl'},
    {'name': 'cleaning___normalization___number'},
    {'name': 'deduplication___minhash___lsh_jaccard'},
    {'name': 'deduplication___common_crawl___exact_line'},
    language_filter_step,
    {'name': 'data_load___huggingface___ufl2hf_obj'},
]

ETL_config = OmegaConf.create({'spark': spark_settings, 'etl': etl_steps})

# Show the resulting configuration as YAML.
print(OmegaConf.to_yaml(ETL_config))

spark:
  appname: CommonCrawl
  driver:
    memory: 16g
etl:
- name: data_ingestion___common_crawl___wet2raw
  args:
    wet_path: /root/.cache/dataverse/dataset/common_crawl_2023-23
- name: data_ingestion___common_crawl___raw2ufl
- name: cleaning___normalization___number
- name: deduplication___minhash___lsh_jaccard
- name: deduplication___common_crawl___exact_line
- name: quality___language___fasttext_filter
  args:
    whitelist:
    - ko
    threshold: 0.5
- name: data_load___huggingface___ufl2hf_obj



In [24]:
from dataverse.etl import ETLPipeline

# Create a fresh pipeline runner for the configuration with the MinHash LSH step.
etl_pipeline = ETLPipeline()

# Execute the configured pipeline: raw -> hf_obj.
dataset = etl_pipeline.run(ETL_config)
# Bare last expression so the notebook displays the returned object.
dataset

23/11/02 09:53:07 WARN CacheManager: Asked to cache already cached data.        
                                                                                

Downloading and preparing dataset spark/1061218979 to /root/.cache/huggingface/datasets/spark/1061218979/0.0.0...


                                                                                

Dataset spark downloaded and prepared to /root/.cache/huggingface/datasets/spark/1061218979/0.0.0. Subsequent calls will reuse this data.


Dataset({
    features: ['id', 'meta', 'name', 'text'],
    num_rows: 287
})

In [20]:
# Inspect the first record of the dataset produced with the extra MinHash LSH step.
dataset[0]

{'id': '5c63d632791911ee8d2062800acfdc4f',
 'meta': '{"title": "Advanced Rail Concepts > Roto Gripp Bucket", "url": "http://advancedrailconcepts.com/RotoGrippBucket.aspx", "date_download": "2023-06-05T01:08:24Z", "digest": "sha1:D27CRHQDVRYISILRFFDETNGFPDZLNSMO", "length": 1746, "nlines": 61, "source_domain": "advancedrailconcepts.com", "cc_segment": "/root/.cache/dataverse/dataset/common_crawl_2023-23/CC-MAIN-20230604225057-20230605015057-00644.warc.wet.gz"}',
 'name': 'common_crawl',
 'text': 'Home Rail X Roto Gripp Bucket Options Surplus Awards About Us Contact Us\nRail X\nRoto Gripp Bucket\nSurplus\nSunday , June , 00 , 0000\nThe One Bucket... that SHOUTS Mobility & Versatility...\n"So when you want to get a grip... Get the Roto GrippTM"\nThis Unique Tool...\nis by far the best attachment available for the Rail X and any other standard excavator. This full 000 degree rotating jaw bucket has the ability to make efficient detailed adjustments in many directions standing still or on t