# Use Common Crawl Data
> How do you use Common Crawl data? There are two ways to do it: by dump ID, or from a folder of pre-downloaded WET files.

## 🌌 Dump-ID
> The Common Crawl dump ID corresponds to the date of the crawl, e.g. `2023-23`.

In [1]:
from omegaconf import OmegaConf

# Assemble the ETL configuration from plain Python containers, one section at a time.

# Settings placed under the top-level `spark` key.
spark_section = {
    'appname': 'CommonCrawl',
    'driver': {'memory': '16g'},
}

# Steps placed under the top-level `etl` key, in the order they are listed here.
# The first step is the only one that differs from the WET-folder variant of this
# config: it takes a dump ID instead of a local path.
etl_steps = [
    {
        'name': 'data_ingestion___common_crawl___dump2raw',
        'args': {'dump': "2023-23", 'segment_n': 1},
    },
    {'name': 'data_ingestion___common_crawl___raw2ufl'},
    {'name': 'cleaning___normalization___number'},
    {'name': 'deduplication___common_crawl___exact_line'},
    {
        'name': 'quality___language___fasttext_filter',
        'args': {'whitelist': ['ko'], 'threshold': 0.5},
    },
    {'name': 'data_load___huggingface___ufl2hf_obj'},
]

ETL_config = OmegaConf.create({'spark': spark_section, 'etl': etl_steps})

# Print the YAML form of the config so it can be checked at a glance.
print(OmegaConf.to_yaml(ETL_config))

spark:
  appname: CommonCrawl
  driver:
    memory: 16g
etl:
- name: data_ingestion___common_crawl___dump2raw
  args:
    dump: 2023-23
    segment_n: 1
- name: data_ingestion___common_crawl___raw2ufl
- name: cleaning___normalization___number
- name: deduplication___common_crawl___exact_line
- name: quality___language___fasttext_filter
  args:
    whitelist:
    - ko
    threshold: 0.5
- name: data_load___huggingface___ufl2hf_obj



In [2]:
from dataverse.etl import ETLPipeline

# Create the pipeline object that will execute the config built above.
etl_pipeline = ETLPipeline()

# raw -> hf_obj
# `run` returns a pair, unpacked here into `spark` and `dataset`.
# In the recorded run, `dataset` displays as a Hugging Face-style `Dataset`
# with features id / meta / name / text.
spark, dataset = etl_pipeline.run(ETL_config)
# Bare last expression so the notebook renders the dataset summary.
dataset

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/14 22:09:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/14 22:09:41 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
                                                                                

Downloading and preparing dataset spark/-572665896 to /root/.cache/huggingface/datasets/spark/-572665896/0.0.0...


                                                                                

Dataset spark downloaded and prepared to /root/.cache/huggingface/datasets/spark/-572665896/0.0.0. Subsequent calls will reuse this data.


Dataset({
    features: ['id', 'meta', 'name', 'text'],
    num_rows: 292
})

In [3]:
# Look at the first record to check its fields (id, meta, name, text).
dataset[0]

{'id': '19ee2ac082ef11eeae4262800acfdc4f',
 'meta': '{"title": "\\uc640\\uae00\\uc640\\uae00 - \\uc7ac\\ubbf8", "url": "http://wagle.isplus.joins.com/app/index.php?mid=wg_fun&page=6", "date_download": "2023-06-05T00:45:09Z", "digest": "sha1:UDASCLMI7FRAUR5PKBHJZ6DZSBZPZTFI", "length": 2557, "nlines": 45, "source_domain": "wagle.isplus.joins.com", "cc_segment": "crawl-data/CC-MAIN-2023-23/segments/1685224650409.64/wet/CC-MAIN-20230604225057-20230605015057-00644.warc.wet.gz"}',
 'name': 'common_crawl',
 'text': "조인스\n와글와글 전체 목록\n조회\n0000 매경기 재평가되는 맨유짤 더레즈 0000-00-00\n0000 아스날이 0경기 0승을 한 이유? 구0000너 0000-00-00\n0000 은근 축구 혼자서 다하는 선수 풋스타 0000-00-00\n0000 [오피셜] 아스날, 리그 0위로 0라운드 종료 아스날아.. 0000-00-00\n0000 [놀람] 놀랄 수 밖에 없는 첼시 선발라인업 케파멘디 0000-00-00\n0000 [감동] 분데스리가 00번의 과거와 미래 포항항 0000-00-00\n0000 ???:너네들 재미있어보이네~ 에밀홀딩 0000-00-00\n0000 [정보]0000년 0회 이상 우승한팀 어우뮌x0 0000-00-00\n0000 커뮤니티실드에서의 리버풀 해리킼웰 0000-00-00\n0000 [유머] ?? : 아스날... 생각보다 강팀이잖아..? 금발롱 0000-00-00\n0000 (감동)??:우....승...뭐라고? 티아구메.. 00

## 🌌 WET folder
> use pre-downloaded WET files

We are going to reuse the cached Common Crawl files that were just downloaded while running the dump-ID ETL example above. Time to use them!

In [4]:
from dataverse.utils.setting import SystemSetting
from pathlib import Path

In [5]:
# Folder of cached WET files for dump 2023-23, located under the configured cache directory.
# In the recorded run this resolves to /root/.cache/dataverse/dataset/common_crawl_2023-23.
wet_path = Path(SystemSetting().CACHE_DIR).joinpath(
    '.cache',
    'dataverse',
    'dataset',
    'common_crawl_2023-23',
)

In [6]:
from omegaconf import OmegaConf

# Assemble the ETL configuration from plain Python containers, one section at a time.

# Settings placed under the top-level `spark` key.
spark_section = {
    'appname': 'CommonCrawl',
    'driver': {'memory': '16g'},
}

# Steps placed under the top-level `etl` key, in the order they are listed here.
# The first step reads from the local WET folder; OmegaConf gets the path as a
# plain string, hence the `str(...)` conversion of the `Path` object.
etl_steps = [
    {
        'name': 'data_ingestion___common_crawl___wet2raw',
        'args': {'wet_path': str(wet_path)},
    },
    {'name': 'data_ingestion___common_crawl___raw2ufl'},
    {'name': 'cleaning___normalization___number'},
    {'name': 'deduplication___common_crawl___exact_line'},
    {
        'name': 'quality___language___fasttext_filter',
        'args': {'whitelist': ['ko'], 'threshold': 0.5},
    },
    {'name': 'data_load___huggingface___ufl2hf_obj'},
]

ETL_config = OmegaConf.create({'spark': spark_section, 'etl': etl_steps})

# Print the YAML form of the config so it can be checked at a glance.
print(OmegaConf.to_yaml(ETL_config))

spark:
  appname: CommonCrawl
  driver:
    memory: 16g
etl:
- name: data_ingestion___common_crawl___wet2raw
  args:
    wet_path: /root/.cache/dataverse/dataset/common_crawl_2023-23
- name: data_ingestion___common_crawl___raw2ufl
- name: cleaning___normalization___number
- name: deduplication___common_crawl___exact_line
- name: quality___language___fasttext_filter
  args:
    whitelist:
    - ko
    threshold: 0.5
- name: data_load___huggingface___ufl2hf_obj



In [7]:
from dataverse.etl import ETLPipeline

# Create a fresh pipeline object for the WET-folder config.
etl_pipeline = ETLPipeline()

# raw -> hf_obj
# `run` returns a pair, unpacked here into `spark` and `dataset`
# (both names are rebound, replacing the values from the dump-ID run).
spark, dataset = etl_pipeline.run(ETL_config)
# Bare last expression so the notebook renders the dataset summary.
dataset

23/11/14 22:10:11 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


                                                                                

Downloading and preparing dataset spark/-1399168669 to /root/.cache/huggingface/datasets/spark/-1399168669/0.0.0...


                                                                                

Dataset spark downloaded and prepared to /root/.cache/huggingface/datasets/spark/-1399168669/0.0.0. Subsequent calls will reuse this data.


Dataset({
    features: ['id', 'meta', 'name', 'text'],
    num_rows: 292
})

In [8]:
# Look at the first record; in the recorded run its `cc_segment` metadata
# points at the local WET file rather than a crawl-data path.
dataset[0]

{'id': '29551d2082ef11eea9d462800acfdc4f',
 'meta': '{"title": "\\uc640\\uae00\\uc640\\uae00 - \\uc7ac\\ubbf8", "url": "http://wagle.isplus.joins.com/app/index.php?mid=wg_fun&page=6", "date_download": "2023-06-05T00:45:09Z", "digest": "sha1:UDASCLMI7FRAUR5PKBHJZ6DZSBZPZTFI", "length": 2557, "nlines": 45, "source_domain": "wagle.isplus.joins.com", "cc_segment": "/root/.cache/dataverse/dataset/common_crawl_2023-23/CC-MAIN-20230604225057-20230605015057-00644.warc.wet.gz"}',
 'name': 'common_crawl',
 'text': "조인스\n와글와글 전체 목록\n조회\n0000 매경기 재평가되는 맨유짤 더레즈 0000-00-00\n0000 아스날이 0경기 0승을 한 이유? 구0000너 0000-00-00\n0000 은근 축구 혼자서 다하는 선수 풋스타 0000-00-00\n0000 [오피셜] 아스날, 리그 0위로 0라운드 종료 아스날아.. 0000-00-00\n0000 [놀람] 놀랄 수 밖에 없는 첼시 선발라인업 케파멘디 0000-00-00\n0000 [감동] 분데스리가 00번의 과거와 미래 포항항 0000-00-00\n0000 ???:너네들 재미있어보이네~ 에밀홀딩 0000-00-00\n0000 [정보]0000년 0회 이상 우승한팀 어우뮌x0 0000-00-00\n0000 커뮤니티실드에서의 리버풀 해리킼웰 0000-00-00\n0000 [유머] ?? : 아스날... 생각보다 강팀이잖아..? 금발롱 0000-00-00\n0000 (감동)??:우....승...뭐라고? 티아구메.. 0000-00

## 🌌 WET folder - Add MinhashLSH fuzzy deduplication
> Same pipeline as the WET folder example, with a MinHash LSH fuzzy deduplication step added.


In [9]:
from omegaconf import OmegaConf

# Assemble the ETL configuration from plain Python containers, one section at a time.

# Settings placed under the top-level `spark` key.
spark_section = {
    'appname': 'CommonCrawl',
    'driver': {'memory': '16g'},
}

# Steps placed under the top-level `etl` key, in the order they are listed here.
# Compared with the plain WET-folder config, the only addition is the
# `deduplication___minhash___lsh_jaccard` step, placed before the exact-line step.
etl_steps = [
    {
        'name': 'data_ingestion___common_crawl___wet2raw',
        'args': {'wet_path': str(wet_path)},
    },
    {'name': 'data_ingestion___common_crawl___raw2ufl'},
    {'name': 'cleaning___normalization___number'},
    {'name': 'deduplication___minhash___lsh_jaccard'},
    {'name': 'deduplication___common_crawl___exact_line'},
    {
        'name': 'quality___language___fasttext_filter',
        'args': {'whitelist': ['ko'], 'threshold': 0.5},
    },
    {'name': 'data_load___huggingface___ufl2hf_obj'},
]

ETL_config = OmegaConf.create({'spark': spark_section, 'etl': etl_steps})

# Print the YAML form of the config so it can be checked at a glance.
print(OmegaConf.to_yaml(ETL_config))

spark:
  appname: CommonCrawl
  driver:
    memory: 16g
etl:
- name: data_ingestion___common_crawl___wet2raw
  args:
    wet_path: /root/.cache/dataverse/dataset/common_crawl_2023-23
- name: data_ingestion___common_crawl___raw2ufl
- name: cleaning___normalization___number
- name: deduplication___minhash___lsh_jaccard
- name: deduplication___common_crawl___exact_line
- name: quality___language___fasttext_filter
  args:
    whitelist:
    - ko
    threshold: 0.5
- name: data_load___huggingface___ufl2hf_obj



In [10]:
from dataverse.etl import ETLPipeline

# Create a fresh pipeline object for the config that includes the MinHash LSH step.
etl_pipeline = ETLPipeline()

# raw -> hf_obj
# `run` returns a pair, unpacked here into `spark` and `dataset`.
# In the recorded run this yields 285 rows, versus 292 without the MinHash LSH step.
spark, dataset = etl_pipeline.run(ETL_config)
# Bare last expression so the notebook renders the dataset summary.
dataset

23/11/14 22:10:34 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


                                                                                

Downloading and preparing dataset spark/2085970941 to /root/.cache/huggingface/datasets/spark/2085970941/0.0.0...


                                                                                

Dataset spark downloaded and prepared to /root/.cache/huggingface/datasets/spark/2085970941/0.0.0. Subsequent calls will reuse this data.


Dataset({
    features: ['id', 'meta', 'name', 'text'],
    num_rows: 285
})

In [11]:
# Look at the first record of the fuzzy-deduplicated dataset.
dataset[0]

{'id': '3aa3dddc82ef11ee898d62800acfdc4f',
 'meta': '{"title": "\\ub3d9\\uc601\\uc0c1 | \\uc6b0\\ub9ac \\ud568\\uaed8 \\ub9cc\\ub4e4\\uc5b4 \\ubd05\\uc2dc\\ub2e4.(57.\\ud478\\ucd08\\ubcf6\\uc74c)\\u200b", "url": "https://dprktoday.com/videos/16055?list=", "date_download": "2023-06-05T01:01:45Z", "digest": "sha1:6TKZ4VWGQESC6HVNGQS3ESIE4BR63V25", "length": 4007, "nlines": 317, "source_domain": "dprktoday.com", "cc_segment": "/root/.cache/dataverse/dataset/common_crawl_2023-23/CC-MAIN-20230604225057-20230605015057-00644.warc.wet.gz"}',
 'name': 'common_crawl',
 'text': '첫페지로\n날자별열람\n손전화홈페지열람기\n조선어 English 中国语 Русский\n정치\n경제\n군사\n사회문화\n조국통일\n관광\n력사\n로작\n기 사\n동영상\n사 진\n음악감상\n전체\n혁명활동소식\n기록영화\n회고록《세기와 더불어》\n《조선의 오늘》동영상\n조선중앙TV\nU C C\n국제친선전람관을 찾아서 |\n국가선물관을 찾아서 |\n특집 |\n생활의 랑만과 정서 |\n미덕의 향기 |\n인물소개 |\n예술공연 |\n아동무대 |\n조선영화 |\nTV예술영화 |\nTV련속소설 |\nTV련속극 |\nTV극 |\nTV기록영화 |\nTV기록편집물 |\n사이프로편집물 |\n만화영화 |\n인기동영상 |\n화면취재시간 |\n민족의 자취를 찾아서 |\n우리함께 |\n조선의 숨결 |\n이 시각 평양, 그 한토막 |\n나는 좋아요 |\n료리백과 |\n[료리