In [1]:
import torch
import resiliparse
from resiliparse import parse
from cs336_data.utils import extract_text_from_html_bytes

from warcio.archiveiterator import ArchiveIterator
import gzip
from resiliparse.extract.html2text import extract_plain_text
from resiliparse.parse.encoding import detect_encoding


In [2]:
warc_file_path = '../data/CC-MAIN-20180420081400-20180420101400-00000.warc.gz'
wet_file_path = '../data/CC-MAIN-20180420081400-20180420101400-00000.warc.wet.gz'

In [3]:
def load_html_by_uri_from_warc(warc_path, target_uri):
    with gzip.open(warc_path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type == 'response':
                record_uri = record.rec_headers.get_header('WARC-Target-URI')
                if record_uri == target_uri:
                    raw_html = record.content_stream().read()
                    return raw_html, record_uri
    return None, None

def load_html_from_warc(warc_path, doc_index=0):
    current_idx = -1
    with gzip.open(warc_path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            # Count only 'response' records with 'text/html' content
            if record.rec_type == 'response':
                content_type = record.http_headers.get_header('Content-Type', '').lower()
                if 'text/html' in content_type:
                    current_idx += 1
                    if current_idx == doc_index:
                        raw_html = record.content_stream().read()
                        url = record.rec_headers.get_header('WARC-Target-URI')
                        return raw_html, url
    return None, None

In [4]:
def load_text_and_url_from_wet(wet_path, doc_index=0):
    current_idx = -1
    content = []
    is_target_doc = False
    target_uri = None

    with gzip.open(wet_path, 'rt', encoding='utf-8') as file:
        for line in file:
            if line.startswith('WARC/1.0'):
                if is_target_doc:
                    break
                current_idx += 1
                content = []
                is_target_doc = False
                target_uri = None
                continue

            if line.startswith('WARC-Target-URI:'):
                target_uri = line.split('WARC-Target-URI: ')[1].strip()
                continue

            if line.strip() == '' and not is_target_doc:
                if current_idx == doc_index:
                    is_target_doc = True
                continue

            if is_target_doc:
                content.append(line.rstrip())

    return '\n'.join(content).strip(), target_uri


def load_text_from_wet(wet_path, doc_index=0):
    current_idx = -1
    content = []
    is_target_doc = False

    with gzip.open(wet_path, 'rt', encoding='utf-8') as file:
        for line in file:
            if line.startswith('WARC/1.0'):
                if is_target_doc:
                    break
                current_idx += 1
                content = []
                is_target_doc = False
                continue

            if line.strip() == '' and not is_target_doc:
                if current_idx == doc_index:
                    is_target_doc = True
                continue

            if is_target_doc:
                content.append(line.rstrip())

    return '\n'.join(content).strip()


In [5]:
doc_index = 1

# Load and decode HTML content from WARC file
html_bytes, url = load_html_from_warc(warc_file_path, doc_index)

if html_bytes:
    try:
        html_str = html_bytes.decode('utf-8')
    except UnicodeDecodeError:
        detected_encoding = detect_encoding(html_bytes)
        html_str = html_bytes.decode(detected_encoding, errors='replace')

    print("URL:", url)
    print("HTML content (first 1000 characters):")
    print(html_str[:1000])
else:
    print("No HTML content found in WARC file.")

# Load plain text content from WET file
wet_text = load_text_from_wet(wet_file_path, doc_index)

print("\nPlain text from WET file (first 1000 characters):")
print(wet_text[:1000])


URL: http://00monthly.com/kouchi/goiken/
HTML content (first 1000 characters):
<?xml version="1.0" encoding="shift_jis"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="ja" xml:lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=shift_jis" />
<title>ウィークリーマンション、マンスリーマンション高知の00マンスリードットコム</title>
<meta http-equiv="content-style-type" content="text/css" />
<meta http-equiv="content-script-type" content="text/javascript" />
<meta name="keywords" content="ウィークリーマンション,マンスリーマンション,高知,敷金,礼金,0,ゼロ,物件" />
<meta name="description" content="高知でウィークリーマンション、マンスリーマンション物件をお探しなら00マンスリードットコムで敷金礼金0の家具付き賃貸で仮住まい" />
<link href="/index.html" rel="index" />
<link href="/common/css/import.css" rel="stylesheet" type="text/css" />
<link href="/common/css/index.css" rel="stylesheet" type="text/css" />
</head>
<body>

<!--HEADER-->
<div id="header">

		<h1>ウィークリーマンション、

In [6]:
print("HTML content:")
print(html_str)

HTML content:
<?xml version="1.0" encoding="shift_jis"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="ja" xml:lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=shift_jis" />
<title>ウィークリーマンション、マンスリーマンション高知の00マンスリードットコム</title>
<meta http-equiv="content-style-type" content="text/css" />
<meta http-equiv="content-script-type" content="text/javascript" />
<meta name="keywords" content="ウィークリーマンション,マンスリーマンション,高知,敷金,礼金,0,ゼロ,物件" />
<meta name="description" content="高知でウィークリーマンション、マンスリーマンション物件をお探しなら00マンスリードットコムで敷金礼金0の家具付き賃貸で仮住まい" />
<link href="/index.html" rel="index" />
<link href="/common/css/import.css" rel="stylesheet" type="text/css" />
<link href="/common/css/index.css" rel="stylesheet" type="text/css" />
</head>
<body>

<!--HEADER-->
<div id="header">

		<h1>ウィークリーマンション、マンスリーマンション 高知</h1>
	<h2>
	ウィークリー、マンスリーマンションで</h2>
		<p id="paragr

In [None]:
# Get text and corresponding URI from WET file
doc_index = 0
wet_text, target_uri = load_text_and_url_from_wet(wet_file_path, doc_index)

if target_uri:
    print("Matched URL from WET file:", target_uri)

    # Get corresponding HTML from WARC file using URI
    html_bytes, warc_uri = load_html_by_uri_from_warc(warc_file_path, target_uri)

    if html_bytes:
        print("\nHTML snippet from WARC file:")
        print(html_bytes[:1000].decode('utf-8', errors='replace'))
    else:
        print("No matching HTML found in WARC for given URI.")
else:
    print("No URI found in WET document.")

print("\nWET text snippet:")
print(wet_text[:1000])


Matched URL from WET file: http://080.roomadult.net/index.phtml?PUT=a_show&AID=80690&FID=629256&R2=&CHANNEL=

HTML snippet from WARC file:
<html>

<head>
<title>
</title>ͺ�
<meta http-equiv="PICS-Label" content='(PICS-1.1 "http://www.ticrf.org.tw/chinese/html/06-rating-v11.htm" l gen true for "http://080.roomadult.net" r (s 3 l 3 v 3 o 0))'>
<meta http-equiv=content-type content="text/html; charset=big5">
<meta name="Keywords" content="">
<meta name="description" content="">
<style type="text/css"><!--
.text {
	font-size: 12px;
	line-height: 15px;
	color: #000000;
	text-decoration: none;
}

.link {
	font-size: 12px;
	line-height: 15px;
	color: #000000;
	text-decoration: underline;
}

.link_menu {
	font-size: 15px;
	line-height: 24px;
	text-decoration: underline;
}

.SS {font-size: 10px;line-height: 14px;}
.S {font-size: 11px;line-height: 16px;}
.M {font-size: 13px;line-height: 18px;}
.L {font-size: 15px;line-height: 20px;}
.LL {font-size: 17px;line-height: 22px;}


.t01 {
	font-family:

In [19]:
print("\nHTML snippet from WARC file:")
print(html_bytes.decode('utf-8', errors='replace'))


HTML snippet from WARC file:
<html>

<head>
<title>
</title>ͺ�
<meta http-equiv="PICS-Label" content='(PICS-1.1 "http://www.ticrf.org.tw/chinese/html/06-rating-v11.htm" l gen true for "http://080.roomadult.net" r (s 3 l 3 v 3 o 0))'>
<meta http-equiv=content-type content="text/html; charset=big5">
<meta name="Keywords" content="">
<meta name="description" content="">
<style type="text/css"><!--
.text {
	font-size: 12px;
	line-height: 15px;
	color: #000000;
	text-decoration: none;
}

.link {
	font-size: 12px;
	line-height: 15px;
	color: #000000;
	text-decoration: underline;
}

.link_menu {
	font-size: 15px;
	line-height: 24px;
	text-decoration: underline;
}

.SS {font-size: 10px;line-height: 14px;}
.S {font-size: 11px;line-height: 16px;}
.M {font-size: 13px;line-height: 18px;}
.L {font-size: 15px;line-height: 20px;}
.LL {font-size: 17px;line-height: 22px;}


.t01 {
	font-family: "�s�ө���";
	font-size: 12px;
	color: #FFFFFF;
}
.t02 {
	font-family: "�s�ө���";
	font-size: 12px;
	line-heig

In [14]:
print("\nPlain text from WET file:")
print(wet_text)


Plain text from WET file:
視訊聊天交友網
24HR客服專線:02-27654066
免費文字聊天區 一對一視訊聊天 / 免費一對多視訊 搜尋主持人：
業績排行 │ 分鐘數排行 │ 本站推薦 │ 本月新人 │ 一對多收費排序 │ 一對一收費排序
恭喜三月份消費排行前十名會員 獲得免費點數~
No.1 LV26451** - 贈點10,000 點 No.2 LV50194** - 贈點9,000 點
No.3 LV 6255** -贈點8,000 點 No.4 LV18778** -贈點7,000 點 No.5 LV50702** -贈點6,000 點 No.6 LV44113** -贈點5,000 點 No.7 LV4946** -贈點4,000 點
No.8 LV48350** -贈點3,000 點 No.9 LV46832** -贈點2,000 點 No.10 LV49043** -贈點1,000 點
~ 感謝大家對本平台的支持與愛護, 也恭喜以上會員
(含哆露) 我在休息 最近上線時間 : 2018-04-20 05:21:08
加到我的最愛 | 說明
詳細資料
免費文字聊天: 聊天需要付費 血型:
一對多視訊聊天: 每分鐘 8 點 身高: 公分(cm)
一對一視訊聊天: 每分鐘 35 點 體重: 公斤(kg)
性別: 女性 三圍:
年齡: 28 歲 區域: 高雄市
會員總評價
相貌 平均評價 4.89 分
身材 平均評價 4.89 分
表演 平均評價 5.00 分
態度 平均評價 5.00 分
註﹕最高值 5分
會員評價
相貌 身材 表演 態度
會員[ LV2530335 ] 如夢似幻 的評論：好聊天 身材太好嚕 又可以夾龜龜..( 2018-04-12 00:10:40 )
相貌 身材 表演 態度
會員[ LV4178684 ] 幻想等於期待 的評論：太陽出來了- -晚安 ( 2018-04-07 05:49:13 )
相貌 身材 表演 態度
會員[ LV2489108 ] 8851141 的評論：超棒妹妹 好好珍惜了( 2018-04-06 00:18:30 )
相貌 身材 表演 態度
會員[ LV5081466 ] 匿名 的評論：( 2018-03-20 23:56:14 )
相貌 身材 表演 態度
會員[ LV4119801