## Preprocessing Script

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 0)
import jsonlines
import glob
from tqdm.auto import tqdm
from transformers import AutoTokenizer

In [2]:
#glob returns paths in arbitrary (filesystem-dependent) order; sort them so the
#row order of the loaded frames, and therefore which duplicate survives the
#de-duplication further down, is the same on every run
fnames = sorted(glob.glob('../data_large/pantip-large/*.jsonl'))
len(fnames)

72

In [3]:
%%time

content_dfs = []
comment_dfs = []
for fname in tqdm(fnames):
    with jsonlines.open(fname) as reader:
        for obj in tqdm(reader):
            if 'comment' in obj.keys():
                d = {
                    'datasource_url': obj['datasource_url'],
                    'content_id': obj['content_id'],
                    'comment_id': obj['comment_id'],
                    'order_comment': obj['order_comment'],
                    'type':obj['type'],
                    'comment': obj['comment'],
                    'like_score': obj['like_score'],
                    'feel_heart': obj['feel_heart'],
                    'feel_laugh': obj['feel_laugh'],
                    'feel_love': obj['feel_love'],
                    'feel_sad': obj['feel_sad'],
                    'feel_horror': obj['feel_horror'],
                    'feel_wow': obj['feel_wow'],
                }
                comment_dfs.append(d)
            elif ('content' in obj.keys()) and (obj['total_field'] > 1):
                d = {
                    'datasource_url': obj['datasource_url'],
                    'content_id': obj['content_id'],
                    'type':obj['type'],
                    'title': obj['title'],
                    'body': obj['content'],
                    'total_field': obj['total_field'],
                    'tags': obj['tags'],
                    'like_score': obj['like_score'],
                    'feel_heart': obj['feel_heart'],
                    'feel_laugh': obj['feel_laugh'],
                    'feel_love': obj['feel_love'],
                    'feel_sad': obj['feel_sad'],
                    'feel_horror': obj['feel_horror'],
                    'feel_wow': obj['feel_wow'],
                }
                content_dfs.append(d)    

comment_df = pd.DataFrame(comment_dfs)
content_df = pd.DataFrame(content_dfs)

In [4]:
# Exact duplicate rows are dropped first.
content_df = content_df.drop_duplicates()
# The same thread can appear in several snapshots: rank the rows of each
# content_id by total_field (largest first) and keep only the top-ranked row.
content_by_size = content_df.sort_values(['total_field',], ascending=[False])
content_df['rnk'] = content_by_size.groupby(['content_id']).cumcount() + 1
content_df = content_df[content_df.rnk==1]
content_df.to_csv('../data_large/content_df.csv', index=None)


comment_df = comment_df.drop_duplicates()
# Number the rows of every (content_id, comment_id) pair in their current
# order and keep the first row of each pair.
comment_pairs = comment_df.groupby(['content_id','comment_id'])
comment_df['rnk'] = comment_pairs.cumcount() + 1
comment_df = comment_df[comment_df.rnk==1]
comment_df.to_csv('../data_large/comment_df.csv', index=None)


# Read both frames back from the CSV files that were just written.
comment_df = pd.read_csv('../data_large/comment_df.csv')
content_df = pd.read_csv('../data_large/content_df.csv')

In [5]:
#check if content deduplicated successfully:
#frame shape, number of distinct content_id values, shape after dropping exact duplicates
(
    content_df.shape,
    content_df['content_id'].nunique(),
    content_df.drop_duplicates().shape,
)

In [None]:
#check if comment deduplicated successfully:
#frame shape, number of distinct comment_id values, shape after dropping exact duplicates
#NOTE(review): de-duplication was keyed on (content_id, comment_id), so the distinct
#comment_id count only matches the row count if comment_id is unique across threads -- confirm
(
    comment_df.shape,
    comment_df['comment_id'].nunique(),
    comment_df.drop_duplicates().shape,
)

In [74]:
import re


def replace_url(text):
    """Replace every whitespace-delimited run starting with 'http' or 'www' by '[URL]'."""
    return re.sub(r'http\S+|www\S+', '[URL]', text)


def _count_tokens(texts, tokenizer, batch_size=100_000):
    """Return the number of token ids `tokenizer` produces for each string in `texts`.

    `texts` is a list of strings; it is tokenized in slices of `batch_size`
    and the per-text lengths of `input_ids` are returned in the same order.
    """
    counts = []
    for start in tqdm(range(0, len(texts), batch_size)):
        encoded = tokenizer(texts[start:start + batch_size])
        counts += [len(ids) for ids in encoded.input_ids]
    return counts


#pair every thread with each of its comments
all_df = content_df.merge(comment_df, on='content_id')
#axis must be named: the positional form drop(labels, 1) was removed in pandas 2.0
all_df = all_df.drop(columns=['rnk_x','rnk_y','datasource_url_y'])
#filter out later sub-comments (order_comment containing '-')
all_df = all_df[all_df.order_comment.map(lambda x: '-' not in str(x))]
#create interact count: sum of all reaction columns of the comment
all_df['nb_interact'] = all_df[['like_score_y',
       'feel_heart_y', 'feel_laugh_y', 'feel_love_y', 'feel_sad_y',
       'feel_horror_y', 'feel_wow_y']].sum(axis=1)

#only keep top 5 comments with highest interacts
all_df['rnk'] = all_df.sort_values(['nb_interact','like_score_y','feel_heart_y'], \
             ascending=False) \
             .groupby(['content_id']) \
             .cumcount() + 1
all_df = all_df[all_df.rnk<=5].drop(columns='rnk')

#filter less than 2 comment
all_df = all_df[all_df.total_field>1]

#pick only question in title: keep titles containing at least one of these substrings
question_signals = ['ใคร',
                    'ทำไม',
                    'อะไร',
                    'ไหน',
                    'ไหม','มั้ย',
                    'เท่าไร','ไหร่','กี่',
                    'อย่างไร','ยังไง',
                    'หรือ','หรอ','เรอะ']
all_df = all_df[all_df.title.map(lambda x: any(i in x for i in question_signals))]

#remove edit artifacts: drop everything from the edit marker onwards
all_df['title'] = all_df.title.map(lambda x: x.split('แก้ไขข้อความเมื่อ')[0])
all_df['body'] = all_df.body.map(lambda x: x.split('แก้ไขข้อความเมื่อ')[0])
all_df['comment'] = all_df.comment.map(lambda x: x.split('แก้ไขข้อความเมื่อ')[0])

#replace URL
all_df['title'] = all_df.title.map(replace_url)
all_df['body'] = all_df.body.map(replace_url)
all_df['comment'] = all_df.comment.map(replace_url)

#remove spoiler tags
all_df['body'] = all_df.body.map(lambda x: str(x).replace('[Spoil] คลิกเพื่อดูข้อความที่ซ่อนไว้',''))
all_df['comment'] = all_df.comment.map(lambda x: str(x).replace('[Spoil] คลิกเพื่อดูข้อความที่ซ่อนไว้',''))

#drop na
all_df = all_df.dropna()

#count tokens
tokenizer_mgpt = AutoTokenizer.from_pretrained('sberbank-ai/mGPT')

#title
all_df['title_tokens'] = _count_tokens(all_df.title.tolist(), tokenizer_mgpt)

#body
all_df['body_tokens'] = _count_tokens(all_df.body.tolist(), tokenizer_mgpt)

#comment
all_df['comment_tokens'] = _count_tokens(all_df.comment.tolist(), tokenizer_mgpt)

#prompt
#NOTE(review): the prompt filter below reads `prompt_tokens`, but nothing in the
#notebook ever created that column; it is assumed here to be title + body tokens -- confirm
all_df['prompt_tokens'] = all_df.title_tokens + all_df.body_tokens

#filter comments
#min tok 12 (10th percentile) to filter out bare "thank you" replies
#max tok 512 (97th percentile) to filter out too detailed answers
all_df = all_df[(all_df.comment_tokens>=12)&(all_df.comment_tokens<=512)]

#filter title
#min tok 16 (5th percentile) to filter out super short titles e.g. "why?"
#max tok 75 (99th percentile) to filter out too long titles with too many special characters/emojis
#NOTE(review): the note above says 16 but the threshold actually applied is 12 -- confirm which is intended
all_df = all_df[(all_df.title_tokens>=12)&(all_df.title_tokens<=75)]

#filter prompt
#min tok 48 (5th percentile) to filter out questions that are too random
#max tok 512 (90th percentile) just to fit 1024 prompt+demo
all_df = all_df[(all_df.prompt_tokens>=48)&(all_df.prompt_tokens<=512)]

#filter out thread with less than 2 comments
#name the columns explicitly: value_counts().reset_index() yields different column
#names before and after pandas 2.0, so relying on them filtered the wrong column
nb_comments = all_df.content_id.value_counts() \
             .rename_axis('content_id') \
             .reset_index(name='nb_comments')
more_than_one = nb_comments[nb_comments.nb_comments>1]
all_df = all_df.merge(more_than_one,on='content_id',how='inner')

#sort by nb_interact then comment_tokens; topmost has label 1, otherwise 0
all_df['rnk'] = all_df.sort_values(['nb_interact','comment_tokens'], ascending=[False,False]) \
             .groupby(['content_id']) \
             .cumcount() + 1
all_df['preference_label'] = all_df['rnk'].map(lambda x: 1 if x==1 else 0)

#add strict labels for only threads that top comments indeed have more interacts (not just longer, no ties)
rank_1 = all_df[all_df.rnk==1][['content_id','title','body','comment','nb_interact']]
rank_2 = all_df[all_df.rnk==2][['content_id','comment','nb_interact']]
rank_combined = rank_1.merge(rank_2, on='content_id')
rank_combined['strict'] = 1
rank_combined = rank_combined[rank_combined.nb_interact_x>rank_combined.nb_interact_y]
rank_combined = rank_combined[['content_id','strict']]
all_df = all_df.merge(rank_combined, on='content_id',how='left')
#threads without a strictly better top comment get strict = 0
all_df['strict'] = all_df.strict.fillna(0)

all_df.to_csv('../data_large/all_df.csv', index=False)
all_df.shape, all_df.content_id.nunique()

((3510036, 34), 935910)

In [81]:
#load from save: read back the frame stored at ../data_large/all_df.csv
all_df = pd.read_csv('../data_large/all_df.csv')

In [82]:
#(rows, columns) of the reloaded frame and the number of distinct threads in it
all_df.shape, all_df.content_id.nunique()

((3510036, 34), 935910)

## Package as Dataset

In [327]:
#save to jsonl: one line per thread, with its comments nested under 'comments'
#NOTE(review): this writes php_universe.jsonl while the loading cell reads
#php_data.jsonl -- confirm which file is the intended one
import json

#thread-level fields (identical on every row of a thread)
thread_cols = ['content_id','tags','nb_comments',
               'title_tokens','body_tokens','title','body','strict']
#comment-level fields
comment_cols = ['order_comment',
                'comment_tokens',
                'comment',
                'preference_label',
                'nb_interact']

with open('../data_large/php_json/php_universe.jsonl', 'w', encoding='utf-8') as f:
    #a single groupby pass replaces one boolean-mask scan of the whole frame per
    #thread; sort=False keeps threads in order of first appearance, which is the
    #order content_id.unique() produced
    threads = all_df.groupby('content_id', sort=False)
    for content_id, thread_rows in tqdm(threads, total=all_df.content_id.nunique()):
        #preferred comment (preference_label 1) first
        d = thread_rows.sort_values('preference_label', ascending=False)
        thread_d = d[thread_cols].drop_duplicates().to_dict(orient='records')[0]
        thread_d['comments'] = d[comment_cols].to_dict(orient='records')

        #write to jsonl
        json.dump(thread_d, f, ensure_ascii=False)
        f.write('\n')

  0%|          | 0/935910 [00:00<?, ?it/s]

In [116]:
#load from jsonl: parse one thread per line into a list of dicts
#NOTE(review): this reads php_data.jsonl while the saving cell writes
#php_universe.jsonl -- confirm which file is the intended one
php_list = []
with open('../data_large/php_json/php_data.jsonl', 'r', encoding='utf-8') as file:
    for raw_line in tqdm(file):
        record = json.loads(raw_line.strip())
        #NaN is the only value that is not equal to itself; store it as an empty string
        if record['body'] != record['body']:
            record['body'] = ''
        php_list.append(record)
len(php_list)

0it [00:00, ?it/s]

935910

In [117]:
#create dataset from the list of thread dicts
#Dataset is not imported by any earlier cell, so import it here to keep the
#cell runnable on a fresh kernel
from datasets import Dataset

ds = Dataset.from_list(php_list)
ds

Dataset({
    features: ['content_id', 'tags', 'nb_comments', 'title_tokens', 'body_tokens', 'title', 'body', 'strict', 'comments'],
    num_rows: 935910
})

In [138]:
#push to hub: upload the dataset to the "pythainlp/php" repo, created as private
ds.push_to_hub("pythainlp/php", private=True)

In [139]:
#load from hub: fetch the "pythainlp/php" repo back and rebind ds to the result
#NOTE(review): the repo was pushed as private, so this presumably needs an authenticated session -- confirm
from datasets import load_dataset
ds = load_dataset("pythainlp/php")
ds

Downloading readme:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

Using custom data configuration pythainlp--php-62efb1c37afc4be5


Downloading and preparing dataset parquet/pythainlp--php to /home/charipol/.cache/huggingface/datasets/pythainlp___parquet/pythainlp--php-62efb1c37afc4be5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/172M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/174M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/171M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/160M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/154M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /home/charipol/.cache/huggingface/datasets/pythainlp___parquet/pythainlp--php-62efb1c37afc4be5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['content_id', 'tags', 'nb_comments', 'title_tokens', 'body_tokens', 'title', 'body', 'strict', 'comments'],
        num_rows: 935910
    })
})

In [142]:
#show the first record of the train split
ds['train'][0]

{'content_id': 36188003,
 'tags': 'Social Network,Twitter',
 'nb_comments': 4,
 'title_tokens': 20,
 'body_tokens': 66,
 'title': 'ส่งDMในทวิตไม่ได้ ทำยังไงดี?',
 'body': 'DMในทวิตไม่ได้ค่ะ พอส่งมันก็ขึ้นว่า การดำเนินการนี้ได้รับการรายงานว่าน่าสงสัย  ต้องทำยังไงคะช่วยบอกที???',
 'strict': 0.0,
 'comments': [{'comment': 'จขกท.แก้ได้ยังค่ะ เราก็เปน ฝากด้วยนะคะ(แต่เราส่งไม่ได้แค่แอคเดียว นอกนั้นปกติค่ะ)',
   'comment_tokens': 55,
   'nb_interact': 0,
   'order_comment': 'ความคิดเห็นที่ 3',
   'preference_label': 1},
  {'comment': 'ส่งหาใครคะ ดีเอ็มต้องโฟโลทั้งสองฝ่ายปะอะ',
   'comment_tokens': 28,
   'nb_interact': 0,
   'order_comment': 'ความคิดเห็นที่ 1',
   'preference_label': 0},
  {'comment': 'ไม่ต้อง follow กันและกัน ก็ส่งได้นะส่งมาแล้วแต่ถ้าอีกฝ่าย block คุณยังไงก็ส่งไม่ได้',
   'comment_tokens': 52,
   'nb_interact': 0,
   'order_comment': 'ความคิดเห็นที่ 2',
   'preference_label': 0},
  {'comment': 'แก้ได้ยังคะ?เราก็เป็น',
   'comment_tokens': 15,
   'nb_interact': 0,
   'order_c