## Project for rmrb - Part #[1]

### Corpus construction, text cleaning and filtering

In [1]:
import re
import zipfile
import os
import sys
import timeit
import pandas as pd
import string
import warnings
warnings.filterwarnings('ignore')

In [2]:
# The corpus ships as 7z archives; unpack them before running this notebook.
# Each collaborator keeps the unpacked corpus at a different local path.
CORPUS_DIRS = {
    "Tim": "/Users/timqzhang/Desktop/UChicago/MACSS_Spring_2020/content_local/rmrb/7z",
    "Linghui": "/Users/linghuiwu/uchicago/courseworks/soci40133/rmrb/7z",
    "Minghao": "to be filled",  # placeholder: no path configured yet
}

# Surrounding whitespace in the typed name is ignored.
user = input("Who is using the notebook? ").strip()
if user not in CORPUS_DIRS:
    # Fail here rather than with a NameError on `corpus_name` in a later cell.
    raise ValueError("Unknown user {!r}; expected one of: {}".format(
        user, ", ".join(sorted(CORPUS_DIRS))))
corpus_name = CORPUS_DIRS[user]

Who is using the notebook? Linghui


In [3]:
# define the main func for corpus load and processing


def loadcorpus_plus_cleaning(corpus_name):
    """Load every article under ``corpus_name`` and return the cleaned corpus.

    ``corpus_name`` is a directory holding one sub-folder per month. The
    first four characters of a folder name are used as the year and
    characters 5-6 as the month. Every file in a folder whose name contains
    ``'md'`` is read as one article, using this line layout (as indexed by
    the code below):

    * line 0: title (the first 4 characters are dropped as a prefix)
    * line 1: authors, whitespace separated; a bare newline means no author
    * line 2: date
    * line 3: issue, e.g. a page number followed by ``()`` or ``(extra info)``
    * line 4: column / zhuan_lan (the first 3 characters are dropped)
    * line 6 onwards: body text, cleaned by ``text_cleaning``

    Articles whose body is empty after cleaning are skipped.

    Parameters
    ----------
    corpus_name : str
        Path of the directory that contains the unpacked month folders.

    Returns
    -------
    dict
        Column-oriented dict with the keys ``year``, ``month``, ``date``,
        ``issue``, ``zhuan_lan``, ``title``, ``author`` (all lists of str)
        and ``text`` (list of lists of paragraphs). All lists have the same
        length, one entry per kept article.
    """
    texts_raw = {'year': [], 'month': [], 'date': [],
                 'issue': [], 'zhuan_lan': [], 'title': [],
                 'author': [], 'text': []}

    # Keep only real month folders. Hidden entries such as macOS '.DS_Store'
    # and the leftover 7z archives are filtered out explicitly, so the result
    # does not depend on whether a hidden file happens to sort first.
    folders = [
        folder for folder in sorted(os.listdir(corpus_name))
        if not folder.startswith('.')
        and '7z' not in folder
        and os.path.isdir(os.path.join(corpus_name, folder))
    ]
    total = len(folders)

    for count, folder in enumerate(folders, start=1):
        print('Current running on {}, total: {}/{}'.format(folder, count, total),
              end='\r', flush=True)
        news_loc = os.path.join(corpus_name, folder)

        for file in os.listdir(news_loc):
            if 'md' not in file:
                continue

            # load in raw text; the context manager closes the handle, which
            # matters because one file is opened per article
            with open(os.path.join(news_loc, file), encoding='utf-8') as f:
                news_piece = f.readlines()

            # issue (第x版)
            # some issues have extended information after the page number,
            # it is kept as '<number>_<info>'
            issue_raw = re.sub('\n', '', news_piece[3])
            if '第版' in issue_raw:
                # no page number between the two characters
                issue = 'NA'
            else:
                open_pos = issue_raw.index('(')
                issue_no = str(re.search(r'\d+', issue_raw).group(0))
                if issue_raw.index(')') - open_pos == 1:
                    # empty parentheses: nothing but the page number
                    issue = issue_no
                else:
                    issue = punctuation_transfer(
                        issue_no + '_' + issue_raw[open_pos + 1:-1])[0]

            # zhuan_lan
            clean_zhuanlan = re.sub('\n', '', news_piece[4])[3:]
            if clean_zhuanlan.strip() == '':
                zhuan_lan = 'NA'
            else:
                zhuan_lan = punctuation_transfer(clean_zhuanlan)[0]

            # title: whitespace separated pieces are joined with '_'
            clean_title_list = re.sub('\n', '', news_piece[0])[4:].split()
            title = punctuation_transfer('_'.join(clean_title_list))[0]

            # authors
            if news_piece[1] == '\n':
                author_list = []
                author_str = 'NA'
            else:
                author_list = re.sub('\n', '', news_piece[1]).split()
                author_str = '_'.join(author_list)
            author = punctuation_transfer(author_str)[0]

            # texts
            # see text_cleaning func for specific cleaning criteria
            cleaned_text, has_content = text_cleaning(
                [clean_title_list, author_list], news_piece[6:])
            if not has_content:
                # empty body after cleaning: this news piece is trivial
                continue

            # only now is the record complete, so every column grows together
            texts_raw['year'].append(str(folder)[:4])
            texts_raw['month'].append(str(folder)[5:7])
            texts_raw['date'].append(re.sub('\n', '', news_piece[2]))
            texts_raw['issue'].append(issue)
            texts_raw['zhuan_lan'].append(zhuan_lan)
            texts_raw['title'].append(title)
            texts_raw['author'].append(author)
            texts_raw['text'].append(punctuation_transfer(cleaned_text))

    return texts_raw

In [4]:
# define the func for text body cleaning


# Sentence terminators searched for when trimming a trailing by-line. The
# body still carries full-width punctuation at that point (conversion is the
# last step), so the Chinese full stop must be recognised next to the ASCII
# period.
_SENTENCE_ENDS = ('。', '.')


def _last_sentence_end(line):
    """Return the index of the last sentence terminator in ``line`` (-1 if none)."""
    return max(line.rfind(mark) for mark in _SENTENCE_ENDS)


def _drop_empty(lines):
    """Return ``lines`` without empty strings."""
    return [line for line in lines if line != '']


def text_cleaning(lists, text):
    """Strip boilerplate from an article body and convert its punctuation.

    Parameters
    ----------
    lists : sequence of two lists
        ``[title_pieces, author_pieces]``: the whitespace separated pieces of
        the article title and of the author line.
    text : list of str
        Raw body lines, newline characters included.

    Returns
    -------
    tuple
        ``(paragraphs, True)`` with the cleaned paragraphs, or
        ``([], False)`` as soon as nothing is left, which marks the article
        as trivial.

    The steps run in this order: repeated title lines, author names at the
    start and end, a leading 【...】 tag, a leading parenthetical, a leading
    'xxx：' prefix, leading 新华社...电/讯 and 本报讯 datelines, a trailing
    parenthetical, a trailing photographer credit, and finally
    ``punctuation_transfer``.
    """
    title, authors = lists
    clean_text = [re.sub('\n', '', x).strip() for x in text if x != '\n']

    # clean overlap title: blank any of the first 10 lines that merely
    # repeats (part of) a title piece.
    # NOTE(review): the previous code also had a branch for "title piece
    # contained in a line that is not longer than the piece"; that can only
    # hold when the two are equal, which this test already covers.
    for title_piece in title:
        for pos in range(min(10, len(clean_text))):
            if clean_text[pos] in title_piece:
                clean_text[pos] = ''
    clean_text = _drop_empty(clean_text)

    # if after cleaning the list goes empty, then return False to show this
    # news is trivial (same below)
    if not clean_text:
        return clean_text, False

    # clean overlap authors at the beginning: drop everything up to and
    # including the author name plus the one character that follows it
    for author_piece in authors:
        head = clean_text[0]
        if author_piece in head:
            clean_text[0] = head[head.index(author_piece) + len(author_piece) + 1:]

    # clean overlap authors at the end: keep the last paragraph up to its
    # final sentence terminator, or drop it when it has none
    for author_piece in authors:
        tail = clean_text[-1]
        if author_piece in tail:
            cut = _last_sentence_end(tail)
            if cut == -1:
                clean_text = clean_text[:-1]
                break
            clean_text[-1] = tail[:cut + 1]

    if not clean_text:
        return clean_text, False

    # cleaning '【】', like 【新华社电讯】, etc.
    if '【' in clean_text[0][:20]:
        close_pos = clean_text[0].find('】')
        # an unclosed tag is left alone instead of raising ValueError
        if close_pos != -1:
            clean_text[0] = clean_text[0][close_pos + 1:]
    clean_text = _drop_empty(clean_text)

    if not clean_text:
        return clean_text, False

    # cleaning the irrelevant parenthesis at the beginning (only applied to
    # bodies with more than one paragraph).
    # NOTE(review): the previous code had a second branch for a parenthesis
    # closed in the next paragraph, but its condition required '（' to be
    # absent from the first paragraph and so could never be true here; it was
    # removed without changing behaviour. Confirm the intended rule.
    if '（' in clean_text[0][:25] and len(clean_text) > 1:
        if '）' in clean_text[0]:
            clean_text[0] = clean_text[0][clean_text[0].index('）') + 1:]
    clean_text = _drop_empty(clean_text)

    if not clean_text:
        return clean_text, False

    # cleaning the irrelevant colons at the beginning like "新华社报讯:"
    if '：' in clean_text[0][:20]:
        clean_text[0] = clean_text[0][clean_text[0].index('：') + 1:]
    clean_text = _drop_empty(clean_text)

    if not clean_text:
        return clean_text, False

    # cleaning the irrelevant info at the beginning like "新华社电"
    if '新华社' in clean_text[0][:50]:
        if '电' in clean_text[0][:50]:
            clean_text[0] = clean_text[0][clean_text[0].index('电') + 1:]
        elif '讯' in clean_text[0][:50]:
            clean_text[0] = clean_text[0][clean_text[0].index('讯') + 1:]
    clean_text = _drop_empty(clean_text)

    if not clean_text:
        return clean_text, False

    # cleaning the irrelevant info at the beginning like "本报讯"
    if '本报讯' in clean_text[0][:50]:
        clean_text[0] = clean_text[0][clean_text[0].index('讯') + 1:]
    clean_text = _drop_empty(clean_text)

    if not clean_text:
        return clean_text, False

    # cleaning the irrelevant info at the end like "新华社稿": cut at the
    # LAST opening parenthesis, which is the one found in the final 15
    # characters; cutting at the first one could discard most of a paragraph
    # that contains an earlier parenthesis
    if '（' in clean_text[-1][-15:]:
        clean_text[-1] = clean_text[-1][:clean_text[-1].rfind('（')]
    clean_text = _drop_empty(clean_text)

    if not clean_text:
        return clean_text, False

    # cleaning the irrelevant info at the end like "新华社摄"
    if '摄' in clean_text[-1][-20:] and '记者' in clean_text[-1][-20:]:
        cut = _last_sentence_end(clean_text[-1])
        if cut == -1:
            clean_text = clean_text[:-1]
        else:
            clean_text[-1] = clean_text[-1][:cut + 1]
    clean_text = _drop_empty(clean_text)

    if not clean_text:
        return clean_text, False

    # transfer the punctuations
    clean_text = _drop_empty(punctuation_transfer(clean_text))
    if not clean_text:
        return clean_text, False

    return clean_text, True

In [5]:
# define the func for cleaning '\u3000' and transfer chinese punctuations
# to english ones


# Single-character replacements applied by punctuation_transfer.
_PUNCTUATION_TABLE = str.maketrans({
    '\u3000': None,  # ideographic space is removed
    '。': '.',
    '，': ',',
    '、': '\\',      # enumeration comma becomes a backslash
    '：': ':',
    '；': ';',
    '（': '(',
    '）': ')',
    '“': '"',
    '”': '"',
    '？': '?',
    '！': '!',
})


def punctuation_transfer(raw_texts):
    """Convert Chinese punctuation to ASCII and tidy each string.

    For every string this removes ideographic spaces (``\\u3000``), maps the
    full-width punctuation marks in ``_PUNCTUATION_TABLE`` to ASCII, turns a
    double em dash into ``-``, strips surrounding whitespace and finally
    strips leading ASCII punctuation.

    Parameters
    ----------
    raw_texts : str or list of str
        A single string, or a list of strings. A list is modified in place.

    Returns
    -------
    list of str
        A new one-element list for a ``str`` argument; the same (mutated)
        list object for a ``list`` argument.

    Raises
    ------
    TypeError
        If ``raw_texts`` is neither a ``str`` nor a ``list``.
    """
    if isinstance(raw_texts, str):
        texts = [raw_texts]
    elif isinstance(raw_texts, list):
        texts = raw_texts
    else:
        raise TypeError(
            'punctuation_transfer expects a str or a list of str, got '
            + type(raw_texts).__name__)

    for i, sent in enumerate(texts):
        converted = sent.translate(_PUNCTUATION_TABLE)

        # Only strings that already contained a double dash are touched, so
        # two dashes separated by an ideographic space are not merged.
        if '——' in sent:
            converted = converted.replace('——', '-')

        texts[i] = converted.strip().lstrip(string.punctuation)

    return texts

In [6]:
# Time the full corpus load so the cost of this cell is visible in the output.
start_time = timeit.default_timer()
news_raw = loadcorpus_plus_cleaning(corpus_name)
running_time = timeit.default_timer() - start_time

# Runs of a minute or more are reported as whole minutes plus the remaining
# seconds; shorter runs as plain seconds.
if running_time >= 60:
    print('\nTotal time used: ', running_time // 60,
          'min', running_time % 60, 's')
else:
    print('\nTotal time used: ', running_time, 's')

Current running on 2003年12月, total: 692/692
Total time used:  11.0 min 35.992948128999956 s


In [7]:
# Turn the column dict into a DataFrame, then order the articles by date and,
# within a date, by issue label (issue values are strings, so that part of
# the ordering is lexicographic).
news_df = pd.DataFrame(data=news_raw)
news_df_sort = news_df.sort_values(by=['date', 'issue'])
news_df_sort

Unnamed: 0,year,month,date,issue,zhuan_lan,title,author,text
3,1946,05,1946-05-15,1,,沁县阎军特务横行_殴劫我联络组人员_我已向东沁线小组提出抗议,,"[白晋线阎军伪造所谓""控诉共党罪行"",已在沁县城内演出殴打中共联络组人员事件.四月三十日晚,..."
73,1946,05,1946-05-15,1,,周徐白三氏飞南京_光山小组将视察中原解放区,,"[中共代表周恩来\美方代表白鲁德\及徐永昌代表王天鸣等一行,九日下午由宣化店返抵汉口后,周\..."
112,1946,05,1946-05-15,1,,发刊词,,"[本报-人民日报,晋冀鲁豫边区广大人民的报纸出版了., 晋冀鲁豫边区的人民\八路军\共产党在..."
136,1946,05,1946-05-15,1,,平汉川陕道上国民党军运忙,,"[平汉路上军车辚辚,日前国民党军由新乡运抵辉县坦克十六辆重炮五门.晋豫地区国民党军,正积极进..."
153,1946,05,1946-05-15,1,,国民党军结合敌伪高唱备战_大举调动屡犯我边区_三个半月大小进攻九百余次,,"[国民党军结合敌伪军,高唱""停战就是备战"",""整军就是建军"",""复员就是动员"",乘八路军忠实..."
...,...,...,...,...,...,...,...,...
1270917,2003,12,2003-12-31,9_文化新闻,文事评点,对租书摊的管理不容忽视,陈晓东,"[现在有许多租书摊点遍布城镇的大街小巷,租书摊的图书一般品种比较多,价格也不贵,花一两角钱就..."
1270992,2003,12,2003-12-31,9_文化新闻,,给孩子一片绿色文化的天地-中央电视台少儿频道开播小记,向兵,"[这里是中央电视台少儿频道.""2003年12月28日清晨6时,随着人们早已熟悉的中央电视台开..."
1271350,2003,12,2003-12-31,9_文化新闻,文化在线,"安塞举办""陕北过大年""摄影活动",师银笙,"[由中国艺术摄影协会,安塞县委\县政府主办的""陕北过大年""摄影创作活动将于2004年正月十二..."
1271648,2003,12,2003-12-31,9_文化新闻,文化在线,精品剧目冠名授牌仪式在京举行,陈斯,"[国家舞台艺术精品工程2002—2003年度十大精品剧目和20台精品提名剧目的冠名授牌仪式,..."


In [8]:
# Spot-check the sorted table by showing every 150,000th row.

news_df_sort.iloc[::150000]

Unnamed: 0,year,month,date,issue,zhuan_lan,title,author,text
3,1946,5,1946-05-15,1,,沁县阎军特务横行_殴劫我联络组人员_我已向东沁线小组提出抗议,,"[白晋线阎军伪造所谓""控诉共党罪行"",已在沁县城内演出殴打中共联络组人员事件.四月三十日晚,..."
150676,1956,11,1956-11-10,2,,要求英法侵略者立即撤出埃及领土,,"[全国各地人民,继续进行各种活动热烈支援埃及反抗英法侵略,要求侵略者立即撤出埃及领土.正在北..."
300286,1963,9,1963-09-12,3,,应邀出席世界科协北京中心会议_古巴两客人抵京,,"[应中华人民共和国科学技术协会和世界科协北京中心邀请,来北京参加世界科协北京中心第一次科学讨..."
450708,1975,10,1975-10-06,4,,扎伊尔总统蒙博托的贺电,,"[扎伊尔总统蒙博托的贺电中国共产党中央委员会主席毛泽东先生阁下主席先生:, 在英勇的中国人民..."
599720,1983,4,1983-04-18,1,,"赵总理离新西兰抵澳大利亚访问_到堪培拉时,霍克总理到机场迎接并设宴欢迎_离奥克兰时,马尔登总...",,"[中国国务院总理赵紫阳应澳大利亚总理霍克的邀请,今天下午抵达澳大利亚首都堪培拉,开始对澳大利..."
750049,1987,12,1987-12-15,6_国际,,法国总理希拉克主张树立欧洲防务思想_卡卢奇说中导条约不会使欧美防务脱钩,,"[法国总理希拉克主张树立欧洲防务思想 卡卢奇说中导条约不会使欧美防务脱钩, 新华社巴黎12..."
901398,1992,9,1992-09-14,3_教育·科技·文化,大学生暑假见闻征文,向西的突破,康新明,"[走在乌鲁木齐市红山市场,不时发现有些身材高大\留着卷曲胡须的独联体游客在兜售他们的商品.从..."
1050474,1997,4,1997-04-02,10_文化,,"电脑设计忽如一夜春风来图书装帧面临""换笔""",李桂杰,"[流连于大大小小的书店或书摊,稍稍留意,你会发现近一两年出版的书籍在装帧设计上比过去要精美漂..."
1199874,2001,12,2001-12-27,4_要闻,,"《极限,在这里延伸》研讨会举行",文宜,"[本报北京12月26日讯本报12月15日发表的报告文学《极限,在这里延伸》,对湖北省武汉市江..."


In [9]:
# sample output the corpus to csv
# Per-collaborator output folder; each path ends with '/' so that the file
# name can be appended directly.
SAVING_DIRS = {
    "Tim": "/Users/timqzhang/Desktop/UChicago/MACSS_Spring_2020/content_local/rmrb/",
    "Linghui": "/Users/linghuiwu/uchicago/courseworks/soci40133/rmrb/",
    "Minghao": "to be filled",  # placeholder: no path configured yet
}
if user not in SAVING_DIRS:
    # Fail with a clear message instead of a NameError on `saving_path`.
    raise ValueError("Unknown user {!r}; expected one of: {}".format(
        user, ", ".join(sorted(SAVING_DIRS))))
saving_path = SAVING_DIRS[user]

# The sample is rows 22000-23999 of the articles whose year is 1984.
sample_1984 = news_df_sort[news_df_sort['year'] == '1984'].iloc[22000:24000]
sample_1984.to_csv(saving_path + "rmrb_corpus.csv")

Who is using the notebook? Linghui


In [10]:
# sample filtering (pic news here)
# The previous pattern '(图片)' was interpreted as a regular expression, where
# the parentheses form a capture group, so it matched every title containing
# '图片'. The same match is now spelled out as a literal substring test, which
# selects the same rows without relying on regex semantics.
# NOTE(review): if only titles with a literal "(图片)" should be removed, use
# that string here instead — confirm the intended rule.
is_pic_news = news_df_sort['title'].str.contains('图片', regex=False)
news_df_filter = news_df_sort[~is_pic_news]

In [11]:
# Show the filtered corpus as the cell output.
news_df_filter

Unnamed: 0,year,month,date,issue,zhuan_lan,title,author,text
3,1946,05,1946-05-15,1,,沁县阎军特务横行_殴劫我联络组人员_我已向东沁线小组提出抗议,,"[白晋线阎军伪造所谓""控诉共党罪行"",已在沁县城内演出殴打中共联络组人员事件.四月三十日晚,..."
73,1946,05,1946-05-15,1,,周徐白三氏飞南京_光山小组将视察中原解放区,,"[中共代表周恩来\美方代表白鲁德\及徐永昌代表王天鸣等一行,九日下午由宣化店返抵汉口后,周\..."
112,1946,05,1946-05-15,1,,发刊词,,"[本报-人民日报,晋冀鲁豫边区广大人民的报纸出版了., 晋冀鲁豫边区的人民\八路军\共产党在..."
136,1946,05,1946-05-15,1,,平汉川陕道上国民党军运忙,,"[平汉路上军车辚辚,日前国民党军由新乡运抵辉县坦克十六辆重炮五门.晋豫地区国民党军,正积极进..."
153,1946,05,1946-05-15,1,,国民党军结合敌伪高唱备战_大举调动屡犯我边区_三个半月大小进攻九百余次,,"[国民党军结合敌伪军,高唱""停战就是备战"",""整军就是建军"",""复员就是动员"",乘八路军忠实..."
...,...,...,...,...,...,...,...,...
1270917,2003,12,2003-12-31,9_文化新闻,文事评点,对租书摊的管理不容忽视,陈晓东,"[现在有许多租书摊点遍布城镇的大街小巷,租书摊的图书一般品种比较多,价格也不贵,花一两角钱就..."
1270992,2003,12,2003-12-31,9_文化新闻,,给孩子一片绿色文化的天地-中央电视台少儿频道开播小记,向兵,"[这里是中央电视台少儿频道.""2003年12月28日清晨6时,随着人们早已熟悉的中央电视台开..."
1271350,2003,12,2003-12-31,9_文化新闻,文化在线,"安塞举办""陕北过大年""摄影活动",师银笙,"[由中国艺术摄影协会,安塞县委\县政府主办的""陕北过大年""摄影创作活动将于2004年正月十二..."
1271648,2003,12,2003-12-31,9_文化新闻,文化在线,精品剧目冠名授牌仪式在京举行,陈斯,"[国家舞台艺术精品工程2002—2003年度十大精品剧目和20台精品提名剧目的冠名授牌仪式,..."


In [12]:
# Write the complete filtered corpus to disk for further analysis. Only
# Linghui's session saves it; the file goes to the current working directory.
if user == "Linghui":
    news_df_filter.to_csv(path_or_buf="rmrb.csv")

Who is using the notebook? Linghui
