# 数据预处理

    1.将data下的原始二进制文件转化为UTF-8编码的csv格式文件。
    2.从CSV文件中将三个属性的label提取出来，做成单独的文件。
    3.调用jieba分词，完成分词和词性过滤。
    
参考github:https://github.com/coderSkyChen/2016CCF_BDCI_Sougou

In [18]:
import csv
import pandas as pd

In [19]:
# 原始的二进制训练集和测试集文件
row_train = '../data/user_tag_query.10W.TRAIN'
row_test = '../data/user_tag_query.10W.TEST'

In [20]:
def get_csvfile_from_rowdata(rowdata_path, is_test=False):
    """ 将原始的二进制文件转化为csv格式文件，并保存csv文件在二进制文件当前数据目录 """
    csvfile_path = open(rowdata_path + '.csv', 'w')
    writer = csv.writer(csvfile_path)
    if is_test:
        writer.writerow(['ID', 'QueryList'])
    else:
        writer.writerow(['ID', 'age', 'Gender', 'Education', 'QueryList'])
    
    # 以GB18030编码读取二进制文件
    with open(rowdata_path, 'r', encoding='GB18030', errors='ignore') as f:
        for line in f:
            line.strip()
            data = line.split('\t')
            if is_test:
                writedata = [data[0]]
            else:
                writedata = [data[0], data[1], data[2], data[3]]
            querystr = ''
            for d in data[4:]:
                try:
                    querystr += d + '\t'
                except:
                    print(data[0], querystr)

            querystr = querystr[:-1]
            writedata.append(querystr)
            writer.writerow(writedata)

In [21]:
# 1.将data下的原始二进制文件转化为UTF-8编码的csv格式文件。
get_csvfile_from_rowdata(row_train)
get_csvfile_from_rowdata(row_test, is_test=True)

In [22]:
# 读取csv文件
train = pd.read_csv('../data/user_tag_query.10W.TRAIN.csv')
test = pd.read_csv('../data/user_tag_query.10W.TEST.csv')

In [26]:
print('train.shape:',train.shape)
train.head()

train.shape (100000, 5)


Unnamed: 0,ID,age,Gender,Education,QueryList
0,22DD920316420BE2DF8D6EE651BA174B,1,1,4,柔和双沟\t女生\t中财网首页 财经\thttp://pan.baidu.com/s/1pl...
1,43CC3AF5A8D6430A3B572337A889AFE4,2,1,3,"广州厨宝烤箱\t世情薄,人情恶,雨送黄昏花易落,晓风干,泪痕\t厦门酒店用品批发市场\t我只..."
2,E97654BFF5570E2CCD433EA6128EAC19,4,1,0,钻石之泪耳机\t盘锦到沈阳\t旅顺公交\t辽宁阜新车牌\tbaidu\tk715\tk716...
3,6931EFC26D229CCFCEA125D3F3C21E57,4,2,3,最受欢迎狗狗排行榜\t舶怎么读\t场景描 写范例\t三维绘图软件\t枣和酸奶能一起吃吗\t好...
4,E780470C3BB0D340334BD08CDCC3C71A,2,2,4,干槽症能自愈吗\t太太万岁叶舒心去没去美国\t干槽症\t右眼皮下面一直跳是怎么回事\t麦当劳...


In [28]:
print('test.shape:',test.shape)
test.head()

test.shape: (100000, 2)


Unnamed: 0,ID,QueryList
0,ED89D43B9F602F96D96C25255F7C228C,谁唱的味道好听\t吻戏是真吻还是假吻\t搞笑的电视剧排行榜\t陈学冬身高\t中南大学\t南京...
1,83C3B7B4AAF8074655A8079F561A76D6,马克思主义基本原理概论\t康世恩的子女\t航班动态实时查询\t上海地铁几点开始运行\t当归\...
2,CA9F675A024FB2353849350A35CF8B0F,英雄联盟之电竞称王\t手机怎么扫描手机上的二维码\t重庆重钢老板\t资阳俊士\t2016lp...
3,DE45B5C4E57AAEBCF3FDFA2A774093BF,传统钓\t3号鱼钩\t鲫鱼汤的做法大全\t鱼饵直销\t怎么戒烟最有效\t钓鱼技巧\t浮钓\t...
4,406A681FB3DF81EC0E561796AE50AE50,胜利油田属于中石化还是中石油\t苏珊米勒狮子座年运\t怼是什么意思\t13981972217...


In [29]:
# 2.从CSV文件中将三个属性的label提取出来，做成单独的文件。
train.age.to_csv('../data/train_age.csv', index=False)
train.Gender.to_csv('../data/train_gender.csv', index=False)
train.Education.to_csv('../data/train_education.csv', index=False)
train.QueryList.to_csv('../data/train_querylist.csv', index=False)

test.QueryList.to_csv('../data/test_querylist.csv', index=False)

In [30]:
# 3.调用jieba分词，完成分词和词性过滤。
import jieba.analyse
import time
import jieba
import jieba.posseg
import os, sys

In [31]:
def get_query_list(file_path):
    data_list = []
    with open(file_path, 'r') as f:
        line = f.readline()
        count = 0
        while line:
            try:
                if line != '"\n':
                    data_list.append(line)
                    count += 1
            except:
                print('error', line, count)
            line = f.readline()
    print('共得到query_list: {}行'.format(count))
    return data_list                

In [32]:
train_query_list = get_query_list('../data/train_querylist.csv')
test_query_list = get_query_list('../data/test_querylist.csv')

共得到query_list: 100000行
共得到query_list: 100000行


In [35]:
len(train_query_list)

100000

In [36]:
def save_seg_words_to_csvfile(query_list, csvfile_path, allowPOS = ['n','v','j']):
    """ 使用带有词性的精确分词模式对句子进行分词 """
    csvfile = open(csvfile_path, 'w')
    jieba.enable_parallel() # 并行分词模式
    POS = {}
    for i in range(len(query_list)):
        s = []
        str = ""
        words = jieba.posseg.cut(query_list[i])
        for word, flag in words:
            POS[flag] = POS.get(flag, 0) + 1
            if (flag[0] in allowPOS) and len(word) >= 2:
                str += word + " "
        s.append(str)
        csvfile.write(" ".join(s) + '\n')
    csvfile.close()
    print(POS)

In [37]:
train_seg_words_path = '../data/train_seg_words.csv'
test_seg_words_path = '../data/test_seg_words.csv'

In [38]:
# 将训练集分词后的句子保存到csv文件中
st_time = time.time()
save_seg_words_to_csvfile(train_query_list, train_seg_words_path)
print('训练集分词共耗时: {}s'.format(time.time()-st_time))

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/b3/5m0xjkvs3gz1x__vz8rrx7h80000gn/T/jieba.cache
Loading model cost 1.332 seconds.
Prefix dict has been built succesfully.
Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/anaconda3/l

KeyboardInterrupt: 

In [None]:
# 将测试集分词后的句子保存到csv文件中
st_time = time.time()
save_seg_words_to_csvfile(test_query_list, test_seg_words_path)
print('测试集分词共耗时: {}s'.format(time.time()-st_time))

In [47]:
# 训练集和测试集大小，有两个为空的行
train_segs.shape, test_segs.shape

((99998, 1), (99998, 1))

In [19]:
# 分词及查看词性的示例
'''
seg_list = jieba.cut('我才18岁，我是个小孩子', cut_all=False)
print(' '.join(seg_list))

words = jieba.posseg.cut('我才18岁，我是个小孩子')
for word, flag in words:
    print('%s %s' % (word, flag))
    print('---', flag[0], flag)
'''

"\nseg_list = jieba.cut('我才18岁，我是个小孩子', cut_all=False)\nprint(' '.join(seg_list))\n\nwords = jieba.posseg.cut('我才18岁，我是个小孩子')\nfor word, flag in words:\n    print('%s %s' % (word, flag))\n    print('---', flag[0], flag)\n"

In [96]:
train_segs = pd.read_csv(train_seg_words_path, names=['words'])
test_segs = pd.read_csv(test_seg_words_path, names=['words'])

In [98]:
# 将分词后的训练集和测试集进行合并
full_seg_words_path = '../data/full_seg_words.csv'

with open(full_seg_words_path, 'w') as wf:
    cnt = 0
    with open(train_seg_words_path, 'r') as f:
        for line in f:
            line.strip()
            wf.writelines(line)
            cnt += 1
    with open(test_seg_words_path, 'r') as f:
        for line in f:
            line.strip()
            wf.writelines(line)
            cnt += 1
    print('合并后的训练集测试集分词数据共: {}行'.format(cnt))

合并后的训练集测试集分词数据共: 200000行


In [99]:
'''
with open('../data/train_seg_words.csv', 'r') as f:
    cnt = 0
    for line in f:
        if cnt< 5:
            print(line)
        if line == '\n': # 有两行为空的行
            print('...', line, '...')
            print(cnt)
        cnt+=1
    print(cnt)
'''

"\nwith open('../data/train_seg_words.csv', 'r') as f:\n    cnt = 0\n    for line in f:\n        if cnt< 5:\n            print(line)\n        if line == '\n': # 有两行为空的行\n            print('...', line, '...')\n            print(cnt)\n        cnt+=1\n    print(cnt)\n"