# 文本处理

### 官方处理流程
```sh
python2 ./data/tools.py -m filter -s ./data/News_info_train.txt -t ./clean_data/News_info_train_filter.txt
python2 ./data/tools.py -m filter -s ./data/News_info_unlabel.txt -t ./clean_data/News_info_unlabel_filter.txt
python2 ./data/tools.py -m filter -s ./data/News_info_validate.txt -t ./clean_data/News_info_validate_filter.txt
python2 ./data/tools.py -m filter -s ./data/News_info_train_example100.txt -t ./clean_data/News_info_train_example100_filter.txt
```

# 数据预处理

- 解析HTML
- 过滤无效字符
- 生成文本
- 拼接数据


In [None]:
import re
import gensim
import jieba
import pandas as pd
from html.parser import HTMLParser

# Column layouts passed to read_csv via `names=` for the tab-separated inputs.
_NEWS_INFO_COLUMNS = ['id', 'text', 'Pics']
_PIC_LABEL_COLUMNS = ['id', 'label', 'target_pic', 'target_text']


def _segment(text):
    """Return *text* cut into words by jieba, joined with single spaces."""
    return ' '.join(jieba.cut(text))


def _clean_text(column):
    """Return *column* with NaN blanked, outer whitespace stripped, and words segmented."""
    # fillna first: str.strip raises TypeError on the float NaN pandas uses
    # for an empty field.
    return column.fillna('').apply(str.strip).apply(_segment)


def _load_news_info(path):
    """Load a news-info TSV (id, text, Pics) indexed by id, with cleaned text."""
    frame = pd.read_csv(path, names=_NEWS_INFO_COLUMNS, sep='\t', index_col='id')
    # Assign the result back instead of `frame['Pics'].fillna(..., inplace=True)`:
    # in-place mutation of a column selection does not update the parent frame
    # when pandas Copy-on-Write is active.
    frame['Pics'] = frame['Pics'].fillna('')
    frame['text'] = _clean_text(frame['text'])
    return frame


def _load_pic_labels(path):
    """Load a label TSV (id, label, target_pic, target_text) indexed by id."""
    frame = pd.read_csv(path, names=_PIC_LABEL_COLUMNS, sep='\t', index_col='id')
    frame['target_pic'] = frame['target_pic'].fillna('')
    frame['target_text'] = _clean_text(frame['target_text'])
    return frame


news_info_train = _load_news_info('./clean_data/News_info_train_filter.txt')
news_pic_label_train = _load_pic_labels('./data/News_pic_label_train.txt')

# The unlabelled text is now stripped like every other text column, so that
# leading/trailing spaces do not turn into empty tokens later on.
news_info_unlabel = _load_news_info('./clean_data/News_info_unlabel_filter.txt')

news_info_train_example100 = _load_news_info('./clean_data/News_info_train_example100_filter.txt')
news_pic_label_train_example100 = _load_pic_labels('./data/News_pic_label_train_example100.txt')

# The validation frame keeps `id` as its index; only the frames below are
# renumbered.
news_info_validate = _load_news_info('./clean_data/News_info_validate_filter.txt')

# Attach the label columns to the article rows (left join on the `id` index).
news_info_train = news_info_train.join(news_pic_label_train)
news_info_train_example100 = news_info_train_example100.join(news_pic_label_train_example100)

# Replace the `id` index with a plain 0..n-1 range and discard the ids.
news_info_train = news_info_train.reset_index(drop=True)
news_info_unlabel = news_info_unlabel.reset_index(drop=True)
news_info_train_example100 = news_info_train_example100.reset_index(drop=True)

In [None]:
# Store every prepared frame in one HDF5 file, each under its own key.
# `key` is passed by keyword: pandas 2.x deprecates passing it positionally.
hdf_path = './clean_data/data.hdf5'
for hdf_key, frame in (
    ('news_info_train', news_info_train),
    ('news_info_unlabel', news_info_unlabel),
    ('news_info_train_example100', news_info_train_example100),
    ('news_info_validate', news_info_validate),
):
    frame.to_hdf(hdf_path, key=hdf_key)




### 生成语料库

从label和unlabel的数据中抽取所有文本

In [59]:
from gensim.models.word2vec import Word2Vec
from sklearn.metrics import mean_squared_error

# Train word vectors on every segmented text, labelled and unlabelled alike.
pat = re.compile(r'( )+')
texts = pd.concat([news_info_train['text'], news_info_unlabel['text']])

# Iterate the Series directly (`Series.iteritems()` no longer exists in
# pandas 2.x). Empty strings produced by leading/trailing spaces are dropped
# so they cannot enter the vocabulary as a "word".
sentences = [
    [word for word in pat.sub(' ', text).split(' ') if word]
    for text in texts
]

# NOTE(review): gensim 4.x renamed the `size` keyword to `vector_size` --
# confirm which gensim version this notebook runs against.
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=31)
model.save('./model/word2vec.m')

In [64]:
# Measure which fraction of the validation-set words has no vector in the
# trained model.
total, bad = 0, 0

for text in news_info_validate['text']:
    for w in text.strip().split(' '):
        w = w.strip()
        # Empty tokens come from repeated spaces; they are not words, so they
        # must not be counted in the denominator either.
        if not w:
            continue
        total += 1
        # Explicit membership test instead of catching every Exception.
        if w not in model.wv:
            bad += 1

# Guard against an empty validation set (would divide by zero).
print('语料库中没有的词: ', bad / total if total else 0.0)

语料库中没有的词:  0.016734317277791382


### 语句词向量测试

In [None]:
# Pick one random training text and print the vector of each of its words.
roll_text = news_info_train['text'].sample(n=1).values[0]
for r in roll_text.strip().split(' '):
    # Skip empty tokens and words the model has no vector for;
    # `model.wv[r]` raises KeyError for the latter and would abort the loop.
    if r and r in model.wv:
        print(model.wv[r])

In [144]:
# Mean squared error between the vectors of two sample words.
first_vector = model.wv['菜鸡']
second_vector = model.wv['盒子']
print(mean_squared_error(first_vector, second_vector))

2.16633


In [153]:
def _write_lines(path, lines):
    """Write every string of *lines* to *path*, one per line, as UTF-8."""
    # Explicit encoding: the texts are Chinese, and the platform default
    # encoding is not guaranteed to be able to represent them.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in lines)


def _labelled_lines(frame):
    """Yield ``__label__<label> <text>`` for each row of *frame*."""
    for label, text in zip(frame['label'], frame['text']):
        yield '__label__%d ' % label + text


# Shuffle the labelled data, then split it 95% / 5% into train and hold-out.
# NOTE(review): no random_state is passed, so the split differs on every run.
news_info_train_sample = news_info_train.sample(frac=1)
split_at = int(0.95 * len(news_info_train_sample))

_write_lines('./clean_data/trainset.txt',
             _labelled_lines(news_info_train_sample[:split_at]))
_write_lines('./clean_data/trainset_valid.txt',
             _labelled_lines(news_info_train_sample[split_at:]))

# Unlabelled validation texts, one per line.
_write_lines('./clean_data/submit_valid.txt', news_info_validate['text'])

# The 100-example texts and their labels go to two parallel files. Both are
# taken from the joined frame so that line k of the label file always belongs
# to line k of the text file, whatever the row order of the raw label file.
_write_lines('./clean_data/train_100exaple.txt',
             news_info_train_example100['text'])
_write_lines('./clean_data/train_100exaple_label.txt',
             ('%d' % label for label in news_info_train_example100['label']))