In [None]:
# View the currently mounted dataset directory.
# Changes made under this directory are reverted automatically when the
# environment restarts.
!ls /home/aistudio/data

In [0]:
# View the personal work directory.
# All changes under this directory are kept even after a reset.
# Clean up unnecessary files promptly so the environment does not load slowly.
!ls /home/aistudio/work

In [None]:
# 如果需要进行持久化安装, 需要使用持久化路径, 如下方代码示例:
# If a persistence installation is required, 
# you need to use the persistence path as the following: 
!mkdir /home/aistudio/external-libraries
!pip install beautifulsoup4 -t /home/aistudio/external-libraries

In [None]:
# Run this cell every time the environment (kernel) starts so that packages
# installed into the persistent directory can be imported.
import sys

# Guard the append so that re-running the cell does not pile up duplicate
# entries on sys.path.
if '/home/aistudio/external-libraries' not in sys.path:
    sys.path.append('/home/aistudio/external-libraries')

请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions.

In [1]:
import codecs
import os
import jieba

train_file = './data/cnews.train.txt'  # training data file name
test_file = './data/cnews.test.txt'  # test data file name
# Dictionary file path. NOTE: the name `vocab` is rebound further down in this
# file to the in-memory vocabulary dict returned by build_vocab.
vocab = './data/cnews_dict.txt'

# Only the first record is needed for the preview below, so read a single
# line instead of loading the entire training file into memory.
with codecs.open(train_file, 'r', 'utf-8') as f:
    first_line = f.readline()

# print sample content
# Each record is "<label>\t<content>"; drop the trailing line break first.
label, content = first_line.strip('\r\n').split('\t')
print(content)


# print word segment results
segment = jieba.cut(content)
print('/'.join(segment))


# cut data
def process_line(idx, line):
    """Turn one raw corpus line into a ``(label, tokens)`` pair.

    Args:
        idx: Position of the line in its file; used only for progress output.
        line: Raw text of the form ``"<label>\\t<content>"``, optionally
            ending in a line break.

    Returns:
        ``(label, tokens)`` where ``tokens`` is the list produced by
        ``jieba.cut`` on the content, or ``None`` when the line does not
        consist of exactly two tab-separated fields.
    """
    fields = line.strip('\r\n').split('\t')
    # Anything other than exactly <label, content> is treated as malformed.
    if len(fields) != 2:
        return None
    category, text = fields
    tokens = [token for token in jieba.cut(text)]
    # Report progress on every 1000th line index.
    if idx % 1000 == 0:
        print('line number: {}'.format(idx))
    return (category, tokens)

# data loading method
def load_data(file):
    """Read a UTF-8 corpus file and segment every well-formed line.

    Args:
        file: Path of the corpus file to read.

    Returns:
        A list with one ``process_line`` result per line, in file order,
        omitting the lines for which ``process_line`` returned ``None``.
    """
    with codecs.open(file, 'r', 'utf-8') as handle:
        raw_lines = handle.readlines()

    records = []
    for line_no, raw in enumerate(raw_lines):
        parsed = process_line(line_no, raw)
        # Malformed lines come back as None and are left out.
        if parsed is not None:
            records.append(parsed)
    return records


# Segment every training article (process_line prints progress as it goes),
# then show the first record as a sanity check.
train_data = load_data(train_file)
print(
    'first training data: label {} segment {}'.format(
        train_data[0][0],
        '/'.join(train_data[0][1]),
    )
)
# Same treatment for the test articles.
test_data = load_data(test_file)
print(
    'first testing data: label {} segment {}'.format(
        test_data[0][0],
        '/'.join(test_data[0][1]),
    )
)



def build_vocab(train_data, thresh):
    """Build a token -> integer id vocabulary from segmented training data.

    Args:
        train_data: Iterable of records whose element ``[1]`` is an iterable
            of tokens (as produced by ``load_data``).
        thresh: Frequency cut-off; only tokens occurring strictly more than
            ``thresh`` times are kept.

    Returns:
        A dict mapping each kept token to a unique id. ``'<UNK>'`` always has
        id 0; the remaining ids are assigned in order of descending frequency
        (ties keep first-seen order).
    """
    vocab = {'<UNK>': 0}
    word_count = {}  # word frequency
    for data in train_data:
        for word in data[1]:
            word_count[word] = word_count.get(word, 0) + 1
    print('word list length: {}'.format(len(word_count)))
    # Stable sort by frequency, most frequent first.
    word_list = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    word_list_filtered = [item for item in word_list if item[1] > thresh]
    print('word list length after filtering: {}'.format(len(word_list_filtered)))
    # construct vocab
    for word, _count in word_list_filtered:
        # A corpus token that equals a reserved entry (e.g. the literal string
        # '<UNK>') must not overwrite it: doing so would move '<UNK>' off id 0
        # and hand the same id to the next token as well.
        if word in vocab:
            continue
        vocab[word] = len(vocab)
    print('vocab size: {}'.format(len(vocab)))  # vocab size is word list size +1 due to unk token
    return vocab

# vocab = build_vocab(train_data, 1)
def build_label_vocab(cate_file):
    """Load the category-name -> integer id mapping from a UTF-8 text file.

    Each non-blank line is expected to be ``"<label>\\t<id>"``.

    Args:
        cate_file: Path of the category file.

    Returns:
        A dict mapping each label string to its integer id.

    Raises:
        IndexError: If a non-blank line has no tab-separated second field.
        ValueError: If the second field is not an integer.
    """
    label_vocab = {}
    with codecs.open(cate_file, 'r', 'utf-8') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            # Skip blank lines (e.g. a stray empty line at the end of the
            # file) instead of failing on the missing id field.
            if not stripped:
                continue
            fields = stripped.split('\t')
            label_vocab[fields[0]] = int(fields[1])
    return label_vocab

# Load the label -> integer id mapping from the category file and show it.
label_vocab = build_label_vocab('./data/cnews.category.txt')
print('label vocab: {}'.format(label_vocab))


def construct_trainable_matrix(corpus, vocab, label_vocab, out_file):
    """Write a segmented corpus to ``out_file`` in LIBSVM sparse text format.

    Every record becomes one line ``"<label_id> <index>:<count> ..."`` where
    ``index`` is the token's vocabulary id plus one and ``count`` is how many
    times that id occurs in the record. Tokens missing from ``vocab`` are
    counted under id 0.

    Args:
        corpus: Iterable of ``(label, tokens)`` records.
        vocab: Dict mapping token to integer id.
        label_vocab: Dict mapping label to integer id.
        out_file: Path of the file to write (overwritten).

    Raises:
        KeyError: If a record's label is not present in ``label_vocab``.
    """
    records = []
    for idx, data in enumerate(corpus):
        if idx % 1000 == 0:
            print('process {} data'.format(idx))
        label = str(label_vocab[data[0]])  # label id
        counts = {}
        for token in data[1]:
            # Shift ids by one so that the written feature indices start at 1.
            feature_index = int(vocab.get(token, 0)) + 1
            counts[feature_index] = counts.get(feature_index, 0) + 1
        # The LIBSVM data format requires indices in ascending order, so sort
        # instead of relying on first-occurrence (dict insertion) order.
        feature = ['{}:{}'.format(index, count) for index, count in sorted(counts.items())]
        feature_text = ' '.join(feature)
        records.append(label + ' ' + feature_text)

    with open(out_file, 'w') as f:
        f.write('\n'.join(records))
# Build the vocabulary from the training data only, keeping tokens that occur
# more than once (thresh=1). This rebinds `vocab`, which previously held the
# dictionary file path.
vocab = build_vocab(train_data, 1)
# vocab = [word.strip() for word in open('./data/cnews.vocab.txt','r',encoding='utf-8').readlines()]
# Write both splits with the same vocabulary and label mapping.
construct_trainable_matrix(train_data, vocab, label_vocab, './data/train.svm.txt')
construct_trainable_matrix(test_data, vocab, label_vocab, './data/test.svm.txt')


from libsvm import svm
from libsvm.svmutil import svm_train,svm_predict,svm_save_model,svm_load_model
from libsvm.commonutil import svm_read_problem
# train svm
train_label, train_feature = svm_read_problem('./data/train.svm.txt')
print(train_label[0], train_feature[0])
# Options: C-SVC (-s 0), linear kernel (-t 0), cost C=5 (-c 5), stopping
# tolerance 0.1 (-e 0.1).
# NOTE: -g sets gamma, which the linear kernel does not use; it only takes
# effect if -t is switched to a polynomial/RBF/sigmoid kernel.
model=svm_train(train_label,train_feature,'-s 0 -c 5 -t 0 -g 0.5 -e 0.1')

# predict
test_label, test_feature = svm_read_problem('./data/test.svm.txt')
print(test_label[0], test_feature[0])
p_labs, p_acc, p_vals = svm_predict(test_label, test_feature, model)

# svm_predict returns p_acc as a tuple (accuracy, mean squared error, squared
# correlation coefficient); only the first element is the accuracy.
print('accuracy: {}'.format(p_acc[0]))


Building prefix dict from the default dictionary ...


马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 来到沈阳，国奥队依然没有摆脱雨水的困扰。7月31日下午6点，国奥队的日常训练再度受到大雨的干扰，无奈之下队员们只慢跑了25分钟就草草收场。31日上午10点，国奥队在奥体中心外场训练的时候，天就是阴沉沉的，气象预报显示当天下午沈阳就有大雨，但幸好队伍上午的训练并没有受到任何干扰。下午6点，当球队抵达训练场时，大雨已经下了几个小时，而且丝毫没有停下来的意思。抱着试一试的态度，球队开始了当天下午的例行训练，25分钟过去了，天气没有任何转好的迹象，为了保护球员们，国奥队决定中止当天的训练，全队立即返回酒店。在雨中训练对足球队来说并不是什么稀罕事，但在奥运会即将开始之前，全队变得“娇贵”了。在沈阳最后一周的训练，国奥队首先要保证现有的球员不再出现意外的伤病情况以免影响正式比赛，因此这一阶段控制训练受伤、控制感冒等疾病的出现被队伍放在了相当重要的位置。而抵达沈阳之后，中后卫冯萧霆就一直没有训练，冯萧霆是7月27日在长春患上了感冒，因此也没有参加29日跟塞尔维亚的热身赛。队伍介绍说，冯萧霆并没有出现发烧症状，但为了安全起见，这两天还是让他静养休息，等感冒彻底好了之后再恢复训练。由于有了冯萧霆这个例子，因此国奥队对雨中训练就显得特别谨慎，主要是担心球员们受凉而引发感冒，造成非战斗减员。而女足队员马晓旭在热身赛中受伤导致无缘奥运的前科，也让在沈阳的国奥队现在格外警惕，“训练中不断嘱咐队员们要注意动作，我们可不能再出这样的事情了。”一位工作人员表示。从长春到沈阳，雨水一路伴随着国奥队，“也邪了，我们走到哪儿雨就下到哪儿，在长春几次训练都被大雨给搅和了，没想到来沈阳又碰到这种事情。”一位国奥球员也对雨水的“青睐”有些不解。


Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.884 seconds.
Prefix dict has been built successfully.


马晓旭/意外/受伤/让/国奥/警惕/ /无奈/大雨/格外/青睐/殷家/军/记者/傅亚雨/沈阳/报道/ /来到/沈阳/，/国奥队/依然/没有/摆脱/雨水/的/困扰/。/7/月/31/日/下午/6/点/，/国奥队/的/日常/训练/再度/受到/大雨/的/干扰/，/无奈/之下/队员/们/只/慢跑/了/25/分钟/就/草草收场/。/31/日/上午/10/点/，/国奥队/在/奥体中心/外场/训练/的/时候/，/天/就是/阴沉沉/的/，/气象预报/显示/当天/下午/沈阳/就/有/大雨/，/但/幸好/队伍/上午/的/训练/并/没有/受到/任何/干扰/。/下午/6/点/，/当/球队/抵达/训练场/时/，/大雨/已经/下/了/几个/小时/，/而且/丝毫/没有/停下来/的/意思/。/抱/着/试一试/的/态度/，/球队/开始/了/当天/下午/的/例行/训练/，/25/分钟/过去/了/，/天气/没有/任何/转好/的/迹象/，/为了/保护/球员/们/，/国奥队/决定/中止/当天/的/训练/，/全队/立即/返回/酒店/。/在/雨/中/训练/对/足球队/来说/并/不是/什么/稀罕/事/，/但/在/奥运会/即将/开始/之前/，/全队/变得/“/娇贵/”/了/。/在/沈阳/最后/一周/的/训练/，/国奥队/首先/要/保证/现有/的/球员/不再/出现意外/的/伤病/情况/以免/影响/正式/比赛/，/因此/这一/阶段/控制/训练/受伤/、/控制/感冒/等/疾病/的/出现/被/队伍/放在/了/相当/重要/的/位置/。/而/抵达/沈阳/之后/，/中/后卫/冯萧霆/就/一直/没有/训练/，/冯萧霆/是/7/月/27/日/在/长春/患上/了/感冒/，/因此/也/没有/参加/29/日/跟/塞尔维亚/的/热身赛/。/队伍/介绍/说/，/冯萧霆/并/没有/出现/发烧/症状/，/但/为了/安全/起/见/，/这/两天/还是/让/他/静养/休息/，/等/感冒/彻底/好/了/之后/再/恢复/训练/。/由于/有/了/冯萧霆/这个/例子/，/因此/国奥队/对雨中/训练/就/显得/特别/谨慎/，/主要/是/担心/球员/们/受凉/而/引发/感冒/，/造成/非战斗/减员/。/而/女足/队员/马晓旭/在/热身赛/中/受伤/导致/无缘/奥运/的/前科/，/也/让/在/沈阳/的/国奥队/现在/格外/警惕/，/“/训练/中/不断/嘱咐/队员/们

ModuleNotFoundError: No module named 'libsvm'