In [1]:
import jieba
import re
import json
import numpy as np
import random

In [2]:
def read_data(course_name):
    """Load and parse the JSON file ``data/<course_name>.json``.

    :param course_name: base name (without extension) of the file inside ``data/``
    :return: the object produced by ``json.load`` for that file
    """
    json_path = 'data/' + course_name + '.json'
    # Binary mode: json.load detects the encoding of the byte stream itself.
    with open(json_path, 'rb') as handle:
        return json.load(handle)

In [27]:
def organize_data(Data):
    """Flatten nested discussion threads into two parallel lists.

    For every thread the thread's own author/content come first, then each reply's
    author/content, each immediately followed by that reply's nested replies
    (which are stored as ``[user, content]`` pairs).

    :param Data: iterable of thread dicts with the keys 'user', 'content' and 'reply'
    :return: tuple ``(All_User, All_Post)`` of equal-length lists
    """
    All_User, All_Post = [], []
    for thread in Data:
        All_User.append(thread['user'])
        All_Post.append(thread['content'])
        for reply in thread['reply']:
            All_User.append(reply['user'])
            All_Post.append(reply['content'])
            # Second-level replies are positional pairs, not dicts.
            for nested in reply['reply']:
                All_User.append(nested[0])
                All_Post.append(nested[1])
    return All_User, All_Post

In [29]:
def remove_chars(text):
    """Strip unwanted characters from ``text`` and split what remains into tokens.

    Only runs of CJK ideographs (U+4E00 to U+9FA5), ASCII letters, or ASCII digits
    are kept; each run is handed to ``jieba.cut`` and the non-empty tokens are
    collected in order of appearance.

    :param text: raw text
    :return: list of tokens
    """
    kept_runs = re.findall(u"[\u4e00-\u9fa5]+|[a-zA-Z]+|[0-9]+", text)
    return [token
            for run in kept_runs
            for token in jieba.cut(run)
            if len(token) >= 1]

In [33]:
def load_train_data(course_name):
    """Load the labelled samples of a course and tokenize them.

    Reads ``<course_name>_test_data.txt`` (one post per line, UTF-8) and
    ``<course_name>_test_data_label.txt`` (one integer label per line).

    :param course_name: path prefix shared by the two files
    :return: tuple ``(posting_list, data_set_label, train_data)``: the tokenized
             posts, the integer labels, and the raw post lines
    :raises ValueError: if a label line cannot be parsed as an integer
    """
    # The file prefix is this function's own parameter; the previous body read an
    # undefined name `filename` and only worked when such a global leaked in.
    with open(course_name + '_test_data.txt', 'r', encoding='utf-8') as f:
        train_data = [line.rstrip('\n') for line in f]

    posting_list = [remove_chars(post) for post in train_data]

    # Labels are plain integers, so an explicit UTF-8 decode is safe and avoids
    # depending on the platform's default encoding.
    with open(course_name + '_test_data_label.txt', 'r', encoding='utf-8') as f:
        data_set_label = [int(line.rstrip('\n')) for line in f]

    return posting_list, data_set_label, train_data

In [32]:
def createvocablist(data_set):
    """Build the vocabulary: every distinct element occurring in any item of ``data_set``.

    :param data_set: iterable of iterables (e.g. one token list per document)
    :return: list of the distinct elements; the order is not defined because it
             comes from iterating a set
    """
    vocabulary = set()
    for document in data_set:
        vocabulary.update(document)
    return list(vocabulary)

In [34]:
def bag_words2vec(vocab_list, input_set):
    """Bag-of-words vector: count how often each vocabulary word occurs in a document.

    Words of ``input_set`` that are not in ``vocab_list`` are ignored. If a word
    appears more than once in ``vocab_list``, its count goes to the first position.

    :param vocab_list: list of vocabulary words (must be hashable)
    :param input_set: iterable with the tokens of a single document
    :return: list of ints with the same length as ``vocab_list``
    """
    # Build the word -> first position map once, so each token costs one dict lookup
    # instead of two linear scans of the vocabulary (`in` followed by `.index`).
    first_position = {}
    for position, word in enumerate(vocab_list):
        first_position.setdefault(word, position)

    result = [0] * len(vocab_list)
    for word in input_set:
        position = first_position.get(word)
        if position is not None:
            result[position] += 1
    return result

In [35]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + e**-x)`` for a scalar ``x``.

    :param x: scalar input
    :return: value in [0, 1]
    """
    if x > 0:
        # exp(-x) <= 1 here, so this form cannot overflow.
        return 1.0 / (1 + np.exp(-x))
    # For x <= 0 use the algebraically equal form e**x / (1 + e**x): exp(x) <= 1, so
    # it cannot overflow either. (Dividing by 1 + exp(-x) here would both give the
    # wrong value and overflow for very negative x.)
    exp_x = np.exp(x)
    return exp_x / (1 + exp_x)


def stoc_grad_ascent(data_mat, class_labels, num_iter=150):
    """Stochastic gradient ascent for logistic-regression weights.

    Every pass visits each sample exactly once in random order (sampling without
    replacement) and updates the weights from that single sample. The step size
    shrinks with the pass and step counters but never drops below 0.01.

    :param data_mat: feature matrix, 2-D ndarray with one row per sample
    :param class_labels: sequence of class labels, indexable by row number
    :param num_iter: number of passes over the data
    :return: ndarray with one weight per feature column
    """
    m, n = np.shape(data_mat)
    weights = np.ones(n)
    for epoch in range(num_iter):
        remaining = list(range(m))
        for step in range(m):
            alpha = 4 / (1.0 + step + epoch) + 0.01
            # np.random.uniform draws from [0, len(remaining)); truncating to int
            # yields a valid position in `remaining`.
            pick = int(np.random.uniform(0, len(remaining)))
            # Take the sample out of the pool so it is not reused in this pass.
            sample = remaining.pop(pick)
            h = sigmoid(np.sum(data_mat[sample] * weights))
            error = class_labels[sample] - h
            weights = weights + alpha * error * data_mat[sample]
    return weights


def classify_vector(in_x, weights):
    """Classify one feature vector with the fitted regression weights.

    The sigmoid of the weighted feature sum is compared against 0.5.

    :param in_x: feature vector
    :param weights: regression weights, same length as ``in_x``
    :return: 1.0 if the sigmoid value is greater than 0.5, otherwise 0.0
    """
    score = np.sum(in_x * weights)
    return 1.0 if sigmoid(score) > 0.5 else 0.0


def colic_test(training_set, training_labels, num):
    """Hold out samples ``num .. num+99``, fit on the rest, and report the error rate.

    Prints the index of every misclassified held-out sample and the final error rate.

    :param training_set: list of feature vectors (a list, so that ``+`` concatenates)
    :param training_labels: list of labels aligned with ``training_set``
    :param num: index of the first held-out sample
    :return: fraction of the 100 held-out samples that were misclassified
    """
    held_out = range(num, num + 100)

    # Fit on everything outside the held-out window, using 500 passes.
    fit_features = training_set[:num] + training_set[num+100:]
    fit_labels = training_labels[:num] + training_labels[num+100:]
    training_weights = stoc_grad_ascent(np.array(fit_features), fit_labels, 500)

    # Score the held-out window and count the mismatches.
    error_count = 0
    for i in held_out:
        predicted = int(classify_vector(np.array(training_set[i]), training_weights))
        if predicted != int(training_labels[i]):
            print(i)
            error_count += 1

    error_rate = error_count / float(len(held_out))
    print("the error rate is {}".format(error_rate))
    return error_rate

In [36]:
def calculate_error_rate(filename):
    """Cross-validate the bag-of-words logistic-regression classifier for one course.

    Loads the course posts and the labelled samples, builds the vocabulary from the
    tokenized posts, vectorizes the labelled samples, and runs ``colic_test`` with
    hold-out windows starting at 0, 100, ..., 900.

    :param filename: course name, passed to both ``read_data`` and ``load_train_data``
    :return: tuple ``(ave_error, std)``: mean and sample standard deviation
             (``ddof=1``) of the ten per-window error rates
    """
    data = read_data(filename)
    _, all_posts = organize_data(data)
    posting_list, data_set_label, _ = load_train_data(filename)

    # The vocabulary has to be built from token lists. Passing the raw post strings
    # makes createvocablist collect single characters, so every multi-character
    # token would be silently dropped when the samples are vectorized.
    vocab_list = createvocablist([remove_chars(post) for post in all_posts])

    train_matrix = [bag_words2vec(vocab_list, tokens) for tokens in posting_list]

    # NOTE(review): the windows 0..900 with 100 samples each assume at least 1000
    # labelled samples; confirm against the data files.
    errors = [colic_test(train_matrix, data_set_label, num)
              for num in range(0, 1000, 100)]

    ave_error = np.average(np.array(errors))
    std = np.std(np.array(errors), ddof=1)
    return ave_error, std

In [37]:
# Preview the first ten records of `data`.
# NOTE(review): `data` is not assigned at top level in any cell visible here;
# presumably it holds the result of read_data(...) from an earlier run. Confirm and
# add the assignment so the notebook survives Restart Kernel -> Run All.
data[:10]

[{'user': 'm15589275972...',
  'content': '问题3是怎么进行周末和工作日的累加的:',
  'reply': [{'user': 'mooc15060420...',
    'content': '用的分支语句啊，遇到周末就减，遇到工作日就加',
    'reply': []}]},
 {'user': '小臣ykt15343...',
  'content': '每周的测试习题在哪里看到？捂脸:',
  'reply': [{'user': '高傲給誰看',
    'content': '在Python123平台',
    'reply': [['mooc44422683...', 'Python123有习题的链接吗'],
     ['小臣ykt15343...', '感谢，找到了']]}]},
 {'user': '小吵小闹小任...',
  'content': '结课问题:课程作业都发布了吗？如何申请结课？',
  'reply': [{'user': '高傲給誰看', 'content': '我也想问', 'reply': []}]},
 {'user': 'ykt152488709...',
  'content': '“不确定尾数”真的好奇怪。。。:如果有个考试判断题0.1+0.2==0.3让我判断真假，我肯定就答错了。',
  'reply': [{'user': 'BIT112016116...助教',
    'content': '你可以到IDLE中运行一下，亲自看一下结果',
    'reply': []}]},
 {'user': 'hawkofbattle',
  'content': '用户登录的三次机会:代码1：foriinrange(3):username=input()password=input()ifusername==\'Kate\'andpassword==\'666666\':print("登录成功")else:continueprint("3次用户名或者密码均有误！退出程序。")代码2：num=0foriinrange(3):username=input()password=input()ifusername!=\'Kate\'orpassword!=\'66666