# 处理情绪识别接口返回的 JSON

In [1]:
import pandas as pd
import json
import demjson
import numpy as np
import joblib

In [13]:
def parse_text(text):
    """Decode one raw response line into Python objects.

    The decoding is delegated to ``demjson.decode``. An earlier approach
    (swap single quotes for double quotes, then ``json.loads``) was
    abandoned in favour of this call.

    Parameters
    ----------
    text : str
        One raw response line.

    Returns
    -------
    object
        Whatever ``demjson.decode`` produces for ``text``.
    """
    decoded = demjson.decode(text)
    return decoded


def extract_emotions(items_dict):
    """Flatten one decoded response into a ``{label: prob}`` dictionary.

    Parameters
    ----------
    items_dict : dict
        Decoded response. A usable response has an ``'items'`` list with
        exactly three entries; each entry has ``'label'`` and ``'prob'``
        and may have a ``'subitems'`` list whose entries also have
        ``'label'`` and ``'prob'``. A failed request has ``'error_msg'``
        instead.

    Returns
    -------
    dict
        ``label -> prob`` for every item and sub-item. When the response
        is an error or does not have exactly three items, the fallback
        ``{'neutral': 1.0, 'optimistic': 0.0, 'pessimistic': 0.0}`` is
        returned (a fresh dict on every call).
    """
    expected_item_count = 3
    fallback = {'neutral': 1.0, 'optimistic': 0.0, 'pessimistic': 0.0}

    # An over-long input is an expected failure: fall back without logging.
    if items_dict.get('error_msg') == 'input text too long':
        return fallback

    # Validate explicitly instead of `assert` inside a bare `except:`;
    # asserts vanish under `python -O` and a bare except hides real bugs.
    items = items_dict.get('items')
    if not isinstance(items, (list, tuple)) or len(items) != expected_item_count:
        # Surface the unexpected response so it can be inspected.
        print(items_dict)
        return fallback

    emotions_dict = {}
    for item in items:
        emotions_dict[item['label']] = item['prob']
        # Sub-items are flattened into the same dictionary; a missing or
        # empty 'subitems' entry simply contributes nothing.
        for subitem in item.get('subitems') or ():
            emotions_dict[subitem['label']] = subitem['prob']

    return emotions_dict

In [3]:
# Load the train/test records. The encoding is pinned to UTF-8 (the
# encoding JSON interchange files use) so reading the Chinese text does
# not depend on the platform's locale default.
with open('../dataset/train.json', 'r', encoding='utf-8') as f:
    train = json.load(f)
with open('../dataset/test.json', 'r', encoding='utf-8') as f:
    test = json.load(f)

# Number of records in each split.
len(train), len(test)

(32193, 1613)

## Content

In [5]:
# Read the raw responses, one per line, with surrounding whitespace removed.
# NOTE(review): assumes the response dumps were saved as UTF-8 — confirm
# against whatever wrote these files.
with open('./baidu_train.txt', 'r', encoding='utf-8') as src:
    train_lines = [line.strip() for line in src]

with open('./baidu_test.txt', 'r', encoding='utf-8') as src:
    test_lines = [line.strip() for line in src]

# Number of response lines in each file.
len(train_lines), len(test_lines)

(32193, 1613)

In [7]:
# Show the first raw response line to see the format parse_text receives.
train_lines[0]

"{'log_id': 3283784587524047522, 'items': [{'prob': 0.995265, 'label': 'neutral', 'subitems': [], 'replies': []}, {'prob': 0.00297933, 'label': 'optimistic', 'subitems': [], 'replies': []}, {'prob': 0.00175571, 'label': 'pessimistic', 'subitems': [], 'replies': []}], 'text': '回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：'}"

In [6]:
# Sanity check: decode the first response line and flatten it to label -> prob.
extract_emotions(parse_text(train_lines[0]))

{'neutral': 0.995265, 'optimistic': 0.00297933, 'pessimistic': 0.00175571}

In [8]:
# Show the content text of the first training record.
train[0]['content']

'回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：'

In [9]:
# Show the full first training record.
train[0]

{'category': '文体娱乐',
 'category_label': 2,
 'content': '回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：',
 'content_words': '回复 新浪 网友 对 博文 【 国家文物局 限制 鉴宝 节目 现场 估价 转 】 的 评论 ： ; ; 查看 原文 ：',
 'fake_label': 0}

In [17]:
# Attach the flattened emotion probabilities to every training record.
# Responses are matched to records purely by position, so a length mismatch
# would silently mislabel records; fail loudly instead.
if len(train_lines) != len(train):
    raise ValueError(
        'train has %d records but %d response lines' % (len(train), len(train_lines)))
for piece, line in zip(train, train_lines):
    piece['content_emotions'] = extract_emotions(parse_text(line))

{'error_code': 18, 'error_msg': 'Open api qps request limit reached'}
{'error_code': 282134, 'log_id': 2348648912883782530, 'error_msg': 'input empty'}


In [14]:
# Attach the flattened emotion probabilities to every test record.
# Responses are matched to records purely by position, so a length mismatch
# would silently mislabel records; fail loudly instead.
if len(test_lines) != len(test):
    raise ValueError(
        'test has %d records but %d response lines' % (len(test), len(test_lines)))
for piece, line in zip(test, test_lines):
    piece['content_emotions'] = extract_emotions(parse_text(line))

In [18]:
# Show the first training record again; it now carries 'content_emotions'.
train[0]

{'category': '文体娱乐',
 'category_label': 2,
 'content': '回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：',
 'content_emotions': {'neutral': 0.995265,
  'optimistic': 0.00297933,
  'pessimistic': 0.00175571},
 'content_words': '回复 新浪 网友 对 博文 【 国家文物局 限制 鉴宝 节目 现场 估价 转 】 的 评论 ： ; ; 查看 原文 ：',
 'fake_label': 0}

In [19]:
# Show the first raw response line again.
train_lines[0]

"{'log_id': 3283784587524047522, 'items': [{'prob': 0.995265, 'label': 'neutral', 'subitems': [], 'replies': []}, {'prob': 0.00297933, 'label': 'optimistic', 'subitems': [], 'replies': []}, {'prob': 0.00175571, 'label': 'pessimistic', 'subitems': [], 'replies': []}], 'text': '回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：'}"

In [20]:
# Write the enriched records back over the original dataset files.
# ensure_ascii=False emits the Chinese text as-is, so the file encoding is
# pinned to UTF-8 rather than left to the platform's locale default.
with open('../dataset/train.json', 'w', encoding='utf-8') as f:
    json.dump(train, f, ensure_ascii=False, indent=4, sort_keys=True)
with open('../dataset/test.json', 'w', encoding='utf-8') as f:
    json.dump(test, f, ensure_ascii=False, indent=4, sort_keys=True)

# 生成带情绪标签的 DataFrame

In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
# Reload the train/test records. The encoding is pinned to UTF-8 so reading
# the Chinese text does not depend on the platform's locale default.
with open('../dataset/train.json', 'r', encoding='utf-8') as f:
    train = json.load(f)
with open('../dataset/test.json', 'r', encoding='utf-8') as f:
    test = json.load(f)

# Number of records in each split.
len(train), len(test)

(32193, 1613)

情绪二级分类标签：客服模型正向（thankful 感谢、happy 愉快）；客服模型负向（complaining 抱怨、angry 愤怒）；闲聊模型正向（like 喜爱、happy 愉快）；闲聊模型负向（angry 愤怒、disgusting 厌恶、fearful 恐惧、sad 悲伤）。

In [3]:
# Second-level emotion labels plus 'neutral', kept in alphabetical order.
emotions = sorted([
    'complaining', 'angry', 'disgusting', 'fearful',
    'sad', 'thankful', 'happy', 'like', 'neutral',
])

emotions

['angry',
 'complaining',
 'disgusting',
 'fearful',
 'happy',
 'like',
 'neutral',
 'sad',
 'thankful']

In [4]:
# Show the emotion probabilities stored on the first training record.
train[0]['content_emotions']

{'neutral': 0.995265, 'optimistic': 0.00297933, 'pessimistic': 0.00175571}

In [6]:
# Show the content text of the first training record.
train[0]['content']

'回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：'

In [7]:
def get_label_from_emotions(emotions_dict):
    """Choose a single emotion label from a ``{label: prob}`` mapping.

    Labels are ranked by probability, highest first. The coarse polarity
    labels ``'pessimistic'`` and ``'optimistic'`` are skipped when they
    occupy the first and/or second rank, so the result is the first of the
    top three labels that is not a polarity label (or the third-ranked
    label when the top two are both polarity labels).

    Parameters
    ----------
    emotions_dict : dict
        Mapping of label to probability.

    Returns
    -------
    str
        The selected label.
    """
    polarity_labels = ['pessimistic', 'optimistic']
    ranked = sorted(emotions_dict.items(),
                    key=lambda pair: pair[1], reverse=True)

    top_label = ranked[0][0]
    if top_label not in polarity_labels:
        return top_label

    runner_up = ranked[1][0]
    if runner_up not in polarity_labels:
        return runner_up

    return ranked[2][0]

In [8]:
# Sanity check: derive the emotion label for the first training record.
get_label_from_emotions(train[0]['content_emotions'])

'neutral'

In [10]:
# Show the full first training record.
train[0]

{'category': '文体娱乐',
 'category_label': 2,
 'content': '回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：',
 'content_emotions': {'neutral': 0.995265,
  'optimistic': 0.00297933,
  'pessimistic': 0.00175571},
 'content_words': '回复 新浪 网友 对 博文 【 国家文物局 限制 鉴宝 节目 现场 估价 转 】 的 评论 ： ; ; 查看 原文 ：',
 'fake_label': 0}

In [18]:
def gen_emotioin_df(pieces, dataset_type):
    """Build a DataFrame with one row per record in ``pieces``.

    Parameters
    ----------
    pieces : sequence of dict
        Records that each provide ``'content'``, ``'content_emotions'``,
        ``'fake_label'`` and ``'category'``.
    dataset_type : str
        Value written to the ``'dataset'`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns: ``'index'`` (position of the record in ``pieces``),
        ``'content'``, ``'label'`` (the record's ``'fake_label'``),
        ``'category'``, ``'publisher_emotion'`` (the label chosen by
        ``get_label_from_emotions``) and ``'dataset'``.
    """
    columns = {
        'index': list(range(len(pieces))),
        'content': [record['content'] for record in pieces],
        'label': [record['fake_label'] for record in pieces],
        'category': [record['category'] for record in pieces],
        'publisher_emotion': [
            get_label_from_emotions(record['content_emotions'])
            for record in pieces
        ],
    }

    frame = pd.DataFrame(columns)
    frame['dataset'] = dataset_type
    return frame

In [19]:
# Build one DataFrame per split; the second argument ends up in the 'dataset' column.
train_df = gen_emotioin_df(train, 'train')

test_df = gen_emotioin_df(test, 'test')

# Row counts of both frames.
len(train_df), len(test_df)

(32193, 1613)

In [20]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,category,content,index,label,publisher_emotion,dataset
0,文体娱乐,回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：,0,0,neutral,train
1,社会生活,//分享网易新闻:《发生在昆明的火锅店老板“辱滇门”，云南人该愤怒还是羞愧》|发生在昆明.....,1,0,neutral,train
2,社会生活,西宁城管围殴民警扬言要把警察打死|西宁城管围...,2,0,angry,train
3,社会生活,【川航航班因驾驶舱风挡破裂安全备降成都】今天上午6:26从重庆江北国际机场出发前往拉萨的四川...,3,0,neutral,train
4,社会生活,支持郑强！！！//【贵州大学校长回应空姐言论:常给她们写感谢信】,4,0,neutral,train


In [21]:
# Combine the train and test frames into one DataFrame and show its length.
df = pd.concat([train_df, test_df])
len(df)

33806

In [22]:
# Split the combined frame by the 'label' column: rows with 1 go to
# rumor_df, rows with 0 go to truth_df.
rumor_df = df.loc[df['label'].eq(1)]
truth_df = df.loc[df['label'].eq(0)]

# Size of each subset.
len(rumor_df), len(truth_df)

(16841, 16965)

In [23]:
# Export both subsets without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
rumor_df.to_csv('./emotion_rumor.csv', index=False)
truth_df.to_csv('./emotion_truth.csv', index=False)