read data

In [1]:
import pandas as pd
import numpy as np
import json

seasons = pd.read_csv("./data/bangumi.csv", delimiter=",", encoding="utf-8")
episodes = pd.read_csv("./data/episode.csv", delimiter=",", encoding="utf-8")
danmakus = pd.read_csv("./data/danmaku.csv", delimiter="\t", encoding="utf-8")

选择出正样本所在的season弹幕数据

In [2]:
danmaku_select = danmakus[danmakus['season_id'] == 21466]

train danmaku initial embedding

In [2]:
from gensim.models.word2vec import Word2Vec
import multiprocessing

class TscTaggedDocument(object):
    def __init__(self, danmakus):
        self.danmakus = danmakus

    def __iter__(self):
        for index, row in self.danmakus.iterrows():
            content_words = [word.encode('utf-8').decode('unicode_escape') for word in row["content"]]
            yield content_words
            
            
tsc_docs = TscTaggedDocument(danmakus)
model = Word2Vec(size=200, window=5, min_count=2, iter=10, workers=multiprocessing.cpu_count())
print('Building vocabulary......')
model.build_vocab(tsc_docs)
print('Training doc2vec model......')
model.train(tsc_docs, total_examples=model.corpus_count, epochs=model.iter)
print('Vocabulary size:', len(model.wv.vocab))
model.save("./models/danmaku_word2vec_200.model")

Building vocabulary......
Training doc2vec model......




Vocabulary size: 70


use model to get representation

In [3]:
from gensim.models.word2vec import Word2Vec
word_model = Word2Vec.load("./models/danmaku_word2vec_200.model")
word_dim = 200
print(len(word_model.wv.vocab))

218565


danmaku content processing

In [4]:
def word_split(content_string):
    content_words = []
    word_list = content_string[1:-1].split(',')
    for word in word_list:
        word = word.strip()
        if word.startswith('u'):
            content_words.append(word[2:-1].encode('utf-8').decode('unicode_escape'))
        elif len(word) > 0:
            content_words.append(word[1:-1])
    return content_words

构造episode-level embedding

In [8]:
episode_grouped = danmakus.groupby('episode_id')
episode_init_model = dict()
for episode_id, group_data in episode_grouped:
    group_data.sort_values(by='playback_time')
    count = 0
    sum = np.zeros(word_dim)
    for index, row in group_data.iterrows():
        words = word_split(row['content'])
        if len(words) == 0:
            continue
        for word in words:
            if word in word_model.wv.vocab:
                sum += word_model.wv[word]
                count += 1
    episode_init_model[episode_id] = (sum/count).tolist()

with open("./models/episode_word2vec_200.json", "w") as f:
    json_str = json.dumps(episode_init_model)
    f.write(json_str)

构造user-level embedding

In [10]:
user_grouped = danmakus.groupby('sender_id')
user_init_model = dict()
for sender_id, group_data in user_grouped:
    group_data.sort_values(by='post_time')
    count = 0
    sum = np.zeros(word_dim)
    for index, row in group_data.iterrows():
        words = word_split(row['content'])
        if len(words) == 0:
            continue
        for word in words:
            if word in word_model.wv.vocab:
                sum += word_model.wv[word]
                count += 1
    user_init_model[sender_id] = (sum/count).tolist()

with open("./models/user_word2vec_200.json", "w") as f:
    json_str = json.dumps(user_init_model)
    f.write(json_str)



KeyboardInterrupt: 

分词预处理

In [5]:
import jieba.posseg as segtool
import re

ACCEPTABLE_TYPE = {'n', 't', 's', 'f', 'v', 'a', 'b', 'z', 'e', 'y', 'o'}
REPLACE_DICT = {
    "233+": "233",
    "666+": "666"
}

def check_type(word_type):
    if word_type[0] in ACCEPTABLE_TYPE:
        return True
    else:
        return False

def check_replace(word):
    for item in REPLACE_DICT.keys():
        pattern = re.compile(item)
        if re.match(pattern, word) is not None:
            new_word = REPLACE_DICT[item]
            return new_word
    return word

def word_segment(content):
    words = []
    results = segtool.cut(content)
    for result in results:
        result.word = check_replace(result.word)
        if check_type(result.flag):
            words.append(result.word)
    return words

检查数据中正样本的情况

In [6]:
import pandas as pd
import numpy as np

positive_comments = pd.read_csv("./data/comment_positive.csv", delimiter=",", encoding="utf-8")

valid_pos_comments_feature = []
valid_pos_comments_set = set()

for index, row in positive_comments.iterrows():
    raw_id = row["raw_id"]
    valid_pos_comments_set.add(raw_id)
    content = danmaku_select[danmaku_select['tsc_raw_id'] == raw_id]["content"].tolist()
    if len(content) > 0:
        content = content[0]
    else:
        continue
    words = word_split(content)
    if len(words) > 0:
        danmaku_repr = np.zeros(word_dim)
        valid_words = 0
        for word in words:
            if word in word_model.wv.vocab:
                valid_words += 1
                danmaku_repr += word_model.wv[word]
        if valid_words > 0:
            valid_pos_comments_feature.append(danmaku_repr/valid_words)
            
pos_comment_df = pd.DataFrame(valid_pos_comments_feature)
pos_comment_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.459209,0.068840,0.518663,-0.718769,0.204816,-0.660420,-0.058094,-0.781625,0.366771,-0.038685,...,-0.086026,0.273948,0.282555,0.598565,-0.159056,-0.311861,-0.043978,0.251012,-0.011492,0.918425
1,0.415522,-0.520065,0.529765,-0.476499,0.330392,-0.792040,0.130760,-0.660380,0.102713,-0.097189,...,-0.267110,-0.376386,-0.121246,0.608357,-0.140185,-0.554184,-0.296807,0.070977,0.509366,0.143152
2,-0.502529,-0.145655,0.972626,-0.801836,0.415211,0.197470,0.173534,0.118130,0.074215,-0.695778,...,-0.229959,0.232918,0.288041,-0.101529,-0.198493,-0.123311,-0.021655,0.819683,-0.138086,0.393017
3,-0.605836,0.253396,0.558612,-1.028826,0.588377,-0.195169,-0.585407,0.063359,1.261360,-0.679766,...,-0.120399,0.316127,-0.391617,0.665402,-0.412780,-0.706795,-0.165125,0.590001,0.408509,1.308532
4,-0.250050,-0.327821,0.652581,-0.944954,0.932400,0.099642,-0.828530,-0.318162,1.252360,-0.804862,...,0.129062,0.495875,-0.229700,0.671438,-1.042553,-1.062898,-0.416367,0.585158,0.057961,1.037994
5,-0.388689,-0.487382,0.350064,-0.847201,0.904711,-0.010024,-0.711818,-0.604251,1.252509,-0.854352,...,0.216819,0.014552,0.101026,0.015726,-1.162356,-1.175684,-0.095987,0.820141,0.243466,1.094006
6,-0.445425,-0.137197,0.792048,-0.542641,0.641200,0.047805,-0.715514,-0.600879,1.093348,-0.962830,...,0.402987,-0.009034,-0.278303,0.529525,-0.464595,-1.142755,-0.316046,0.739474,0.266887,1.128816
7,-0.679646,-0.297002,0.491381,-0.736689,0.369226,0.115069,-0.333817,-0.373104,0.471472,-0.442861,...,-0.279553,0.320144,-0.053601,0.613148,-0.253355,-0.353962,-0.241383,0.076651,0.311255,0.975434
8,-0.372398,-0.610164,1.265838,-0.660937,0.631989,0.498103,-0.758884,-0.894397,0.876487,-0.575547,...,-0.044085,0.377894,0.183333,0.371252,-0.465231,-0.485653,-0.403355,0.816513,0.012177,0.900014
9,-0.522827,-0.055829,0.195314,-0.175003,0.335684,0.078050,-0.508540,-0.177019,0.568508,-0.189050,...,-0.008496,0.253767,-0.226979,0.005852,-0.422980,-0.678832,-0.037642,0.248513,0.184042,0.567655


检查数据中负样本的情况

In [7]:
import pandas as pd
import numpy as np

negative_comments = pd.read_csv("./data/comment_negative.csv", delimiter=",", encoding="utf-8")

valid_neg_comments_feature = []

for index, row in negative_comments.iterrows():
    content = row["Comment"].strip()
    content = content.replace("*","")
    words = word_segment(content)
    if len(words) > 0:
        danmaku_repr = np.zeros(word_dim)
        valid_words = 0
        for word in words:
            if word in word_model.wv.vocab:
                valid_words += 1
                danmaku_repr += word_model.wv[word]
        if valid_words > 0:
            valid_neg_comments_feature.append(danmaku_repr/valid_words)

neg_comment_df = pd.DataFrame(valid_neg_comments_feature)
neg_comment_df

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.550 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.093214,0.234901,0.326423,-0.457833,-0.022173,-0.520010,-0.342608,-0.563632,-0.022795,0.075283,...,-0.462436,0.431695,-0.231745,0.616420,-0.267103,-0.399343,-0.757096,0.274162,0.148801,0.561109
1,-0.046861,0.101023,0.218520,-0.301838,-0.302505,-0.296510,-0.084231,-0.673560,0.107082,-0.188147,...,-0.121978,-0.134977,-0.303613,0.171675,-0.092439,-0.350739,0.137807,0.149944,-0.481022,0.899456
2,-1.118985,1.218978,1.667174,-1.637353,0.723845,0.254608,0.215223,-2.166042,0.184661,-1.528602,...,-0.053584,-0.174243,0.396333,1.152659,0.875525,-0.390142,0.376761,0.654022,0.180446,1.980457
3,-0.126913,0.084239,0.367800,-0.698585,0.219721,-0.075717,-0.157180,-0.847253,0.730104,-0.244634,...,-0.127399,0.470658,-0.143121,0.545181,-0.502790,-0.690864,0.036942,0.106718,-0.097678,1.304486
4,0.355777,0.301485,0.232154,-0.396937,0.122256,-0.156237,-0.345060,-0.210060,0.507227,0.150827,...,-0.388459,0.487599,-0.040993,0.272039,0.278677,0.157769,0.251669,-0.153759,0.484767,0.003680
5,0.234052,0.267938,0.555166,-0.452271,-0.234209,-0.146220,-0.219393,-0.087287,0.214024,0.220228,...,-0.514380,-0.062151,-0.277385,0.634872,-0.109886,-0.226255,0.056029,-0.051635,0.025559,-0.222913
6,0.018057,-0.036373,-0.086777,0.007554,0.020443,-0.000193,0.029691,0.139705,-0.046951,0.043615,...,-0.075237,-0.044530,-0.033343,-0.048179,0.025741,0.000301,-0.028415,-0.044635,0.033520,-0.030570
7,-0.297868,-0.107621,0.639645,-0.254791,0.193994,0.003680,-0.541694,-0.318016,0.510039,-0.287715,...,0.071975,0.266646,0.217845,0.381887,-0.196160,-0.393395,-0.505925,0.246128,0.327584,0.644617
8,0.081378,0.171632,0.562786,-0.493354,0.187196,-0.325732,-0.163702,-0.241334,0.281500,-0.065029,...,0.005486,0.222957,-0.027236,0.372470,0.003413,-0.320036,-0.390210,0.334737,0.291110,0.606478
9,-0.049974,0.356466,0.185890,-0.001774,0.181924,-0.146917,-0.055284,-0.245515,0.321074,0.155236,...,-0.184260,-0.111058,0.365956,0.471106,-0.013849,-0.475427,0.121737,0.012350,-0.318564,0.701413


处理未标注的数据

In [8]:
import pandas as pd
import numpy as np

valid_unlabel_comments_feature = []

for index, row in danmaku_select.iterrows():
    if row['tsc_raw_id'] in valid_pos_comments_set:
        continue
    words = word_split(row['content'])
    if len(words) > 0:
        danmaku_repr = np.zeros(word_dim)
        valid_words = 0
        for word in words:
            if word in word_model.wv.vocab:
                valid_words += 1
                danmaku_repr += word_model.wv[word]
        if valid_words > 0:
            valid_unlabel_comments_feature.append(danmaku_repr/valid_words)

unlabel_comment_df = pd.DataFrame(valid_unlabel_comments_feature)
unlabel_comment_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.710068,-0.425862,-1.234477,-0.886300,-0.515200,0.376286,-1.508969,-0.509257,0.377408,0.030451,...,0.047656,-0.951637,1.175290,1.529796,-0.208388,0.383744,-0.274727,-0.525731,-0.917708,0.927526
1,-0.428284,0.650482,1.149429,-0.872477,0.039854,0.668097,-0.139339,-0.443599,0.251930,-0.028038,...,-0.493739,0.378452,0.333707,-0.468525,-0.092226,-1.360009,0.022293,1.362598,0.068388,0.430743
2,0.362325,0.493365,-0.096665,-0.879787,0.336509,0.374589,-0.481267,-0.244594,-0.441792,-0.756685,...,-0.415609,-0.922351,0.169621,0.338962,0.392300,-1.065802,-0.480112,-0.237790,-1.112316,0.213897
3,0.095680,0.711412,0.845737,-1.758077,0.482232,-0.210422,-0.793766,-0.506434,0.916245,0.342655,...,-0.462445,-0.264960,-0.163756,0.437941,-0.716418,-0.993283,-0.233921,0.212670,0.370048,1.063268
4,0.291820,-0.313926,-1.778921,-1.409899,-1.497622,-1.793691,-0.136184,1.824675,-0.767333,0.476449,...,-1.473278,-0.934512,0.308475,-0.322255,1.690493,-0.087175,-0.611087,-0.147480,1.120687,0.235808
5,-0.152762,-0.149410,1.015536,-0.518363,0.140204,0.109658,-0.394955,-0.494403,0.457740,0.596882,...,0.351076,-0.506121,0.159149,0.207646,-0.076606,-0.299561,0.046723,0.500603,-0.531187,0.656607
6,-0.347332,-0.164360,0.080068,0.589224,-0.612396,-0.230567,-0.817353,-0.163048,0.055350,0.777947,...,-1.162455,-0.749265,-0.191292,0.108478,-0.313826,1.691357,0.280301,-0.510343,-0.446354,-1.010391
7,0.595376,0.686187,0.929171,-0.024496,-0.317161,-0.135015,-1.228422,0.267147,1.301377,-0.337133,...,1.004788,-0.388818,0.210706,0.551006,-0.595026,-0.321587,-0.680125,-0.064982,0.996624,0.245447
8,0.434490,1.817477,-0.563028,-2.065074,-0.138434,-0.862455,-1.074647,-2.959245,1.357973,-0.736732,...,-0.662977,0.664609,0.781766,0.340236,0.289605,-0.213365,0.394351,0.592313,-2.288081,1.238084
9,-0.736681,-0.383736,-0.550500,-0.610167,-0.170146,0.626142,0.013986,-0.943299,0.354512,-0.137658,...,-0.229301,-1.319806,0.784606,-0.279426,0.566086,-0.133665,0.841342,0.312341,0.233349,-0.256725


生成人工验证集

In [15]:
import pandas as pd
import numpy as np

manual_valid_comments_feature = []
manual_valid_index = []

manual_valid_comments = danmaku_select.sample(n=100)

for index, row in manual_valid_comments.iterrows():
    words = word_split(row['content'])
    if len(words) > 0:
        danmaku_repr = np.zeros(word_dim)
        valid_words = 0
        for word in words:
            if word in word_model.wv.vocab:
                valid_words += 1
                danmaku_repr += word_model.wv[word]
        if valid_words > 0:
            manual_valid_index.append(index)
            manual_valid_comments_feature.append(danmaku_repr/valid_words)

manual_valid_df = pd.DataFrame(manual_valid_comments_feature)
print(manual_valid_df.shape)

manual_valid_comments_ = manual_valid_comments.loc[manual_valid_index]
manual_valid_comments_.to_csv("manual.csv", sep="\t", index=False)

(93, 200)


生成数据集

In [12]:
from __future__ import print_function
import random
import numpy as np
from collections import defaultdict
import pickle
from sklearn.model_selection import train_test_split


def shuffle_comments_labels(comments, labels):
    assert comments.shape[0] == labels.shape[0]
    randomize = np.arange(comments.shape[0])
    np.random.shuffle(randomize)
    return comments[randomize], labels[randomize]


def dump_pickle(filepath, d):
    with open(filepath, "wb") as f:
        pickle.dump(d, f)

random.seed(42)
np.random.seed(42)
    
train_unlabelled_comments = unlabel_comment_df.values
train_unlabelled_labels = np.ones(unlabel_comment_df.shape[0]) * -1.


train_labelled_pos_data_comments, test_labelled_pos_data_comments = train_test_split(pos_comment_df.values, 
                                                                                     test_size=0.2, 
                                                                                     random_state=42, 
                                                                                     shuffle=True)
    
train_labelled_neg_data_comments, test_labelled_neg_data_comments = train_test_split(neg_comment_df.values, 
                                                                                     test_size=0.2, 
                                                                                     random_state=42, 
                                                                                     shuffle=True)
    
train_labelled_pos_data_comments, valid_labelled_pos_data_comments = train_test_split(train_labelled_pos_data_comments,
                                                                                      test_size=0.2, 
                                                                                      random_state=42, 
                                                                                      shuffle=True)
    
train_labelled_neg_data_comments, valid_labelled_neg_data_comments = train_test_split(train_labelled_neg_data_comments, 
                                                                                      test_size=0.2, 
                                                                                      random_state=42, 
                                                                                      shuffle=True)
    

train_labelled_comments = np.concatenate((train_labelled_pos_data_comments, train_labelled_neg_data_comments),
                                         axis=0)
train_labelled_labels = np.append(np.zeros(train_labelled_pos_data_comments.shape[0]), 
                                       np.ones(train_labelled_neg_data_comments.shape[0]))
    
validation_comments = np.concatenate((valid_labelled_pos_data_comments, valid_labelled_neg_data_comments),
                                     axis=0)
validation_labels = np.append(np.zeros(valid_labelled_pos_data_comments.shape[0]), 
                                       np.ones(valid_labelled_neg_data_comments.shape[0]))

test_comments = np.concatenate((test_labelled_pos_data_comments, test_labelled_neg_data_comments),
                               axis=0)
test_labels = np.append(np.zeros(test_labelled_pos_data_comments.shape[0]), 
                                       np.ones(test_labelled_neg_data_comments.shape[0]))
    

train_labelled_comments, train_labelled_labels = shuffle_comments_labels(train_labelled_comments, 
                                                                         train_labelled_labels)

print("=" * 50)
print("train_labelled_images shape:", train_labelled_comments.shape)
print("train_labelled_labels shape:", train_labelled_labels.shape)
print()
print("train_unlabelled_images shape:", train_unlabelled_comments.shape)
print("train_unlabelled_labels shape:", train_unlabelled_labels.shape)
print()
print("validation_images shape:", validation_comments.shape)
print("validation_labels shape:", validation_labels.shape)
print()
print("test_images shape:", test_comments.shape)
print("test_labels shape:", test_labels.shape)
print("=" * 50)

print("Dumping pickles")
    
data_dir = "data/"

dump_pickle(data_dir + "train_labelled_images.p", train_labelled_comments)
dump_pickle(data_dir + "train_labelled_labels.p", train_labelled_labels)
dump_pickle(data_dir + "train_unlabelled_images.p", train_unlabelled_comments)
dump_pickle(data_dir + "train_unlabelled_labels.p", train_unlabelled_labels)
dump_pickle(data_dir + "validation_images.p", validation_comments)
dump_pickle(data_dir + "validation_labels.p", validation_labels)
dump_pickle(data_dir + "test_images.p", test_comments)
dump_pickle(data_dir + "test_labels.p", test_labels)

print("Danmaku dataset successfully created")

train_labelled_images shape: (1482, 200)
train_labelled_labels shape: (1482,)

train_unlabelled_images shape: (513552, 200)
train_unlabelled_labels shape: (513552,)

validation_images shape: (372, 200)
validation_labels shape: (372,)

test_images shape: (464, 200)
test_labels shape: (464,)
Dumping pickles
Danmaku dataset successfully created


导出人工验证集的数据

In [16]:
def dump_pickle(filepath, d):
    with open(filepath, "wb") as f:
        pickle.dump(d, f)

manual_vaild_comments = manual_valid_df.values

data_dir = "data/"

dump_pickle(data_dir + "manual_vaild_images.p", manual_vaild_comments)

人工验证文件生成

In [6]:
import pandas as pd
import numpy as np
import csv

danmakus_complete = pd.read_csv("./data/danmaku_complete.csv", delimiter="\t", encoding="utf-8", 
                                quoting=csv.QUOTE_NONE, low_memory=False)
manual_select = pd.read_csv("./validation/manual_batch_1.csv", delimiter="\t", encoding="utf-8")
predicts = np.loadtxt("./validation/epoch_predicts_batch_1.txt")

In [10]:
tsc_raw_ids = manual_select["tsc_raw_id"]
criterion = lambda row: row["tsc_raw_id"] in tsc_raw_ids.tolist()
manual_select_danmaku = danmakus_complete[danmakus_complete.apply(criterion, axis=1)]

(93, 7)

In [27]:
raw_content = dict()
for index, row in manual_select.iterrows():
    raw_content[index] = manual_select_danmaku[manual_select_danmaku['tsc_raw_id']==row["tsc_raw_id"]]["content"].iloc[0]

manual_select["raw_content"] = pd.Series(raw_content, dtype=str)

manual_select["predict"] = pd.Series(predicts, dtype=int)
manual_select["label"] = pd.Series(np.zeros(predicts.shape), dtype=int)

Unnamed: 0,season_id,episode_id,tsc_raw_id,playback_time,sender_id,content,post_time,raw_content,predict,label
0,21466,173257,4387479890,564.773,1f8b3327,"[u'\u7f8e\u98df', u'\u65e2\u89c6', u'\u611f']",1521036681,美食番既视感,1,0
1,21466,173252,4297024965,42.161,48645d69,"[u'\u7279\u5730', u'\u4eba', u'\u770b', u'\u56...",1518573606,特地一个人看，嘿嘿嘿,1,0
2,21466,173257,4386311222,279.912,6c3b8a3c,"[u'\u6f58\u591a\u62c9', u'\u4e16\u4e0a', u'\u5...",1520992321,潘多拉：世上只有安兹好～有爹的孩子像个宝～,1,0
3,21466,173249,4186544632,622.569,564cb566,"[u'\u54c7', u'\u597d', u'\u6ed1\u7a3d']",1516115897,哇 好美（滑稽,1,0
4,21466,173255,4361498712,463.727,3b196ed9,"[u'\u9aa8\u738b', u'\u524d', u'\u73a9\u73a9', ...",1520063668,骨王：前几个就是玩玩，森林贤王这个真的就不用说了,1,0
5,21466,173256,4387683025,1263.190,a9408a2a,"[u'\u8c8c\u4f3c', u'\u634f', u'\u5b8c\u8138', ...",1521041299,貌似捏完脸还真没区别,1,0
6,21466,173254,4326890533,920.851,d9a95d0f,"[u'\u53bb\u6c42', u'\u83ab', u'\u83ab\u5927', ...",1519192292,去求莫莫大人啊！,1,0
7,21466,173251,4244282729,1275.040,547c1d99,"[u'\u5b9e\u529b', u'\u5632\u8bbd']",1517481362,实力嘲讽2333,1,0
8,21466,173259,4418183408,128.400,9cd20819,[u'\u5934\u76d6\u9aa8'],1522211367,头盖骨,0,0
9,21466,173257,4388311108,750.826,5d4de5d3,"[u'\u96c6\u753b', u'\u5d29', u'\u5440']",1521087663,这集画崩了呀,1,0


In [28]:
manual_select.to_csv("manual_check_batch_1.csv", index=False, sep="\t")