In [1]:
import os
import re
import json
import jieba
import numpy as np
from datetime import datetime

import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
stop_words = [word.strip() for word in open('stop_word.txt', 'r').readlines()]
stop_words += [' ']

In [3]:
def str2time(s):
    ans = None
    if s.startswith('今天'):
        ans = datetime.today()
        t = datetime.strptime(s.split(' ')[1], '%H:%M')
        ans = ans.replace(hour = t.hour, minute = t.minute)
        return ans
    try:
        ans = datetime.strptime(s, '%m月%d日 %H:%M')
        ans = ans.replace(year = datetime.today().year)
        return ans
    except:
        try:
            ans = datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
            return ans
        except:
            return None

In [13]:
def get_features(user):
    vector = {
        'reputation'            : 0,
        'mention'               : 0,
        'hashtag'               : 0,
        'url'                   : 0,
        'text_similarity'       : 0,
        'time_interval_mean'    : 0,
        'time_interval_var'     : 0,
        'post_num'              : 0,

        'active_day_ratio'      : 0,
        'active_day_num'        : 0,
#         'follow_ratio'          : 0,
#         'figure_RRT'            : 0,
        'comment_num'          : 0,
#         'figure_at_every'       : 0,
#         'late_night_times'      : 0,
#         'figure_at_sigle'       : 0,
#         'figure_at'             : 0,
        
        'followers_num'         : 0, # new added
        'following_num'         : 0,
        'content_length'        : 0,
        
        'upvote_num'            : 0,
        'forwarded_num'           : 0,
        'forward_weibo_num'     : 0
    }
    
    # 声望：关注他的人与他关注的人之比
    vector['reputation'] = float(user['followers_num']) / float(user['following_num'])
    vector['followers_num'] = float(user['followers_num'])
    vector['following_num'] = float(user['following_num'])
    
    # '@'次数和'#'次数：
    vector['mention'] = 0
    vector['hashtag'] = 0
    vector['url']     = 0
    posts   = []
    time_intervals    = []
    prev_time         = None
    post_lens = []
    comment_num = 0
    upvote_num = 0
    forwarded_num = 0
    forward_weibo_num = 0
    
    days_set = set()
    
    for post in user['weibo']:
        if '@' in post['content']:
            vector['mention'] += 1
        if '#' in post['content']:
            vector['hashtag'] += 1
        if 'http://' in post['content']:
            vector['url'] += 1
        words = list(jieba.cut(post['content']))
        words = [word for word in words if word not in stop_words]
        posts.append(' '.join(words))
        
        curr_time = str2time(' '.join(post['time'].split()[:2]))
        if curr_time == None:
            continue
        if prev_time != None:
            time_intervals.append((prev_time - curr_time).seconds)
            days_set.add((prev_time - curr_time).days)
            
        prev_time = curr_time
        
        post_lens.append(len(post['content']))
        comment_num += int(post['comment'])
        upvote_num += int(post['upvote'])
        forwarded_num += int(post['forward'])
        forward_weibo_num += int(post['forward_flag'])
        
    if len(time_intervals) > 0:
        vector['time_interval_mean'] = np.mean(time_intervals)
        vector['time_interval_var'] = np.var(time_intervals)
    
    vector['content_length'] = np.mean(post_lens)
    
    vector['post_num'] = int(user['weibos_num'])
    vector['comment_num'] = comment_num
    vector['active_day_num'] = len(days_set)
    if max(days_set) == 0:
        print(user)
        vector['active_day_ratio'] = 0
    else:
        vector['active_day_ratio'] = 1.0*len(days_set)/max(days_set)
    vector['upvote_num'] = upvote_num
    vector['forwarded_num'] = forwarded_num
    vector['forward_weibo_num'] = forward_weibo_num
    
    # posts之间的文本相似度
    try:
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(posts))
        matrix = (tfidf * tfidf.T).A
        vector['text_similarity'] = np.mean(matrix)
    except:
        print(user)
    
    return vector

In [14]:
def get_label(text, id):
    p = re.compile('%s.+?(yes|no)' % id)
    return re.findall(p, text)[0]
    

path = './spider/weibo/'
files = os.listdir(path)
text = open('./data/spammer_order.csv', 'r').read()

vectors = []
for file in files:
    user = json.load(open(os.path.join(path, file), 'r'))
    if len(user['weibo']) <= 0 or int(user['weibos_num']) <= 0:
        continue
    id = file.split('.')[0]
    label = get_label(text, id)
#     if id == '1005383941':
#         print(get_features(user))
# #         print(list(get_features(user).values()))
#     elif id == '1006765334':
#         print(get_features(user))
#     else:
#         continue
    vector = get_features(user)
    vector['id'] = id
    vector['is_spammer'] = label
    vectors.append(vector)

{'id': '5035212843', 'weibos_num': '6933', 'weibo': [{'forward': '0', 'forward_flag': 0, 'comment': '0', 'content': '#奔驰世家 利星大连#晚安！I am a slow walker , but I never walk backwards.我走得很慢，但是我从不倒退。 ', 'time': '05月17日 21:34\xa0来自皮皮时光机', 'upvote': '0'}, {'forward': '0', 'forward_flag': 0, 'comment': '0', 'content': '【人生没有草稿，写完今天的这张就不可能再有同样的另一张】平常的日子总会被我们当做不值钱的"废纸"，涂抹坏了也不心疼，总以为来日方长，"纸张还有很多"，实际上，生活不会给我们打草稿的时间和机会，每一笔下去都无法再涂改，我们每天写下的"草稿"，都会成为人生无法更改的答卷。 ', 'time': '05月17日 20:39\xa0来自皮皮时光机', 'upvote': '0'}, {'forward': '0', 'forward_flag': 0, 'comment': '0', 'content': '【减肥食谱五】早餐：一个蒸糯玉米，1个荷包蛋，一杯牛奶。中餐：西红柿牛肉面(面只吃一半)，凉拌海带胡萝卜丝。晚餐：豆苗鱼丸汤，素炒丝瓜，烤甘薯1块。 ', 'time': '05月17日 20:19\xa0来自皮皮时光机', 'upvote': '0'}, {'forward': '0', 'forward_flag': 0, 'comment': '0', 'content': '【9种日常食物在养生上的效用】1.番茄延缓衰老；2.茶叶防辐射：睡眠不好胃肠差人慎用；3.姜减少色素沉淀；4.醋增强皮肤活力：过敏者低血压慎用；5.黄豆调理雌激素:不好消化；6、米饭防止肌肤干燥；7.菌类调节免疫系统；8.红酒提升气色；9.燕麦减少脂肪 。 ', 'time': '05月17日 19:09\xa0来自皮皮时光机', 'upvote': '0'}, {'forward': '0', 'forward_flag': 0, 'comment': '0', 'conten

In [15]:
for v in vectors:
    print(v)

{'forward_weibo_num': 38, 'post_num': 1005, 'is_spammer': 'no', 'following_num': 358.0, 'time_interval_var': 823993452.04081619, 'text_similarity': 0.034592289736287828, 'url': 1, 'active_day_ratio': 0.36585365853658536, 'content_length': 60.479999999999997, 'id': '1214054254', 'comment_num': 16, 'active_day_num': 15, 'hashtag': 15, 'upvote_num': 9, 'mention': 7, 'time_interval_mean': 30972.857142857141, 'followers_num': 179.0, 'reputation': 0.5, 'forwarded_num': 0}
{'forward_weibo_num': 49, 'post_num': 1819, 'is_spammer': 'no', 'following_num': 180.0, 'time_interval_var': 749124164.25, 'text_similarity': 0.026581635586285113, 'url': 0, 'active_day_ratio': 0.4230769230769231, 'content_length': 48.510204081632651, 'id': '2158838150', 'comment_num': 2, 'active_day_num': 11, 'hashtag': 10, 'upvote_num': 0, 'mention': 3, 'time_interval_mean': 33023.0, 'followers_num': 312.0, 'reputation': 1.7333333333333334, 'forwarded_num': 0}
{'forward_weibo_num': 40, 'post_num': 3542, 'is_spammer': 'no'

In [16]:
with open('./feature.json', 'w') as myfile:
    json.dump(vectors, myfile)