In [60]:
import os
import re
import json
import jieba
import numpy as np
from datetime import datetime

import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [61]:
stop_words = [word.strip() for word in open('stop_word.txt', 'r').readlines()]
stop_words += [' ']

In [62]:
def str2time(s):
    ans = None
    if s.startswith('今天'):
        ans = datetime.today()
        t = datetime.strptime(s.split(' ')[1], '%H:%M')
        ans = ans.replace(hour = t.hour, minute = t.minute)
        return ans
    try:
        ans = datetime.strptime(s, '%m月%d日 %H:%M')
        ans = ans.replace(year = datetime.today().year)
        return ans
    except:
        try:
            ans = datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
            return ans
        except:
            return None

In [63]:
def get_features(user):
    vector = {
        'reputation'            : 0,
        'mention'               : 0,
        'hashtag'               : 0,
        'url'                   : 0,
        'text_similarity'       : 0,
        'time_interval_mean'    : 0,
        'time_interval_var'     : 0,
        'post_num'              : 0,
        'active_day_ratio'      : 0,
        'follow_ratio'          : 0,
        'figure_RRT'            : 0,
        'average_comm'          : 0,
        'followee_num'          : 0,
        'figure_at_every'       : 0,
        'day_interval_variance' : 0,
        'late_night_times'      : 0,
        'figure_at_sigle'       : 0,
        'figure_at'             : 0
    }
    
    # 声望：关注他的人与他关注的人之比
    vector['reputation'] = float(user['followers_num']) / float(user['following_num'])
    
    
    # '@'次数和'#'次数：
    vector['mention'] = 0
    vector['hashtag'] = 0
    vector['url']     = 0
    posts   = []
    time_intervals    = []
    prev_time         = None
    
    
    for post in user['weibo']:
        if '@' in post['content']:
            vector['mention'] += 1
        if '#' in post['content']:
            vector['hashtag'] += 1
        if 'http://' in post['content']:
            vector['url'] += 1
        words = list(jieba.cut(post['content']))
        words = [word for word in words if word not in stop_words]
        posts.append(' '.join(words))
        
        curr_time = str2time(' '.join(post['time'].split()[:2]))
        if curr_time == None:
            continue
        if prev_time != None:
            time_intervals.append((prev_time - curr_time).seconds)
        prev_time = curr_time
    
    
    if len(time_intervals) > 0:
        vector['time_interval_mean'] = np.mean(time_intervals)
        vector['time_interval_var'] = np.var(time_intervals)
    
    vector['post_num'] = int(user['weibos_num'])
    

    
    # posts之间的文本相似度
    try:
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(posts))
        matrix = (tfidf * tfidf.T).A
        vector['test_similarity'] = np.mean(matrix)
    except:
        print(user)
    
    return vector

In [67]:
def get_label(text, id):
    p = re.compile('%s.+?(yes|no)' % id)
    return re.findall(p, text)[0]
    

path = './spider/weibo/'
files = os.listdir(path)
text = open('./data/spammer_order.csv', 'r').read()

vectors = []
for file in files:
    user = json.load(open(os.path.join(path, file), 'r'))
    if len(user['weibo']) <= 0 or int(user['weibos_num']) <= 0:
        continue
    id = file.split('.')[0]
    label = get_label(text, id)
    vector = [id] + list(get_features(user).values()) + [label]
    vectors.append(vector)

In [68]:
for v in vectors:
    print(v)

['1005383941', 0.13184584178498987, 2, 0, 3, 0, 5383.9787234042551, 163474077.04210049, 536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.029655969427463253, 'yes']
['1006765334', 0.5829787234042553, 10, 4, 6, 0, 36113.897959183676, 810268093.11203671, 78, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.036212120364095941, 'yes']
['1014445680', 1.5664335664335665, 8, 27, 3, 0, 31602.061224489797, 931927875.77176189, 998, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.1430052813177469, 'yes']
['1079876540', 0.8280442804428044, 17, 30, 30, 0, 34824.714285714283, 1016516433.0136056, 306, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.086992500824162364, 'yes']
['1094291897', 1.1232876712328768, 11, 0, 26, 0, 24635.632653061224, 665195122.35485208, 416, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.046320327780919067, 'yes']
['1106010182', 1.768421052631579, 1, 1, 0, 0, 5547.0, 42221882.0, 3459, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.031061646936799633, 'yes']
['1150512354', 0.08914525432616675, 0, 14, 4, 0, 37801.276595744683, 612165305.81711185, 171, 0, 0, 0, 0, 0, 

['2838737720', 1.0373831775700935, 10, 10, 14, 0, 41978.448979591834, 881347079.79841721, 270, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.38681607374946386, 'yes']
['2847217620', 0.6036585365853658, 11, 13, 5, 0, 27593.204081632652, 965559588.61141193, 231, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.038536015414105979, 'yes']
['2848629285', 1.025974025974026, 8, 12, 10, 0, 29789.666666666668, 660547337.93055546, 862, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.030948705957407993, 'yes']
['2852143935', 0.5838926174496645, 19, 29, 15, 0, 36982.244897959186, 968455889.7359432, 656, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.044794401445004406, 'yes']
['2859266014', 3.997371879106439, 5, 2, 7, 0, 31524.387755102041, 558771531.66597247, 1122, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.046854202090115656, 'yes']
['2867958704', 0.2480974124809741, 0, 1, 0, 0, 37613.48936170213, 583600518.42009962, 1683, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.023619474297455818, 'yes']
['2868206787', 0.028042063094641963, 12, 22, 4, 0, 22674.659090909092, 449754909.542871

In [69]:
with open('./feature.json', 'w') as myfile:
    json.dump(vectors, myfile)