In [1]:
import os
import re
import json
import jieba
import numpy as np
from datetime import datetime

import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
stop_words = [word.strip() for word in open('stop_word.txt', 'r').readlines()]
stop_words += [' ']

In [3]:
def str2time(s):
    ans = None
    if s.startswith('今天'):
        ans = datetime.today()
        t = datetime.strptime(s.split(' ')[1], '%H:%M')
        ans = ans.replace(hour = t.hour, minute = t.minute)
        return ans
    try:
        ans = datetime.strptime(s, '%m月%d日 %H:%M')
        ans = ans.replace(year = datetime.today().year)
        return ans
    except:
        try:
            ans = datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
            return ans
        except:
            return None

In [4]:
def get_features(user):
    vector = {
        'reputation'            : 0,
        'mention'               : 0,
        'hashtag'               : 0,
        'url'                   : 0,
        'text_similarity'       : 0,
        'time_interval_mean'    : 0,
        'time_interval_var'     : 0,
        'post_num'              : 0,
        'active_day_ratio'      : 0,
        'follow_ratio'          : 0,
        'figure_RRT'            : 0,
        'average_comm'          : 0,
        'followee_num'          : 0,
        'figure_at_every'       : 0,
        'day_interval_variance' : 0,
        'late_night_times'      : 0,
        'figure_at_sigle'       : 0,
        'figure_at'             : 0,
        
        'followers_num'         : 0, # new added
        'following_num'         : 0,
        'content_length'        : 0
    }
    
    # 声望：关注他的人与他关注的人之比
    vector['reputation'] = float(user['followers_num']) / float(user['following_num'])
    vector['followers_num'] = float(user['followers_num'])
    vector['following_num'] = float(user['following_num'])
    
    # '@'次数和'#'次数：
    vector['mention'] = 0
    vector['hashtag'] = 0
    vector['url']     = 0
    posts   = []
    time_intervals    = []
    prev_time         = None
    post_lens = []
    
    for post in user['weibo']:
        if '@' in post['content']:
            vector['mention'] += 1
        if '#' in post['content']:
            vector['hashtag'] += 1
        if 'http://' in post['content']:
            vector['url'] += 1
        words = list(jieba.cut(post['content']))
        words = [word for word in words if word not in stop_words]
        posts.append(' '.join(words))
        
        curr_time = str2time(' '.join(post['time'].split()[:2]))
        if curr_time == None:
            continue
        if prev_time != None:
            time_intervals.append((prev_time - curr_time).seconds)
        prev_time = curr_time
        
        post_lens.append(len(post['content']))
    
    
    if len(time_intervals) > 0:
        vector['time_interval_mean'] = np.mean(time_intervals)
        vector['time_interval_var'] = np.var(time_intervals)
        vector['content_length'] = np.mean(post_lens)
    
    vector['post_num'] = int(user['weibos_num'])
    

    
    # posts之间的文本相似度
    try:
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(posts))
        matrix = (tfidf * tfidf.T).A
        vector['test_similarity'] = np.mean(matrix)
    except:
        print(user)
    
    return vector

In [19]:
def get_label(text, id):
    p = re.compile('%s.+?(yes|no)' % id)
    return re.findall(p, text)[0]
    

path = './spider/weibo/'
files = os.listdir(path)
text = open('./data/spammer_order.csv', 'r').read()

vectors = []
for file in files:
    user = json.load(open(os.path.join(path, file), 'r'))
    if len(user['weibo']) <= 0 or int(user['weibos_num']) <= 0:
        continue
    id = file.split('.')[0]
    label = get_label(text, id)
    if id == '1005383941':
        print(get_features(user))
#         print(list(get_features(user).values()))
    elif id == '1006765334':
        print(get_features(user))
    else:
        continue
    vector = [id] + list(get_features(user).values()) + [label]    
    vectors.append(vector)

{'content_length': 96.0625, 'hashtag': 0, 'late_night_times': 0, 'figure_at': 0, 'reputation': 0.13184584178498987, 'followee_num': 0, 'followers_num': 65.0, 'figure_at_every': 0, 'following_num': 493.0, 'figure_at_sigle': 0, 'active_day_ratio': 0, 'url': 3, 'time_interval_mean': 5383.9787234042551, 'test_similarity': 0.029655969427463253, 'figure_RRT': 0, 'follow_ratio': 0, 'day_interval_variance': 0, 'text_similarity': 0, 'time_interval_var': 163474077.04210049, 'average_comm': 0, 'post_num': 536, 'mention': 2}
{'content_length': 102.14, 'hashtag': 4, 'late_night_times': 0, 'figure_at': 0, 'reputation': 0.5829787234042553, 'followee_num': 0, 'followers_num': 137.0, 'figure_at_every': 0, 'following_num': 235.0, 'figure_at_sigle': 0, 'active_day_ratio': 0, 'url': 6, 'time_interval_mean': 36113.897959183676, 'test_similarity': 0.036212120364095941, 'figure_RRT': 0, 'follow_ratio': 0, 'day_interval_variance': 0, 'text_similarity': 0, 'time_interval_var': 810268093.11203671, 'average_comm

In [14]:
for v in vectors:
    print(v)

['1214054254', 60.479999999999997, 15, 0, 0, 0.5, 0, 179.0, 0, 358.0, 0, 0, 1, 30972.857142857141, 0.034592289736287828, 0, 0, 0, 0, 823993452.04081619, 0, 1005, 7, 'no']
['2158838150', 48.510204081632651, 10, 0, 0, 1.7333333333333334, 0, 312.0, 0, 180.0, 0, 0, 0, 33023.0, 0.026581635586285113, 0, 0, 0, 0, 749124164.25, 0, 1819, 3, 'no']
['1465038162', 59.979591836734691, 18, 0, 0, 0.9846153846153847, 0, 704.0, 0, 715.0, 0, 0, 2, 36453.708333333336, 0.027306004520238628, 0, 0, 0, 0, 622922283.78993046, 0, 3542, 3, 'no']
['5203407567', 102.66, 27, 0, 0, 0.012919896640826873, 0, 5.0, 0, 387.0, 0, 0, 4, 11511.34693877551, 0.031989037252186728, 0, 0, 0, 0, 710274808.75718451, 0, 340, 19, 'no']
['3944654540', 54.918367346938773, 7, 0, 0, 0.2043956043956044, 0, 93.0, 0, 455.0, 0, 0, 38, 26378.770833333332, 0.055029804700320109, 0, 0, 0, 0, 799514412.17664945, 0, 371, 3, 'yes']
['5055635073', 50.799999999999997, 0, 0, 0, 0.13675675675675675, 0, 253.0, 0, 1850.0, 0, 0, 0, 37345.510204081635, 0

In [15]:
with open('./feature.json', 'w') as myfile:
    json.dump(vectors, myfile)