In [1]:
import os
import re
import json
import jieba
import numpy as np
from datetime import datetime

import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
stop_words = [word.strip() for word in open('stop_word.txt', 'r').readlines()]
stop_words += [' ']

In [3]:
def str2time(s):
    ans = None
    if s.startswith('今天'):
        ans = datetime.today()
        t = datetime.strptime(s.split(' ')[1], '%H:%M')
        ans = ans.replace(hour = t.hour, minute = t.minute)
        return ans
    try:
        ans = datetime.strptime(s, '%m月%d日 %H:%M')
        ans = ans.replace(year = datetime.today().year)
        return ans
    except:
        try:
            ans = datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
            return ans
        except:
            return None

In [4]:
def get_features(user):
    vector = {
        'reputation'            : 0,
        'mention'               : 0,
        'hashtag'               : 0,
        'url'                   : 0,
        'text_similarity'       : 0,
        'time_interval_mean'    : 0,
        'time_interval_var'     : 0,
        'post_num'              : 0,
#         'active_day_ratio'      : 0,
#         'follow_ratio'          : 0,
#         'figure_RRT'            : 0,
#         'average_comm'          : 0,
#         'followee_num'          : 0,
#         'figure_at_every'       : 0,
#         'day_interval_variance' : 0,
#         'late_night_times'      : 0,
#         'figure_at_sigle'       : 0,
#         'figure_at'             : 0,
        
        'followers_num'         : 0, # new added
        'following_num'         : 0,
        'content_length'        : 0
    }
    
    # 声望：关注他的人与他关注的人之比
    vector['reputation'] = float(user['followers_num']) / float(user['following_num'])
    vector['followers_num'] = float(user['followers_num'])
    vector['following_num'] = float(user['following_num'])
    
    # '@'次数和'#'次数：
    vector['mention'] = 0
    vector['hashtag'] = 0
    vector['url']     = 0
    posts   = []
    time_intervals    = []
    prev_time         = None
    post_lens = []
    
    for post in user['weibo']:
        if '@' in post['content']:
            vector['mention'] += 1
        if '#' in post['content']:
            vector['hashtag'] += 1
        if 'http://' in post['content']:
            vector['url'] += 1
        words = list(jieba.cut(post['content']))
        words = [word for word in words if word not in stop_words]
        posts.append(' '.join(words))
        
        curr_time = str2time(' '.join(post['time'].split()[:2]))
        if curr_time == None:
            continue
        if prev_time != None:
            time_intervals.append((prev_time - curr_time).seconds)
        prev_time = curr_time
        
        post_lens.append(len(post['content']))
    
    
    if len(time_intervals) > 0:
        vector['time_interval_mean'] = np.mean(time_intervals)
        vector['time_interval_var'] = np.var(time_intervals)
        vector['content_length'] = np.mean(post_lens)
    
    vector['post_num'] = int(user['weibos_num'])
    

    
    # posts之间的文本相似度
    try:
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(posts))
        matrix = (tfidf * tfidf.T).A
        vector['text_similarity'] = np.mean(matrix)
    except:
        print(user)
    
    return vector

In [5]:
def get_label(text, id):
    p = re.compile('%s.+?(yes|no)' % id)
    return re.findall(p, text)[0]
    

path = './spider/weibo/'
files = os.listdir(path)
text = open('./data/spammer_order.csv', 'r').read()

vectors = []
for file in files:
    user = json.load(open(os.path.join(path, file), 'r'))
    if len(user['weibo']) <= 0 or int(user['weibos_num']) <= 0:
        continue
    id = file.split('.')[0]
    label = get_label(text, id)
#     if id == '1005383941':
#         print(get_features(user))
# #         print(list(get_features(user).values()))
#     elif id == '1006765334':
#         print(get_features(user))
#     else:
#         continue
    vector = get_features(user)
    vector['id'] = id
    vector['is_spammer'] = label
    vectors.append(vector)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.116 seconds.
Prefix dict has been built succesfully.


In [6]:
for v in vectors:
    print(v)

{'reputation': 0.5, 'text_similarity': 0.034592289736287828, 'time_interval_var': 823993452.04081619, 'following_num': 358.0, 'content_length': 60.479999999999997, 'id': '1214054254', 'post_num': 1005, 'mention': 7, 'url': 1, 'time_interval_mean': 30972.857142857141, 'hashtag': 15, 'is_spammer': 'no', 'followers_num': 179.0}
{'reputation': 1.7333333333333334, 'text_similarity': 0.026581635586285113, 'time_interval_var': 749124164.25, 'following_num': 180.0, 'content_length': 48.510204081632651, 'id': '2158838150', 'post_num': 1819, 'mention': 3, 'url': 0, 'time_interval_mean': 33023.0, 'hashtag': 10, 'is_spammer': 'no', 'followers_num': 312.0}
{'reputation': 0.9846153846153847, 'text_similarity': 0.027306004520238628, 'time_interval_var': 622949079.81206596, 'following_num': 715.0, 'content_length': 59.979591836734691, 'id': '1465038162', 'post_num': 3542, 'mention': 3, 'url': 2, 'time_interval_mean': 36454.020833333336, 'hashtag': 18, 'is_spammer': 'no', 'followers_num': 704.0}
{'repu

In [7]:
with open('./feature.json', 'w') as myfile:
    json.dump(vectors, myfile)