In [1]:
import os
import re
import json
import jieba
import numpy as np
from datetime import datetime

import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
stop_words = [word.strip() for word in open('stop_word.txt', 'r').readlines()]
stop_words += [' ']

In [3]:
def str2time(s):
    ans = None
    if s.startswith('今天'):
        ans = datetime.today()
        t = datetime.strptime(s.split(' ')[1], '%H:%M')
        ans = ans.replace(hour = t.hour, minute = t.minute)
        return ans
    try:
        ans = datetime.strptime(s, '%m月%d日 %H:%M')
        ans = ans.replace(year = datetime.today().year)
        return ans
    except:
        try:
            ans = datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
            return ans
        except:
            return None

In [4]:
def get_features(user):
    vector = {
        'reputation'            : 0,
        'mention'               : 0,
        'hashtag'               : 0,
        'url'                   : 0,
        'text_similarity'       : 0,
        'time_interval_mean'    : 0,
        'time_interval_var'     : 0,
        'post_num'              : 0,
        'active_day_ratio'      : 0,
        'follow_ratio'          : 0,
        'figure_RRT'            : 0,
        'average_comm'          : 0,
        'followee_num'          : 0,
        'figure_at_every'       : 0,
        'day_interval_variance' : 0,
        'late_night_times'      : 0,
        'figure_at_sigle'       : 0,
        'figure_at'             : 0
    }
    
    # 声望：关注他的人与他关注的人之比
    vector['reputation'] = float(user['followers_num']) / float(user['following_num'])
    
    
    # '@'次数和'#'次数：
    vector['mention'] = 0
    vector['hashtag'] = 0
    vector['url']     = 0
    posts   = []
    time_intervals    = []
    prev_time         = None
    
    
    for post in user['weibo']:
        if '@' in post['content']:
            vector['mention'] += 1
        if '#' in post['content']:
            vector['hashtag'] += 1
        if 'http://' in post['content']:
            vector['url'] += 1
        words = list(jieba.cut(post['content']))
        words = [word for word in words if word not in stop_words]
        posts.append(' '.join(words))
        
        curr_time = str2time(' '.join(post['time'].split()[:2]))
        if curr_time == None:
            continue
        if prev_time != None:
            time_intervals.append((prev_time - curr_time).seconds)
        prev_time = curr_time
    
    
    if len(time_intervals) > 0:
        vector['time_interval_mean'] = np.mean(time_intervals)
        vector['time_interval_var'] = np.var(time_intervals)
    
    vector['post_num'] = int(user['weibos_num'])
    

    
    # posts之间的文本相似度
    try:
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(posts))
        matrix = (tfidf * tfidf.T).A
        vector['test_similarity'] = np.mean(matrix)
    except:
        print(user)
    
    return vector

In [9]:
def get_label(text, id):
    p = re.compile('%s.+?(yes|no)' % id)
    return re.findall(p, text)[0]
    

path = './spider/weibo/'
files = os.listdir(path)
text = open('./data/spammer_order.csv', 'r').read()

vectors = []
for file in files:
    user = json.load(open(os.path.join(path, file), 'r'))
    if len(user['weibo']) <= 0 or int(user['weibos_num']) <= 0:
        continue
    id = file.split('.')[0]
    if (id=='1214054254'):
        print(list(get_features(user).values()))
        print(get_features(user))
    else:
        continue
    label = get_label(text, id)
    vector = [id] + list(get_features(user).values()) + [label]
    vectors.append(vector)

[0, 0, 0, 15, 0.5, 0, 0, 0, 1005, 0, 0, 7, 0, 0, 0.034592289736287828, 0, 1, 30972.857142857141, 823993452.04081619]
{'figure_RRT': 0, 'average_comm': 0, 'figure_at_sigle': 0, 'hashtag': 15, 'reputation': 0.5, 'late_night_times': 0, 'followee_num': 0, 'follow_ratio': 0, 'post_num': 1005, 'figure_at': 0, 'active_day_ratio': 0, 'mention': 7, 'day_interval_variance': 0, 'figure_at_every': 0, 'test_similarity': 0.034592289736287828, 'text_similarity': 0, 'url': 1, 'time_interval_mean': 30972.857142857141, 'time_interval_var': 823993452.04081619}


In [6]:
for v in vectors:
    print(v)

['1214054254', 0, 0, 0, 15, 0.5, 0, 0, 0, 1005, 0, 0, 7, 0, 0, 0.034592289736287828, 0, 1, 30972.857142857141, 823993452.04081619, 'no']
['2158838150', 0, 0, 0, 10, 1.7333333333333334, 0, 0, 0, 1819, 0, 0, 3, 0, 0, 0.026581635586285113, 0, 0, 33023.0, 749124164.25, 'no']
['1465038162', 0, 0, 0, 18, 0.9846153846153847, 0, 0, 0, 3542, 0, 0, 3, 0, 0, 0.027306004520238628, 0, 2, 36454.25, 622968736.0625, 'no']
['5203407567', 0, 0, 0, 27, 0.012919896640826873, 0, 0, 0, 340, 0, 0, 19, 0, 0, 0.031989037252186728, 0, 4, 11511.34693877551, 710274808.75718451, 'no']
['3944654540', 0, 0, 0, 7, 0.2043956043956044, 0, 0, 0, 371, 0, 0, 3, 0, 0, 0.055029804700320109, 0, 38, 26378.770833333332, 799514412.17664945, 'yes']
['5055635073', 0, 0, 0, 0, 0.13675675675675675, 0, 0, 0, 467, 0, 0, 1, 0, 0, 0.030800458185710273, 0, 0, 37345.510204081635, 116925099.92336527, 'yes']
['5037076803', 0, 0, 0, 6, 0.49166666666666664, 0, 0, 0, 620, 0, 0, 2, 0, 0, 0.051178157824589239, 0, 2, 18005.39534883721, 107260266

In [7]:
with open('./feature.json', 'w') as myfile:
    json.dump(vectors, myfile)