In [1]:
import json
import pymongo

In [2]:
import jieba
import jieba.analyse

In [3]:
import math
import numpy as np
from collections import OrderedDict

In [4]:
# Load the fan-page list from 'fid_list_4.json' into the notebook-global
# `fp_file`. Brand.get_certificate_status iterates over it and reads the
# 'fid' and 'fb_cert' keys of each entry.
with open('fid_list_4.json', 'r') as readfile:
    fp_file = json.load(readfile)

In [5]:
# Open a client against the MongoDB server on localhost:27017 and select the
# database that the Brand class queries through the notebook-global `db`.
client = pymongo.MongoClient(host="localhost", port=27017)
db = client["fanpage_research"]
# Last expression of the cell: echoes the selected database name as output.
db.name

u'fanpage_research'

In [6]:
# Configure jieba before any tokenisation happens: use the "dict.txt.big"
# dictionary file and register the stop-word file with jieba.analyse.
# NOTE(review): the stop words are registered on jieba.analyse, while
# TextMining tokenises with jieba.cut -- presumably they do not filter the
# tokens produced by jieba.cut; confirm this is intended.
jieba.set_dictionary("dict.txt.big")
jieba.analyse.set_stop_words("data/household_stop_words.txt")

In [7]:
class TextMining:
    """
    Keyword-count TF-IDF featurizer for post messages.

    The keyword dictionary is a JSON object mapping a typology name to a list
    of keywords; it is loaded in file order, and every vector produced by this
    class has one entry per typology in that order. The IDF weight of each
    typology is computed once, at construction time, from ``all_post_list``.

    :param keyword_dict_path: path of the JSON keyword dictionary
    :param all_post_list: iterable of message strings used as the IDF corpus
    """

    def __init__(self,keyword_dict_path, all_post_list):
        self.keyword_dict_path = keyword_dict_path
        self.all_post_list = all_post_list
        self.keyword_dict = self.get_keyword_dict()
        self.idf_list = self.get_idf_list()

    def get_keyword_dict(self):
        """
        Load the keyword dictionary from ``self.keyword_dict_path``.

        :return: typology name -> list of keywords, in file order
        :rtype: OrderedDict
        """
        with open(self.keyword_dict_path,'r') as infile:
            # OrderedDict keeps the typology order of the file, which fixes
            # the position of each typology in the feature vectors.
            return json.load(infile, object_pairs_hook=OrderedDict)

    def make_tf_vector(self,msg,print_out=False):
        """
        Count keyword occurrences per typology in one message.

        The message is tokenised with ``jieba.cut``; a keyword only counts
        when it equals a whole token.

        :param msg: message text to tokenise
        :param print_out: when true, print the matched keywords per typology
        :return: one summed keyword count per typology
        :rtype: list
        """
        tf_vector = []
        jieba_list = list(jieba.cut(msg))
        for typology, keywords in self.keyword_dict.items():
            word_count_list = [jieba_list.count(word) for word in keywords]
            term_count = sum(word_count_list)
            tf_vector.append(term_count)
            if print_out:
                match_word = "".join(
                    word + ' count ' + str(word_count) + ','
                    for word, word_count in zip(keywords, word_count_list)
                    if word_count > 0)
                print("%s (term count :%d)  %s" % (typology, term_count, match_word))
        return tf_vector

    def get_idf_list(self,refresh=False):
        """
        Compute the IDF weight of every typology over ``self.all_post_list``.

        The weight is ``log(1 + N / df)``, where ``N`` is the number of posts
        and ``df`` the number of posts whose count for the typology is
        non-zero.

        :param refresh: unused; kept so existing callers keep working
        :return: one IDF weight per typology
        :rtype: list
        """
        tf_vector_list = [self.make_tf_vector(post) for post in self.all_post_list]
        if not tf_vector_list:
            # An empty corpus has no second axis to inspect; every weight is 0.
            return [0.0] * len(self.keyword_dict)
        feature_matrix = np.array(tf_vector_list)
        num_total_document = feature_matrix.shape[0]
        # Document frequency per typology: rows with a non-zero count.
        doc_freq_list = (feature_matrix != 0).sum(axis=0)
        # float() forces true division under Python 2 (int/int truncates).
        # A typology seen in no document is treated as df == 1 (the largest
        # finite weight) instead of dividing by zero.
        return [math.log1p(float(num_total_document) / max(int(doc_freq), 1))
                for doc_freq in doc_freq_list]

    def make_tfidf_vector(self,msg,print_out=False):
        """
        Build the TF-IDF vector of one message.

        :param msg: message text to tokenise
        :param print_out: forwarded to :meth:`make_tf_vector`
        :return: term count times IDF weight, one entry per typology
        :rtype: list
        """
        tf_list = self.make_tf_vector(msg,print_out)
        return [tf * idf for tf, idf in zip(tf_list, self.idf_list)]

In [8]:
def tfidf_predict(feature_vector,print_out=False,class_dict=None):
    """
    Pick the class whose feature indices carry the largest summed weight.

    :param feature_vector: indexable sequence of per-typology weights (the
        default mapping reads indices 0 to 13)
    :param print_out: when true, print the summed score of every class
    :param class_dict: optional mapping ``"class_<i>"`` -> list of feature
        indices for ``i`` in ``0 .. len(class_dict) - 1``; defaults to the
        four-class mapping below
    :return: index of the winning class; on equal scores the highest class
        index wins, so an all-zero vector yields the last class
    :rtype: int
    """
    if class_dict is None:
        class_dict = {
            "class_0" : [11,12],
            "class_1" : [3,4,5,6],
            "class_2" : [7,8,9,10,13],
            "class_3" : [0,1,2]}
    max_class_sum = float("-inf")
    max_class = None
    for i in range(len(class_dict)):
        class_sum = sum(feature_vector[idx] for idx in class_dict["class_"+str(i)])
        if print_out:
            print('class %s : %f' % (i,class_sum))
        # ">=" on purpose: a later class replaces an earlier one on a tie.
        if class_sum >= max_class_sum:
            max_class_sum = class_sum
            max_class = i
    return max_class

In [12]:
class Brand:
    """
    Snapshot of one fan page, assembled from MongoDB and the fan-page list.

    Construction runs every lookup in order: fan counts, posts, certificate
    status, per-post engagement counts, the posts with at least one like, and
    the classification of those posts. It relies on the notebook-level names
    ``db``, ``fp_file``, ``TextMining`` and ``tfidf_predict``.

    :param fid: fan page id used as the lookup key
    """

    def __init__(self,fid):
        self.fid = fid
        self.start_fan_count, self.stop_fan_count = self.get_fan_count()
        self.post_count,self.pid_list,self.msg_list = self.get_post_info()
        self.certificate = self.get_certificate_status()
        self.like_count_list, self.comment_count_list, self.share_count_list, self.tag_count_list = self.get_post_detail()
        self.valid_post_count, self.valid_pid_list, self.valid_msg_list = self.get_valid_post()
        self.promo_pid_list, self.engage_pid_list, self.product_pid_list, self.brand_pid_list = self.classify_post()

    def get_fan_count(self):
        """
        Get fan count from mongoDB.

        Reads the first and the last ``fanpage`` document returned for this
        fid. NOTE(review): the query has no sort, so "first"/"last" follow
        the order in which the server returns documents -- confirm that this
        is chronological.

        :return: fan count in the initial data, fan count in the last data;
            ``(None, None)`` when the lookup fails or either count is not
            positive
        :rtype: int, int
        """
        try:
            fan_count_list = list(db.fanpage.find({"fid":self.fid}))
            start_count = fan_count_list[0]['fan_count']
            stop_count = fan_count_list[-1]['fan_count']
            both_positive = start_count > 0 and stop_count > 0
        except Exception:
            print("fail to get fanpage information from mongoDB %s" % self.fid)
            # Always hand back a pair so the unpacking in __init__ works.
            return None, None
        if both_positive:
            return start_count, stop_count
        print("Get fan_count is zero, pls check")
        return None, None

    def get_post_info(self):
        """
        Get post count, pid list and message list from mongoDB, excluding
        posts with a "null" message.

        :return: count of total posts, all pid, all messages (aligned with
            the pid list); ``(0, [], [])`` when the lookup fails
        :rtype: int, list, list
        """
        pid_list = []
        msg_list = []
        try:
            cursor = db.post.aggregate([{"$match":{'fid':self.fid, "message":{"$ne":None}}}])
            for document in cursor:
                pid_list.append(document['pid'])
                msg_list.append(document['message'])
        except Exception:
            print("fail to get post information from mongoDB %s" % self.fid)
            # Drop partial results so pid and message lists stay aligned.
            return 0, [], []
        return len(pid_list), pid_list, msg_list

    def get_certificate_status(self):
        """
        Get certificate status from outside file :'fid_list_4.json'.

        :return: False when the entry's 'fb_cert' is 'NA', True otherwise;
            None when the fid is not listed or the file content is malformed
        :rtype: boolean
        """
        try:
            for fp in fp_file:
                if self.fid == fp['fid']:
                    return fp['fb_cert'] != 'NA'
        except Exception:
            print("fail to get certificate status from fid_list_4.json %s" % self.fid)
        return None

    def _get_max_counts(self, pid):
        """Return (like, comment, share, tag) maxima of one post; zeros on failure."""
        try:
            cursor = db.post_detail.aggregate([{"$match":{"pid":pid}},
                                               {"$group":{"_id":"$pid",
                                                          "maxLike":{"$max":"$like_count"},
                                                          "maxComment":{"$max":"$comment_count"},
                                                          "maxShare":{"$max":"$share_count"},
                                                          "maxTag":{"$max":"$friend_tagg_count"}}}])
            cursor_result = list(cursor)
            # IndexError here means the post has no detail documents at all.
            grouped = cursor_result[0]
            # "or 0" turns a missing (None) maximum into a comparable number.
            return tuple(grouped[key] or 0
                         for key in ('maxLike', 'maxComment', 'maxShare', 'maxTag'))
        except Exception:
            print("fail to get post detail from mongoDB %s" % pid)
            return 0, 0, 0, 0

    def get_post_detail(self):
        """
        Get post details from mongoDB.

        Every pid in ``self.pid_list`` contributes exactly one entry to each
        returned list (zeros when its lookup fails), so the lists stay
        index-aligned with ``self.pid_list`` and ``self.msg_list``.

        :return: like count list, comment count list, share count list, friend tagging count list
        :rtype: list, list, list, list
        """
        count_rows = [self._get_max_counts(pid) for pid in self.pid_list]
        like_count_list = [row[0] for row in count_rows]
        comment_count_list = [row[1] for row in count_rows]
        share_count_list = [row[2] for row in count_rows]
        tag_count_list = [row[3] for row in count_rows]
        return like_count_list,comment_count_list,share_count_list,tag_count_list

    def get_valid_post(self):
        """
        Keep the posts whose like count is greater than zero.

        :return: count of valid posts, their pids, their messages
        :rtype: int, list, list
        """
        valid_pairs = [(pid, msg)
                       for like_count, pid, msg
                       in zip(self.like_count_list,self.pid_list,self.msg_list)
                       if like_count > 0]
        valid_pid_list = [pid for pid, _ in valid_pairs]
        valid_msg_list = [msg for _, msg in valid_pairs]
        return len(valid_pid_list), valid_pid_list, valid_msg_list

    def classify_post(self, keyword_dict_path="data/keyword_all_2.json"):
        """
        Classify every valid post with TF-IDF keyword scores.

        The IDF corpus is the brand's own valid messages. Predicted class 0
        goes to the promo group, 1 to engage, 2 to product and 3 to brand.

        :param keyword_dict_path: JSON keyword dictionary handed to TextMining
        :return: promo pids, engage pids, product pids, brand pids
        :rtype: numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray
        """
        valid_pid_array = np.array(self.valid_pid_list)
        if len(self.valid_pid_list) == 0:
            # Nothing to classify: skip building the corpus and return four
            # empty groups.
            return (valid_pid_array, valid_pid_array.copy(),
                    valid_pid_array.copy(), valid_pid_array.copy())
        tm = TextMining(keyword_dict_path,self.valid_msg_list)
        class_result = np.array([tfidf_predict(tm.make_tfidf_vector(msg))
                                 for msg in self.valid_msg_list])
        promo_pid_list = valid_pid_array[class_result==0]
        engage_pid_list = valid_pid_array[class_result==1]
        product_pid_list = valid_pid_array[class_result==2]
        brand_pid_list = valid_pid_array[class_result==3]
        return promo_pid_list, engage_pid_list, product_pid_list, brand_pid_list
    


In [13]:
# Build a Brand for one fan page id. The constructor runs every MongoDB
# lookup and the post classification, so this cell does all the work.
test_brand = Brand("601530866559001")

In [99]:
# Show the fan count of the first and of the last fanpage record, separated
# by a single space.
print("%s %s" % (test_brand.start_fan_count, test_brand.stop_fan_count))

55016 55600


In [112]:
# One line per engagement metric: the label, a space, then the per-post list.
print("\n".join(
    "%s %s" % labelled_counts
    for labelled_counts in (
        ("like_count for each post :", test_brand.like_count_list),
        ("comment_count for each post :", test_brand.comment_count_list),
        ("share_count for each post :", test_brand.share_count_list),
        ("friendTagging_count for each post :", test_brand.tag_count_list),
    )))

like_count for each post : [18, 281, 16, 16, 12, 21, 16, 20, 25, 33, 14, 23, 17, 18, 21, 15, 16, 19, 19, 12, 18, 10, 7, 8, 2766, 6, 8, 21, 582, 11, 8639, 9, 19, 20, 14, 11, 27, 20, 83, 16, 26, 11, 11, 14, 0, 0, 0, 0, 10, 13, 9, 395, 0, 0, 0, 0, 0]
comment_count for each post : [3, 270, 3, 3, 2, 3, 7, 3, 3, 3, 2, 5, 5, 3, 5, 3, 3, 3, 3, 3, 3, 3, 2, 2, 9, 2, 3, 2, 568, 2, 127, 2, 4, 3, 3, 2, 3, 6, 63, 2, 2, 2, 2, 3, 0, 0, 0, 0, 2, 3, 2, 375, 0, 0, 0, 0, 0]
share_count for each post : [0, 213, 1, 2, 2, 0, 0, 1, 4, 2, 1, 1, 2, 2, 1, 0, 1, 4, 2, 2, 2, 1, 0, 0, 11, 2, 2, 3, 541, 5, 135, 3, 1, 2, 3, 1, 3, 3, 59, 1, 2, 2, 2, 1, 0, 0, 0, 0, 0, 6, 1, 225, 0, 0, 0, 0, 0]
friendTagging_count for each post : [0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 230, 0, 37, 0, 0, 0, 0, 0, 0, 0, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 293, 0, 0, 0, 0, 0]
