In [1]:
import json
import unicodecsv as csv
import pymongo

In [2]:
# Flatten all JSON objects in a list so they share one set of keys, collecting each key's values into a corresponding list
def flat_json(json_list):
    """
    Flatten a list of JSON-like dictionaries into a single dictionary of lists.

    The keys of the first element define the keys of the result, so every
    element must contain all of those keys.

    :param json_list: a list of dict-like objects (e.g. documents queried from MongoDB).
    :return: a dictionary mapping each key of the first element to the list of
        values stored under that key, in input order. An empty ``json_list``
        gives an empty dictionary. ``None`` is returned, after printing a
        message, when the input cannot be flattened (e.g. it is not a list of
        dictionaries, or an element lacks one of the keys).
    :rtype: dictionary or None
    """
    try:
        # An empty input is not an error: there is simply nothing to collect.
        if len(json_list) == 0:
            return {}
        key_list = list(json_list[0].keys())
        return {key: [json_obj[key] for json_obj in json_list] for key in key_list}
    except Exception:
        # Best-effort contract kept from the original: report and return None.
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate. The parenthesised print of a single string
        # behaves the same on Python 2 and Python 3.
        print("Error occured when flattening json_list.")
        return None

In [3]:
# Import a csv file
def _coerce_cell(value):
    """Return ``int(value)`` when ``value`` is a string made only of digits, else ``value`` unchanged."""
    try:
        return int(value) if value.isdigit() else value
    except (AttributeError, ValueError):
        # AttributeError: the cell is not a string (for instance None filled in
        # for a row shorter than the header).
        # ValueError: isdigit() accepted a character that int() cannot parse
        # (e.g. a superscript digit).
        return value


def csv_to_list(csv_file_path):
    """
    Read a csv file, turning every row into a dictionary keyed by the header row.

    Cells consisting only of digits are converted to ``int`` (so leading zeros
    are lost, e.g. ``"007"`` becomes ``7``); every other cell is kept as read.

    :param csv_file_path: the path to csv file.
    :return: a list of dictionaries, one dictionary per row of the csv file.
    :rtype: list
    """
    # unicodecsv does the decoding itself and expects a byte stream, so the
    # file is opened in binary mode.
    with open(csv_file_path, 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        return [{k: _coerce_cell(v) for k, v in row.items()} for row in reader]

In [4]:
# Export a csv file for manual labeling; opening it with Numbers is recommended
def list_to_csv(dict_list, csv_file_path):
    """
    Write a list of dictionaries to a csv file, intended for later manual labeling.

    The keys of the first dictionary become the header row and the column
    order. An empty ``dict_list`` produces an empty file.

    :param dict_list: a list of dictionaries, each dictionary represents a data collection of a post.
    :param csv_file_path: path to csv file.
    """
    # unicodecsv does the encoding itself and expects a byte stream, so the
    # file is opened in binary mode.
    with open(csv_file_path, 'wb') as f:
        if not dict_list:
            # No rows means no header can be derived; leave the file empty
            # instead of failing on dict_list[0].
            return
        w = csv.DictWriter(f, list(dict_list[0].keys()))
        w.writeheader()
        w.writerows(dict_list)

In [5]:
# Database set-up: connect to the MongoDB server on localhost and select the
# `fanpage_research` database. Requires the import cell to have been run first.
client = pymongo.MongoClient(host="localhost", port=27017)
db = client["fanpage_research"]
# Last expression of the cell: displays the selected database name.
db.name

NameError: name 'pymongo' is not defined

In [None]:
def get_valid_pid_list():
    """
    Get the ids of the posts that are usable for research.

    A post id is kept when both conditions hold:

    * ``db.post`` has a document for it whose ``message`` passes the
      ``{"$ne": None}`` match, and
    * the sum of ``like_count`` over its ``db.post_detail`` documents is not 0.

    Relies on the module-level ``db`` handle.

    :return: the valid post ids, without duplicates and in no particular order.
        An empty list is returned when no post qualifies, and also (after
        printing a message) when querying fails.
    :rtype: list
    """
    try:
        # Ids of posts that have message content.
        pids_with_message = set(
            post["pid"]
            for post in db.post.aggregate([{"$match": {"message": {"$ne": None}}}])
        )

        # Ids of posts whose like_count, summed per pid, is non-zero.
        pids_with_like = set(
            grouped["_id"]
            for grouped in db.post_detail.aggregate([
                {"$group": {"_id": "$pid", "like_count_total": {"$sum": "$like_count"}}},
                {"$match": {"like_count_total": {"$ne": 0}}},
            ])
        )

        # Reading the ids straight from the cursors means an empty query result
        # is a normal case (empty intersection), not an error.
        return list(pids_with_message & pids_with_like)
    except Exception:
        # Best-effort contract kept from the original: report and return [].
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate. The parenthesised print of a single string
        # behaves the same on Python 2 and Python 3.
        print("Error occured when getting valid_pid_list")
        return []