In [169]:
import json
import pandas as pd
import numpy  as np
import csv
import datetime
from urllib.request import urlopen

today = datetime.datetime.today()

# Show all data 
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

# Read the information in 'committee-info.json' which contains information such as roste_count, description, established, roster, etc.
# The 'committee-info.json' was downloaded from 'https://whimsy.apache.org/public/'
# committees_json = open('committee-info.json').read()

# or get data directly from 'https://whimsy.apache.org/public/committee-info.json'
url = 'https://whimsy.apache.org/public/committee-info.json'
response = urlopen(url) 
committees_json = response.read()
committees_info = json.loads(committees_json)
committees = committees_info['committees']

# taglist
# 拿到_TreeviewTagDict然后查询
tag_json = open('../public/json/projects_total.json').read()
tag_dict = json.loads(tag_json)




Create a datafame, which contains the members' information for each committee: committee name, name, id, date

In [170]:
committer = pd.DataFrame()

for item in committees:
    committee_info = pd.DataFrame.from_dict(committees[item]['roster'],orient='index',columns=['name','date'])
    committee_info['committee'] = item
    committee_info['description'] = committees[item]['description']

    committer = pd.concat([committer,committee_info],axis = 0)

committer['date'] = pd.to_datetime(committer['date'])
committer['year'] = committer['date'].dt.year
committer['month'] = committer['date'].dt.month
committeeList = list(set(committer['committee']))



summarise the committee information, including the year the committee was created, description, name, total number of people to date, etc.

In [171]:
from pandas import NA

# Extracts the creation time of the committee  e.g. "12/2000, reestablished 10/2002" => 12/2000
def extractDate(x):
    if x:
        return x.replace(',',';').split(';')[0]
    else:
        return NA

committees = pd.DataFrame.from_dict(committees,orient='index',columns=[ "display_name","established","roster_count"])
committees['established'] = pd.to_datetime(committees['established'].map(extractDate))
committees['description'] = [committees_info['committees'][index]['description'] for index in committees.index]
committees.reset_index(inplace=True)
committees.rename(columns={'index':'committee_name'},inplace=True)

Process the data to graph the annual growth in committee size: yaxis, xaixs, value=[[x1,y1,size1],[x1,y2,size2]]

Since the total number of committees was too large for the graph to display, the data was filtered， removed the committees with the maximum annual growth of less than 15 from the display

In [172]:
# Merge the year and committee columns to create a new column for easy grouping of data


from cmath import log


def dataFilter(raw_df):
    
    committee_list = raw_df['committee']
    drop_list = []
    for committee_name in committee_list:
        sub_df = raw_df[raw_df['committee'] == committee_name] 
        if sub_df['size'].max() < 15:
            drop_list.append(committee_name)
            index = raw_df[raw_df['committee'] == committee_name].index[0]
            raw_df = raw_df.drop(index=index)
    return raw_df

def chartData(raw_df):

    yAxis_value = list(set(raw_df['committee']))
    xAxis_value = list(set(raw_df['year']))
    xAxis_value.sort()
    size_value = []
    for index, row in raw_df.iterrows():
        size_value.append([yAxis_value.index(row['committee']),xAxis_value.index(row['year']),row['size']])
    
    return {
        'yAxis':yAxis_value,
        'xAxis':xAxis_value,
        'size':size_value
    }

def getTag(str):
    if pd.isna(str) == False:
        try:
            return tag_dict[str]["tag"]
        except:
            return str
    else:
        return ''
    
# 获取tag
def addTag(raw_df):
    df = raw_df
    df['tag'] = df['committee'].apply(getTag)
    return df

def getData(raw_df):
    df = raw_df.groupby(['committee','year'])['name'].count().to_frame()
    df.reset_index(inplace=True) #turn the grouped indexes into columns
    df.rename(columns={'name':'size'},inplace=True)
    result_df = chartData(df)

    return result_df

# 按照tag进行分组，并且输出
def ScatterData(raw_df):
    df = addTag(raw_df)
    tag_list =list(set(','.join(list(df['tag'])).split(',')))
    tag_list.remove('')
    tag_list.remove('apache')
    result = {}
    for tag in tag_list:
        sub_df = df[ df['tag'].apply(lambda x: tag in x)]
        sub_dict = getData(sub_df)
        result[tag] = sub_dict
        result[tag]['num'] = len(sub_dict['yAxis'])
    res = sorted(result.items(),key=lambda x:x[1]['num'],reverse=True)
    scatter_dict = {}
    for r in res:
        scatter_dict[r[0]] = r[1]
    return scatter_dict
        
    
scatter = ScatterData(committer)


AttributeError: 'NoneType' object has no attribute 'remove'

Subchart: monthly growth graph for the selected committee 

In [None]:

def completeMonthColumn(df):
    # Complete the year-months from the date of creation to the present
    df['year_month'] = df['year'].map(str) +'-' +df['month'].map(str)
    result_dict = {}

    for committee in committeeList:
        sub_df = df[df['committee'] == committee]
        year_list = sub_df['year']
        date_list = list(sub_df['year_month'])
        date_list_new = [str(year) + '-' + str(month) for year in range(int(year_list.min()),today.year + 1) for month in range(1,13)]
        data_list_min = date_list[0]
        data_list_max = str(today.year) + '-' + str(today.month)
        date_list_new = date_list_new[date_list_new.index(data_list_min):date_list_new.index(data_list_max)+1]
        add_list =[]
        for date in date_list_new:
            if date in date_list:
                add_list.append(list(sub_df[sub_df['year_month'] == date]['add'])[0])
            else:
                add_list.append(0)
        result_dict[committee] = {
            'xAxis':date_list_new,
            'values':add_list,
            'description':committees_info['committees'][committee]["description"],
            'established':committees_info['committees'][committee]["established"]
        }
        

    return result_dict
            
def lineData(raw_df):
    df = raw_df.groupby(['committee','year','month'])['name'].count().to_frame()
    df.reset_index(inplace = True)
    df.rename(columns={'name':'add'},inplace=True)
    result_dict = completeMonthColumn(df)
    return result_dict
    

committee_detail_dict = lineData(committer)

store the data


In [None]:
committer_dic = {
    'scatter':scatter,
    "committee_detail":committee_detail_dict
}

file_name = '../public/json/committer.json'

with open(file_name,'w') as file_obj:
    json.dump(committer_dic,file_obj)