# 数据获取

## 本小组数据获取由基于api封装的PyGithub库和原生api共同完成

In [1]:
import time
import json
import github
import requests
import pandas as pd
from github import Github
from collections import OrderedDict
from datetime import datetime, timedelta
# For security reasons the personal token is not published here; anyone who
# needs to re-run the code has to generate their own token and paste it below.
token = ''
# Headers for the raw REST calls. The Authorization entry is derived from
# `token` so the credential only has to be filled in once; it is left out
# while `token` is empty so that no placeholder credential is ever sent.
header = {
    'Accept': 'application/vnd.github.v3+json',
}
if token:
    header['Authorization'] = 'token ' + token

### 工具函数

In [2]:
# Used by tasks 1, 2 and 3.
# Utility: cumulative sum.
def sum_list(result):
    """Return a copy of ``result`` where each element is the running total.

    The input itself is left untouched; element ``k`` of the returned copy
    equals ``result[0] + ... + result[k]``.
    """
    running = result.copy()
    # Position 0 is already its own total; fold each later value onto the
    # total accumulated just before it.
    for position, value in enumerate(result[1:], start=1):
        running[position] = running[position - 1] + value
    return running

In [3]:
# Used by tasks 1, 2, 3 and 4.
# Utility: build the month-level timeline.
def create_assist_date(end=None):
    """Return the 'YYYY.MM' label of every month from 2015.11 up to ``end``.

    Args:
        end: Last day to cover, formatted as '%Y.%m.%d'. Defaults to the
            current local date, resolved on every call.

    Returns:
        Chronologically ordered list of month labels. The month containing
        ``end`` is included. Empty when ``end`` is earlier than 2015.11.7.

    Raises:
        ValueError: If ``end`` does not match the '%Y.%m.%d' format.
    """
    if end is None:
        # Resolve "today" at call time; a default computed in the signature
        # is evaluated once, when the function is defined, and then goes stale.
        end = time.strftime('%Y.%m.%d')
    start = datetime.strptime('2015.11.7', '%Y.%m.%d')
    end = datetime.strptime(end, '%Y.%m.%d')
    if end < start:
        return []
    months = []
    year, month = start.year, start.month
    # Walk month by month so the month holding ``end`` is always kept, even
    # when ``end`` falls on the first day of that month.
    while (year, month) <= (end.year, end.month):
        months.append('%04d.%02d' % (year, month))
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)
    return months

In [4]:
# Write the collected data to an Excel file so it can be processed later.
def write_to_file(value_list, data_axis,
                  filename='./data/tensorflow_pr_merged_time.xlsx',
                  columns=('Issue_Id', 'Time_Used')):
    """Save two parallel sequences as a two-column Excel sheet.

    Args:
        value_list: Values for the second column.
        data_axis: Values for the first column (same length as ``value_list``).
        filename: Destination .xlsx path; the default is the path the
            notebook originally hard-coded.
        columns: ``(axis_column, value_column)`` header names; the default is
            the pair the notebook originally hard-coded (tasks 4, 5, 6, 7).
    """
    import os

    # Variant previously swapped in by hand for tasks 1, 2 and 3:
    # result_sum = sum_list(value_list)
    # df = pd.DataFrame({'Timeline': data_axis, 'Issue_closed': value_list, 'Total_Issues_closed': result_sum})
    axis_column, value_column = columns
    df = pd.DataFrame({axis_column: data_axis, value_column: value_list})
    # Create the target directory on demand so a fresh checkout can write
    # its first result file.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)
    df.to_excel(filename, index=False)

In [5]:
# Used by task 7.
# According to the original author the Github library lacks a way to fetch the
# comments of one specific issue, so the list is built from its comments_url.
def get_issue_comments(requester, comments_url):
    """Return a PaginatedList of IssueComment items read from ``comments_url``."""
    comment_class = github.IssueComment.IssueComment
    first_params = {}
    return github.PaginatedList.PaginatedList(comment_class, requester, comments_url, first_params)

In [6]:
# Used by task 4: fetch the records of one activity type.
# ``type`` selects what is queried: fork, pr, star, ...
def get_data(repo, type):
    """Return the collection produced by the ``repo`` getter matching ``type``.

    Args:
        repo: Object exposing the ``get_*`` methods named below (the notebook
            passes a PyGithub repository).
        type: One of 'fork', 'pr', 'commit', 'issue_comment', 'star',
            'pr_comment', 'commit_comment', 'issue'.

    Returns:
        Whatever the selected ``repo`` method returns.

    Raises:
        ValueError: If ``type`` is not one of the supported names.
    """
    # type -> (method name, keyword arguments). Methods are looked up by name
    # so that only the requested getter is ever invoked.
    fetchers = {
        'fork': ('get_forks', {}),
        'pr': ('get_pulls', {'state': 'closed'}),  # alternative: state='all'
        'commit': ('get_commits', {}),
        'issue_comment': ('get_issues_comments', {}),
        'star': ('get_stargazers_with_dates', {}),
        'pr_comment': ('get_pulls_comments', {}),
        'commit_comment': ('get_comments', {}),
        'issue': ('get_issues', {'state': 'closed'}),  # alternative: state='all'
    }
    try:
        method_name, kwargs = fetchers[type]
    except KeyError:
        # An unrecognised type used to fall through every branch and fail on
        # the unbound result variable; report the real problem instead.
        raise ValueError(
            'unsupported type %r; expected one of %s' % (type, ', '.join(sorted(fetchers)))
        ) from None
    return getattr(repo, method_name)(**kwargs)

In [7]:
# Used by task 4: return the creator and the month of one record.
# ``type`` names the kind of record: fork, pr, star, ...
def handle_meta(type, content):
    """Return ``(actor, 'YYYY.MM')`` for one record of the given ``type``.

    The actor and timestamp attributes depend on the record kind:
    commit -> ``author`` / ``commit.committer.date``;
    fork -> ``owner`` / ``created_at``;
    star -> ``user`` / ``starred_at``;
    anything else -> ``user`` / ``created_at``.
    """
    if type == 'commit':
        actor, stamp = content.author, content.commit.committer.date
    elif type == 'fork':
        actor, stamp = content.owner, content.created_at
    elif type == 'star':
        actor, stamp = content.user, content.starred_at
    else:
        actor, stamp = content.user, content.created_at
    # Only the month matters to the caller, so format it once here.
    return actor, stamp.date().strftime('%Y.%m')

### 具体流程

In [None]:
# Task 1: monthly counts of new forks (see the commented alternatives for stars).
# The raw API replaces the PyGithub library in some places because the library,
# aiming at general applicability, issues too many API requests internally,
# which makes it slow and runs into the API rate limit.
data_axis = create_assist_date()
type_timeline = {}.fromkeys(data_axis, 0)
# Checkpoint resume: overlay saved counts when a checkpoint exists; a first
# run simply starts from zeros instead of failing on the missing file.
# NOTE(review): task 4 checkpoints to the same contributors.json with list
# values - remove a stale file before switching between the two tasks.
try:
    with open("contributors.json", 'r', encoding='UTF-8') as f:
        type_timeline.update(json.load(f))
except FileNotFoundError:
    pass
# NOTE(review): the loop always starts at page 1, so when resuming from a
# checkpoint the start of the range has to be adjusted by hand to avoid
# counting pages twice.
for i in range(1, 857):  # page count read from the `last` link in the API response headers (the author also noted 1658)
    # Endpoint taken from the API docs; per_page is the page size, page the current page number.
    response = requests.get("https://api.github.com/repos/tensorflow/tensorflow/forks?per_page=100&page=" + str(i),
                            headers=header)
#     response = requests.get("https://api.github.com/repos/tensorflow/tensorflow/stargazers?per_page=100&page=" + str(i),
#                             headers=header)
    # Surface HTTP failures (rate limit, bad credentials) as such instead of
    # tripping over the error payload further down.
    response.raise_for_status()
    for content in response.json():
        # Use 'starred_at' instead of 'created_at' for the stargazers variant.
        month = datetime.strptime(content['created_at'].split('T')[0], '%Y-%m-%d').strftime("%Y.%m")
        type_timeline[month] += 1
    with open('contributors.json', 'w') as file:  # the API is very unstable, so save after every page
        json.dump(type_timeline, file)
    print(i)
# Emit values in timeline order so both columns always have the same length.
write_to_file([type_timeline[month] for month in data_axis], data_axis)

In [None]:
# Task 2: number of issues closed in each month.
data_axis = create_assist_date()
type_timeline = {}.fromkeys(data_axis, 0)
for i in range(1, 539):
    response = requests.get("https://api.github.com/repos/tensorflow/tensorflow/issues?per_page=100&state=closed&page=" + str(i),
                            headers=header)  # alternative: state=all
    # Surface HTTP failures (rate limit, bad credentials) as such instead of
    # tripping over the error payload further down.
    response.raise_for_status()
    for content in response.json():
        if 'pull_request' not in content:  # skip entries that are actually pull requests
            closed_month = datetime.strptime(content['closed_at'].split('T')[0], '%Y-%m-%d').strftime("%Y.%m")
            type_timeline[closed_month] += 1
write_to_file(list(type_timeline.values()), data_axis)

In [None]:
# Task 3: number of pull requests merged in each month.
data_axis = create_assist_date()
type_timeline = {}.fromkeys(data_axis, 0)
for i in range(1, 211):
    response = requests.get("https://api.github.com/repos/tensorflow/tensorflow/pulls?per_page=100&state=closed&page=" + str(i),
                            headers=header)  # alternative: state=all
    # Surface HTTP failures (rate limit, bad credentials) as such instead of
    # tripping over the error payload further down.
    response.raise_for_status()
    for content in response.json():
        # Closed PRs with an empty merged_at were closed without being merged.
        if content['merged_at']:
            merged_month = datetime.strptime(content['merged_at'].split('T')[0], '%Y-%m-%d').strftime("%Y.%m")
            type_timeline[merged_month] += 1
write_to_file(list(type_timeline.values()), data_axis)

In [None]:
# Task 4: number of distinct developers active in each month.
repo = Github(token).get_repo('tensorflow/tensorflow')
data_axis = create_assist_date()

# Event type names noted by the original author (kept as a comment; an
# indented string literal at this level is a syntax error):
#   ForkEvent IssueCommentEvent  PullRequestReviewCommentEvent WatchEvent CommitCommentEvent
#   IssuesEvent PullRequestEvent
#   CreateEventPushEvent MemberEvent DeleteEvent

type_list = ['fork', 'issue', 'issue_comment', 'pr_comment', 'star', 'commit', 'commit_comment']
# month label -> ids of the users already counted for that month
contributors = {month: [] for month in data_axis}
# Checkpoint resume: overlay saved ids when a checkpoint exists; a first run
# starts from empty lists instead of failing on the missing file.
try:
    with open("contributors.json", 'r', encoding='UTF-8') as f:
        contributors.update(json.load(f))
except FileNotFoundError:
    pass
for type_now in type_list:  # query every supported kind of activity
    for content in get_data(repo, type_now):
        user, date = handle_meta(type_now, content)
        # Has this person already been counted for the month?
        if user and user.id not in contributors[date]:
            # Not yet: record them.
            contributors[date].append(user.id)
            # Checkpoint only when something changed; rewriting an unchanged
            # file for every record adds nothing.
            with open('contributors.json', 'w') as file:
                json.dump(contributors, file)
# print(contributors)
# Derive the counts from the id lists so that ids restored from a checkpoint
# are included, not only the ones discovered during this run.
type_timeline = {month: len(contributors[month]) for month in data_axis}
write_to_file(list(type_timeline.values()), data_axis)

In [None]:
# Task 5: whole days between opening and closing of every closed issue.
# The results stay in issue_id / time_list; this cell does not write them out.
issue_id = []
time_list = []
for i in range(1, 539):
    response = requests.get("https://api.github.com/repos/tensorflow/tensorflow/issues?per_page=100&state=closed&page=" + str(i),
                            headers=header)
    # Surface HTTP failures (rate limit, bad credentials) as such instead of
    # tripping over the error payload further down.
    response.raise_for_status()
    for content in response.json():
        if 'pull_request' not in content:  # skip entries that are actually pull requests
            opened_at = datetime.strptime(content['created_at'], '%Y-%m-%dT%H:%M:%SZ')
            closed_at = datetime.strptime(content['closed_at'], '%Y-%m-%dT%H:%M:%SZ')
            issue_id.append(content['number'])
            time_list.append((closed_at - opened_at).days)

In [None]:
# Task 6: whole days between opening and merging of every merged pull request.
filename = './data/tensorflow_pr_merged_time.xlsx'
# List-style checkpoint: continue from the rows saved by an earlier run, or
# start with empty lists when no checkpoint file exists yet.
# NOTE(review): the loop always starts at page 1, so when resuming the start
# of the range has to be adjusted by hand to avoid duplicate rows.
try:
    df = pd.read_excel(filename)
    issue_id = df['Pr_id'].tolist()
    time_list = df['Time_used'].tolist()
except FileNotFoundError:
    issue_id = []
    time_list = []
for i in range(1, 211):
    response = requests.get("https://api.github.com/repos/tensorflow/tensorflow/pulls?per_page=100&state=closed&page=" + str(i),
                            headers=header)
    # Surface HTTP failures (rate limit, bad credentials) as such instead of
    # tripping over the error payload further down.
    response.raise_for_status()
    for content in response.json():
        # Closed PRs with an empty merged_at were closed without being merged.
        if content['merged_at']:
            opened_at = datetime.strptime(content['created_at'], '%Y-%m-%dT%H:%M:%SZ')
            merged_at = datetime.strptime(content['merged_at'], '%Y-%m-%dT%H:%M:%SZ')
            issue_id.append(content['number'])
            time_list.append((merged_at - opened_at).days)
    # List-style checkpoint: persist everything gathered so far after each page.
    df = pd.DataFrame({'Pr_id': issue_id, 'Time_used': time_list})
    df.to_excel(filename, index=False)
    print(i)

In [None]:
# Task 7: days from creation of an issue/PR to the first comment written by
# someone other than its author.
filename = './data/tensorflow_replied_time.xlsx'
# Stored when nobody but the author ever commented; the analysis section
# filters on this exact value.
NO_REPLY = 999999
# Checkpoint resume: continue from saved rows, or start empty on a first run.
# NOTE(review): the loop always starts at page 1, so when resuming the start
# of the range has to be adjusted by hand to avoid duplicate rows.
try:
    df = pd.read_excel(filename)
    issue_id = df['Issue_id'].tolist()
    time_list = df['Time_used'].tolist()
except FileNotFoundError:
    issue_id = []
    time_list = []
# The requester does not depend on the issue, so build it once instead of
# creating a new client and repository object for every commented issue.
requester = Github(token, per_page=100).get_repo('tensorflow/tensorflow')._requester
for i in range(1, 563):
    response = requests.get("https://api.github.com/repos/tensorflow/tensorflow/issues?per_page=100&state=all&page=" + str(i),
                            headers=header)
    # Surface HTTP failures (rate limit, bad credentials) as such instead of
    # tripping over the error payload further down.
    response.raise_for_status()
    for content in response.json():
        if content['comments']:
            owner_id = content['user']['id']
            min_time = NO_REPLY
            create_time = datetime.strptime(content['created_at'], '%Y-%m-%dT%H:%M:%SZ').date()
            for issue_content in get_issue_comments(requester, content['comments_url']):
                # Ignore the author's own comments and keep the smallest delay.
                if owner_id != issue_content.user.id:
                    min_time = min((issue_content.created_at.date() - create_time).days, min_time)
            issue_id.append(content['number'])
            time_list.append(min_time)
    # Persist everything gathered so far after each page.
    df = pd.DataFrame({'Issue_id': issue_id, 'Time_used': time_list})
    df.to_excel(filename, index=False)
    print(i)

# 数据分析

In [8]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# Directory that holds the Excel files read by the analysis cells below.
data_path = "./data"

In [10]:
# Load the per-month fork table and preview its first rows.
fork_data_path = "./data/tensorflow_fork.xlsx"
fork_df = pd.read_excel(fork_data_path)
fork_df.head()

Unnamed: 0,Timeline,Fork
0,2015-11-01,3628
1,2015-12-01,847
2,2016-01-01,1237
3,2016-02-01,997
4,2016-03-01,1042


In [11]:
from pyecharts import options as opts
from pyecharts.charts import Bar,Line,Pie,WordCloud,EffectScatter,Boxplot
from pyecharts.charts import PictorialBar
from pyecharts.globals import SymbolType
from pyecharts.globals import ThemeType

### 每月新增 Star 和 Fork 的个数

In [12]:
# Shared x-axis labels ("YYYY-MM"), built from the first 74 rows of the fork
# timeline, plus the fork counts as a plain list.
timeline = fork_df['Timeline'].tolist()[:74]
x_label = [stamp.strftime("%Y-%m") for stamp in timeline]
fork_data = np.array(fork_df['Fork']).tolist()

In [13]:
# Load the star table and extract the 'Star' column as a plain list.
star_df = pd.read_excel("./data/tensorflow_star.xlsx")
star_data = np.array(star_df['Star']).tolist()

In [14]:
def draw(x_label, y1_data, y2_data, y1_name, y2_name, title):
    """Build a smoothed two-series line chart over a shared x axis.

    Args:
        x_label: Values for the x axis.
        y1_data: Values of the first series (drawn in "#6e9ef1").
        y2_data: Values of the second series (drawn in "#d14a61").
        y1_name: Legend name of the first series.
        y2_name: Legend name of the second series.
        title: Chart title.

    Returns:
        The configured ``Line`` chart; the caller decides how to render it.
    """
    chart = Line(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))
    chart = chart.add_xaxis(x_label)

    # Both series share every setting except name, data and colour.
    series_specs = (
        (y1_name, y1_data, "#6e9ef1"),
        (y2_name, y2_data, "#d14a61"),
    )
    for series_name, series_values, series_color in series_specs:
        chart = chart.add_yaxis(
            series_name,
            is_smooth=True,
            symbol="emptyCircle",
            is_symbol_show=False,
            color=series_color,
            y_axis=series_values,
            label_opts=opts.LabelOpts(is_show=False),
        )

    hidden_labels = opts.LabelOpts(
        is_show=False,
        position="inside",
        formatter="{c}",
    )
    chart = chart.set_series_opts(label_opts=hidden_labels)

    # Cross-hair tooltip triggered per axis position, with rotated x labels.
    x_axis = opts.AxisOpts(
        name_rotate=60,
        axislabel_opts={"rotate": 45},
        axispointer_opts=opts.AxisPointerOpts(is_show=True),
    )
    chart = chart.set_global_opts(
        title_opts=opts.TitleOpts(title=title),
        tooltip_opts=opts.TooltipOpts(is_show=True, axis_pointer_type="cross", trigger="axis"),
        xaxis_opts=x_axis,
    )
    return chart

In [26]:
# Render the monthly new-star / new-fork chart inline in the notebook.
draw(x_label, star_data, fork_data, "新增Star", "新增fork", "每月新增Star & Fork 的个数" ).render_notebook()

In [16]:
# Load the opened-issue and closed-issue tables and pull out their count
# columns ('Issue' and 'Issue_closed') as plain lists.
issue_opened_df = pd.read_excel("./data/tensorflow_issue.xlsx")
issue_closed_df = pd.read_excel("./data/tensorflow_issue_closed.xlsx")
issue_opened_data = issue_opened_df['Issue'].tolist()
issue_closed_data = issue_closed_df['Issue_closed'].tolist()

### 每月打开 Issue 和 关闭 Issue 的个数

In [17]:
# Render the opened-issue / closed-issue chart inline in the notebook.
draw(x_label, issue_opened_data, issue_closed_data, "Opened Issue", "Closed Issue", "每月打开Issue & 关闭Issue的个数" ).render_notebook()

### 每月打开 PR 和合入 PR 的个数

In [18]:
# Load the merged-PR and opened-PR tables and pull out their count columns
# ('Pr_Merged' and 'Pr') as plain lists.
pr_merged_df = pd.read_excel("./data/tensorflow_pr_merged.xlsx")
pr_opened_df = pd.read_excel("./data/tensorflow_pr.xlsx")
pr_opened_data = pr_opened_df['Pr'].tolist()
pr_merged_data = pr_merged_df['Pr_Merged'].tolist()

In [19]:
# Render the opened-PR / merged-PR chart inline in the notebook.
draw(x_label, pr_opened_data, pr_merged_data, "PR", "Merged PR", "每月打开PR & 合入PR的个数" ).render_notebook()

###  每月在仓库中活跃（只要有日志产生就算）的不同开发者总数

In [20]:
# Load the event table and extract the 'Event' column as a plain list
# (plotted below as the monthly count of active developers).
events_df = pd.read_excel("./data/tensorflow_event.xlsx")
event_data = events_df['Event'].tolist()

In [21]:
# Single-series smoothed line chart of `event_data` over the shared x axis.
c = Line(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))
c = c.add_xaxis(x_label)
c = c.add_yaxis(
    "每月活跃数",
    is_smooth=True,
    symbol="emptyCircle",
    is_symbol_show=False,
    color="#6e9ef1",
    y_axis=event_data,
    label_opts=opts.LabelOpts(is_show=False),
)
c = c.set_series_opts(
    label_opts=opts.LabelOpts(is_show=False, position="inside", formatter="{c}"),
)
# Cross-hair tooltip triggered per axis position, with rotated x labels.
c = c.set_global_opts(
    title_opts=opts.TitleOpts(title="每月在仓库中活跃的不同开发者总数"),
    tooltip_opts=opts.TooltipOpts(is_show=True, axis_pointer_type="cross", trigger="axis"),
    xaxis_opts=opts.AxisOpts(
        name_rotate=60,
        axislabel_opts={"rotate": 45},
        axispointer_opts=opts.AxisPointerOpts(is_show=True),
    ),
)

c.render_notebook()

###   Issue 从打开到关闭的平均时长和中位数

In [22]:
# Load the issue open-to-close durations; issue_time_data keeps only the rows
# whose 'Time_used' is below 30 (presumably days - confirm against the
# collection step).
issue_time_df = pd.read_excel("./data/tensorflow_issue_closed_time.xlsx")
issue_time_data = issue_time_df[issue_time_df['Time_used']<30]['Time_used'].tolist()

In [23]:
# Median and mean of 'Time_used' over all rows (the <30 subset is not used here).
print(np.median(issue_time_df['Time_used']))
print(np.average(issue_time_df['Time_used']))

14.0
106.6514740404256


###  PR 从打开到合入的平均时长和中位数

In [24]:
# Load the PR open-to-merge durations and print the median and mean of 'Time_used'.
pr_time_df = pd.read_excel("./data/tensorflow_pr_merged_time.xlsx")
print(np.median(pr_time_df['Time_used']))
print(np.average(pr_time_df['Time_used']))

1.0
11.84401855135725


###   Issue和PR从打开到第一次有人回复（非本人回复）的平均时长和中位数

In [25]:
# First-reply delays: 999999 is the value stored when nobody but the author
# replied, so those rows are excluded before computing the statistics.
reply_time_df = pd.read_excel("./data/tensorflow_replied_time.xlsx")
has_reply = reply_time_df['Time_used'] < 999999
reply_time_df_temp = reply_time_df[has_reply]
print(reply_time_df.count())
print(reply_time_df_temp.count())  # every entry that received a reply
replied_delays = reply_time_df_temp['Time_used']
print(np.median(replied_delays))
print(np.average(replied_delays))

Issue_id     51553
Time_used    51553
dtype: int64
Issue_id     48837
Time_used    48837
dtype: int64
1.0
15.390277863095603
