# 数据获取

## 本小组的数据获取基于封装了 GitHub API 的 PyGithub 库

In [102]:
import time
import json
import github
import pandas as pd
from github import Github
from collections import OrderedDict
from datetime import datetime, timedelta
token = '' # For security the personal token is not included here; anyone re-running this code needs to generate their own

In [103]:
# Used by every task: fetch the requested kind of repository data.
# `type` selects what to query: fork, pr, star, etc.
def get_data(repo, type):
    """Return the repository listing that corresponds to ``type``.

    Args:
        repo: Object exposing the ``get_*`` methods named in the table
            below (a PyGithub repository in this notebook).
        type: One of "fork", "pr", "commit", "issue_comment", "star",
            "pr_comment", "commit_comment" or "issue".

    Returns:
        Whatever the matching ``repo.get_*`` call returns.

    Raises:
        ValueError: If ``type`` is not one of the supported names.
    """
    # Method name and keyword arguments for each supported query type.
    # Pull requests and issues are limited to closed ones; use
    # state='all' here instead to include the open ones as well.
    queries = {
        "fork": ("get_forks", {}),
        "pr": ("get_pulls", {"state": "closed"}),
        "commit": ("get_commits", {}),
        "issue_comment": ("get_issues_comments", {}),
        "star": ("get_stargazers_with_dates", {}),
        "pr_comment": ("get_pulls_comments", {}),
        "commit_comment": ("get_comments", {}),
        "issue": ("get_issues", {"state": "closed"}),
    }
    try:
        method_name, kwargs = queries[type]
    except KeyError:
        # Previously an unknown type fell through every branch and failed
        # with an UnboundLocalError on the return statement.
        supported = ", ".join(sorted(queries))
        raise ValueError(
            f"unsupported data type {type!r}; expected one of: {supported}"
        ) from None
    # Only the selected method is looked up and called.
    return getattr(repo, method_name)(**kwargs)

In [104]:
# Used by tasks 1, 2, 3
# Write the data to an Excel file so it can be processed later
def write_to_file(value_list, data_axis,
                  filename='./data/mozilla_send_issue_replied_time.xlsx',
                  columns=('Issue_Id', 'Time_Used')):
    """Save two parallel sequences as a two-column Excel sheet.

    Args:
        value_list: Values for the second column.
        data_axis: Values for the first column (same length as value_list).
        filename: Destination workbook path. Defaults to the path that was
            previously hard-coded, so existing calls behave as before.
        columns: ``(axis_column, value_column)`` header names. Defaults to
            the previously hard-coded headers.
    """
    import os  # local import so the block does not depend on file-level imports

    axis_column, value_column = columns
    df = pd.DataFrame({axis_column: data_axis, value_column: value_list})
    # Create the target directory if needed so a fresh checkout does not
    # fail just because ./data does not exist yet.
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    df.to_excel(filename, index=False)

In [105]:
# Used by tasks 1, 2, 3, 4
# Helper: build the month timeline
def create_assist_date(start=None, end=None):
    """Return the ordered list of 'YYYY.MM' months from start to end, inclusive.

    Args:
        start: First day, formatted as 'YYYY.MM.DD'. Required.
        end: Last day, formatted as 'YYYY.MM.DD'. Defaults to today's date,
            evaluated on every call rather than once at definition time.

    Returns:
        A list of 'YYYY.MM' strings, one per calendar month touched by the
        range; an empty list when ``end`` is earlier than ``start``.

    Raises:
        TypeError: If ``start`` is omitted.
        ValueError: If either date string does not match 'YYYY.MM.DD'.
    """
    if start is None:
        raise TypeError(
            "create_assist_date() requires a start date formatted as 'YYYY.MM.DD'"
        )
    if end is None:
        end = time.strftime('%Y.%m.%d')
    first_day = datetime.strptime(start, '%Y.%m.%d')
    last_day = datetime.strptime(end, '%Y.%m.%d')
    if last_day < first_day:
        return []

    months = []
    year, month = first_day.year, first_day.month
    # Step month by month and include the end month itself. The previous
    # day-by-day walk stopped one day short, so the end month was dropped
    # whenever `end` fell on the first day of a month.
    while (year, month) <= (last_day.year, last_day.month):
        months.append(datetime(year, month, 1).strftime('%Y.%m'))
        year, month = (year + 1, 1) if month == 12 else (year, month + 1)
    return months

In [106]:
# Used by tasks 1, 2, 3
# Helper: running total
def sum_list(result):
    """Return a copy of ``result`` in which each entry is the running total.

    The input sequence is left untouched; the work happens on ``result.copy()``.
    """
    running = result.copy()
    size = len(result)
    position = 1
    while position < size:
        previous_total = running[position - 1]
        running[position] = previous_total + running[position]
        position += 1
    return running

In [107]:
# Used by task 7
# Per the original authors, the Github library offers no direct way to list
# the comments of one given issue, so the list is built from comments_url.
def get_issue_comments(requester, comments_url):
    """Return a paginated list of IssueComment objects read from ``comments_url``.

    Args:
        requester: Requester object handed to the paginated list.
        comments_url: URL the paginated list starts from.
    """
    element_class = github.IssueComment.IssueComment
    no_params = {}
    return github.PaginatedList.PaginatedList(
        element_class, requester, comments_url, no_params
    )

In [108]:
# Used by tasks 1, 2, 3: count items per month.
# `type` selects what to query: fork, pr, star, etc.
def handle_data(repo, type_timeline, type):
    """Add one to the matching 'YYYY.MM' counter for every fetched item.

    Args:
        repo: Repository passed through to ``get_data``.
        type_timeline: Mapping from 'YYYY.MM' to a count; updated in place.
            With a plain dict, a month that is not already a key raises
            KeyError.
        type: Query type forwarded to ``get_data``.

    Returns:
        The same ``type_timeline`` object.
    """
    for item in get_data(repo, type):
        # Task 3 variant (count merged pull requests by merge month):
        # if item.merged:
        #     month_key = item.merged_at.date().strftime('%Y.%m')
        #     type_timeline[month_key] += 1

        # Tasks 1, 2: commits are dated by committer date, everything else
        # by creation time. Other timestamps that were tried:
        #   item.closed_at, item.starred_at
        if type == "commit":
            stamp = item.commit.committer.date
        else:
            stamp = item.created_at
        month_key = stamp.date().strftime('%Y.%m')

        # When querying issues, entries that have `pull_request` set are
        # skipped so that only plain issues are counted.
        if not (type == "issue" and item.pull_request):
            type_timeline[month_key] += 1
    return type_timeline

In [109]:
# Used by task 4: return the creator and the month of an item.
# `type` selects the kind of item: fork, pr, star, etc.
def handle_meta(type, content):
    """Return ``(actor, 'YYYY.MM')`` for one fetched item.

    The actor and timestamp attributes depend on ``type``:
    commit -> ``author`` / ``commit.committer.date``;
    fork -> ``owner`` / ``created_at``;
    star -> ``user`` / ``starred_at``;
    anything else -> ``user`` / ``created_at``.
    """
    if type == 'commit':
        actor, moment = content.author, content.commit.committer.date
    elif type == 'fork':
        actor, moment = content.owner, content.created_at
    elif type == 'star':
        actor, moment = content.user, content.starred_at
    else:
        actor, moment = content.user, content.created_at
    return actor, moment.date().strftime('%Y.%m')

In [110]:
# Used by tasks 5, 6, 7: compute elapsed days.
# `type` selects what to query: fork, pr, star, etc.
def handle_time(repo, type):
    """Return item numbers and days until the first reply by someone else.

    Only items whose ``comments`` value is truthy are recorded. For each of
    them the smallest day difference between the item's creation date and a
    comment written by a different user is stored; 999999 is stored when no
    such comment exists.

    Returns:
        ``(numbers, days)``: two parallel lists.
    """
    no_reply_marker = 999999
    numbers = []
    first_reply_days = []
    for item in get_data(repo, type):
        # Task 6 variant (days from creation to merge):
        # if item.merged:
        #     numbers.append(item.number)
        #     first_reply_days.append((item.merged_at.date() - item.created_at.date()).days)

        # Task 5 variant (days from creation to close, plain issues only):
        # if type == "issue" and item.pull_request:
        #     continue
        # else:
        #     numbers.append(item.number)
        #     first_reply_days.append((item.closed_at.date() - item.created_at.date()).days)

        # Task 7
        if not item.comments:
            continue
        author_id = item.user.id
        opened_on = item.created_at.date()
        replies = get_issue_comments(repo._requester, item.comments_url)
        # Ignore the author's own replies; keep the day offset of the others.
        delays = [
            (reply.created_at.date() - opened_on).days
            for reply in replies
            if reply.user.id != author_id
        ]
        numbers.append(item.number)
        first_reply_days.append(min([no_reply_marker] + delays))
    return numbers, first_reply_days

In [None]:
# Tasks 1, 2, 3: count items per month for the repository and save them.
repo = Github(token).get_repo('mozilla/send')
data_axis = create_assist_date(
    repo.created_at.date().strftime('%Y.%m.%d')
)
type_timeline = handle_data(
    repo,
    dict.fromkeys(data_axis, 0),  # one zeroed counter per month
    "commit",
)
write_to_file(list(type_timeline.values()), data_axis)

In [None]:
# Task 4: count the distinct contributors seen in each month.
repo = Github(token).get_repo('mozilla/send')
data_axis = create_assist_date(repo.created_at.date().strftime('%Y.%m.%d'))

# Event type names noted by the original authors (kept as a comment; the
# former indented string literal here was an IndentationError):
#   ForkEvent IssueCommentEvent PullRequestReviewCommentEvent WatchEvent CommitCommentEvent
#   IssuesEvent PullRequestEvent
#   CreateEvent PushEvent MemberEvent DeleteEvent

type_list = ['fork', 'issue', 'issue_comment', 'pr_comment', 'star', 'commit', 'commit_comment']
# Per-month contributor ids in first-seen order (this is what gets dumped).
contributors = {month: [] for month in data_axis}
# Per-month id sets used only for fast "already counted?" checks.
seen_ids = {month: set() for month in data_axis}
type_timeline = dict.fromkeys(data_axis, 0)
for type_now in type_list:  # query every kind of activity
    for content in get_data(repo, type_now):
        user, date = handle_meta(type_now, content)
        # Count a user at most once per month; items without a user are skipped.
        if user and user.id not in seen_ids[date]:
            seen_ids[date].add(user.id)
            contributors[date].append(user.id)
            type_timeline[date] += 1
# Checkpoint of the intermediate result; optional.
with open('contributors.json', 'w') as file:
    json.dump(contributors, file)
# print(contributors)
write_to_file(list(type_timeline.values()), data_axis)

In [None]:
# Used by tasks 5, 6, 7
# Compute the per-item day counts for the 'issue' query and save them,
# with the item numbers as the first column.
issue_list, day_list = handle_time(Github(token).get_repo('mozilla/send'), 'issue')
write_to_file(day_list, issue_list)

# 数据分析

In [32]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Directory that holds the exported Excel files.
data_path = "./data"

In [4]:
# Load the star workbook and preview its first rows.
star_data_path = "./data/mozilla_send_star.xlsx"
star_df = pd.read_excel(star_data_path)
star_df.head()

Unnamed: 0,Timeline,Stars,Total_Stars
0,2017.05,1,1
1,2017.06,2,3
2,2017.07,8,11
3,2017.08,1541,1552
4,2017.09,71,1623


In [13]:
from pyecharts import options as opts
from pyecharts.charts import Bar,Line,Pie,WordCloud,EffectScatter,Boxplot
from pyecharts.charts import PictorialBar
from pyecharts.globals import SymbolType
from pyecharts.globals import ThemeType

### 每月新增 Star 和 Fork 的个数

In [9]:
# X-axis labels: the first 50 'Timeline' entries as text. `str` replaces
# `np.string_`, which produced bytes objects and was removed in NumPy 2.0.
x_label = np.array(star_df['Timeline']).astype(str).tolist()[:50]
# Monthly star counts as a plain Python list.
star_data = np.array(star_df['Stars']).tolist()

In [8]:
# Load the fork workbook and take the 'Forks' column as a plain Python list.
fork_df = pd.read_excel("./data/mozilla_send_fork.xlsx")
fork_data = np.array(fork_df['Forks']).tolist()

In [10]:
def draw(x_label, y1_data, y2_data, y1_name, y2_name, title):
    """Build a line chart with two smoothed series that share one x axis.

    Args:
        x_label: Values for the x axis.
        y1_data: Values of the first series (drawn in "#6e9ef1").
        y2_data: Values of the second series (drawn in "#d14a61").
        y1_name: Name of the first series.
        y2_name: Name of the second series.
        title: Chart title.

    Returns:
        The configured ``Line`` chart.
    """
    chart = Line(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))
    chart.add_xaxis(x_label)

    # Both series use the same settings apart from name, data and colour.
    series_specs = (
        (y1_name, y1_data, "#6e9ef1"),
        (y2_name, y2_data, "#d14a61"),
    )
    for series_name, values, line_color in series_specs:
        chart.add_yaxis(
            series_name,
            y_axis=values,
            is_smooth=True,
            symbol="emptyCircle",
            is_symbol_show=False,
            color=line_color,
            label_opts=opts.LabelOpts(is_show=False),
        )

    hidden_labels = opts.LabelOpts(
        is_show=False,
        position="inside",
        formatter="{c}",
    )
    chart.set_series_opts(label_opts=hidden_labels)

    # Axis-triggered tooltip with a cross pointer; x labels rotated 45 degrees.
    tooltip = opts.TooltipOpts(is_show=True, axis_pointer_type="cross", trigger="axis")
    x_axis = opts.AxisOpts(
        name_rotate=60,
        axislabel_opts={"rotate": 45},
        axispointer_opts=opts.AxisPointerOpts(is_show=True),
    )
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title=title),
        tooltip_opts=tooltip,
        xaxis_opts=x_axis,
    )
    return chart

In [11]:
# Plot monthly new stars against monthly new forks (title typo "Frok" fixed).
draw(x_label, star_data, fork_data, "新增Star", "新增fork", "每月新增Star & Fork 的个数").render_notebook()

In [12]:
# Load the opened-issue and closed-issue workbooks and take their count
# columns as plain Python lists.
issue_opened_df = pd.read_excel("./data/mozilla_send_issue_opened.xlsx")
issue_closed_df = pd.read_excel("./data/mozilla_send_issue_closed.xlsx")
issue_opened_data = issue_opened_df['Issues_Opened'].tolist()
issue_closed_data = issue_closed_df['Issues_Closed'].tolist()

### 每月打开 Issue 和 关闭 Issue 的个数

In [13]:
# Plot the opened-issue series against the closed-issue series.
draw(x_label, issue_opened_data, issue_closed_data, "Opened Issue", "Closed Issue", "每月打开Issue & 关闭Issue的个数" ).render_notebook()

### 每月打开 PR 和合入 PR 的个数

In [14]:
# Load the merged-PR and opened-PR workbooks and take their count columns
# as plain Python lists.
pr_merged_df = pd.read_excel("./data/mozilla_send_pr_merged.xlsx")
pr_opened_df = pd.read_excel("./data/mozilla_send_pr_opened.xlsx")
pr_opened_data = pr_opened_df['Prs'].tolist()
pr_merged_data = pr_merged_df['Prs_Merged'].tolist()

In [15]:
# Plot the opened-PR series against the merged-PR series.
draw(x_label, pr_opened_data, pr_merged_data, "PR", "Merged PR", "每月打开PR & 合入PR的个数" ).render_notebook()

###  每月在仓库中活跃（只要有日志产生就算）的不同开发者总数

In [16]:
# Load the event workbook and take the 'Events' column as a plain Python list.
events_df = pd.read_excel("./data/mozilla_send_event.xlsx")
event_data = events_df['Events'].tolist()

In [17]:
# Single-series smoothed line chart built from event_data over x_label.
c = Line(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))
c.add_xaxis(x_label)
c.add_yaxis(
    "每月活跃数",
    y_axis=event_data,
    is_smooth=True,
    symbol="emptyCircle",
    is_symbol_show=False,
    color="#6e9ef1",
    label_opts=opts.LabelOpts(is_show=False),
)
c.set_series_opts(
    label_opts=opts.LabelOpts(is_show=False, position="inside", formatter="{c}")
)
# Axis-triggered tooltip with a cross pointer; x labels rotated 45 degrees.
c.set_global_opts(
    title_opts=opts.TitleOpts(title="每月在仓库中活跃的不同开发者总数"),
    tooltip_opts=opts.TooltipOpts(is_show=True, axis_pointer_type="cross", trigger="axis"),
    xaxis_opts=opts.AxisOpts(
        name_rotate=60,
        axislabel_opts={"rotate": 45},
        axispointer_opts=opts.AxisPointerOpts(is_show=True),
    ),
)

c.render_notebook()

###   Issue 从打开到关闭的平均时长和中位数

In [86]:
# Load the issue closing-time workbook; keep only 'Time_Used' values below 30
# in the list that is later used for plotting.
issue_time_df = pd.read_excel("./data/mozilla_send_issue_closed_time.xlsx")
issue_time_data = issue_time_df[issue_time_df['Time_Used']<30]['Time_Used'].tolist()

In [37]:
# Median and mean of 'Time_Used' over all rows (not only those below 30).
print(np.median(issue_time_df['Time_Used']))
print(np.average(issue_time_df['Time_Used']))

8.0
54.11219512195122


###  PR 从打开到合入的平均时长和中位数

In [82]:
# Load the PR merge-time workbook; keep only 'Time_Used' values below 30
# in the list that is later used for plotting.
pr_time_df = pd.read_excel("./data/mozilla_send_pr_merged_time.xlsx")
pr_time_data = pr_time_df[pr_time_df['Time_Used']<30]['Time_Used'].tolist()

In [38]:
# Median and mean of 'Time_Used' over all rows (not only those below 30).
print(np.median(pr_time_df['Time_Used']))
print(np.average(pr_time_df['Time_Used']))

0.0
4.784722222222222


###   Issue和PR从打开到第一次有人回复（非本人回复）的平均时长和中位数

In [83]:
# Load the reply-time workbook. Rows at the 999999 placeholder are dropped
# first; the plotting list then keeps only 'Time_Used' values below 30.
reply_time_df = pd.read_excel("./data/mozilla_send_replied_time.xlsx")
reply_time_df_temp = reply_time_df[reply_time_df['Time_Used']<999999]
reply_time_data = reply_time_df_temp[reply_time_df_temp['Time_Used']<30]['Time_Used'].tolist()

In [48]:
# Row counts before and after dropping the 999999 placeholder, then the
# median and mean of the remaining 'Time_Used' values.
print(reply_time_df.count())
print(reply_time_df[reply_time_df['Time_Used']<999999].count()) # rows below the placeholder, i.e. all items that did get a reply (per the original note)
print(np.median(reply_time_df_temp['Time_Used']))
print(np.average(reply_time_df_temp['Time_Used']))

Issue_Id     1057
Time_Used    1057
dtype: int64
Issue_Id     879
Time_Used    879
dtype: int64
1.0
18.25028441410694


In [96]:
# Box plot of the three duration lists (values below 30 days only).
c = Boxplot()
c.add_xaxis(["Issue&PR相关操作所用时间箱型图（单位：天）"])
c.add_yaxis("Issue打开到关闭", c.prepare_data([issue_time_data]))
# The next two series previously had their data swapped: the PR merge-time
# label was fed the reply times and vice versa.
c.add_yaxis("Pull Request打开到合入", c.prepare_data([pr_time_data]))
c.add_yaxis("Issue&PR打开到非本人回复", c.prepare_data([reply_time_data]))
c.render_notebook()