In [1]:
import requests
import joblib
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from pandas import option_context
from prompt_toolkit.contrib.telnet import TelnetServer
from tqdm import tqdm
from statsmodels.stats.inter_rater import aggregate_raters
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mannwhitneyu,shapiro,ttest_ind

import cliffsDelta as cd
import GenerateActivities as gat
import important_features as imf

In [2]:
QUERY_ROOT = "https://api.github.com"
with open('MYTOKEN.txt', 'r') as file:
    TOKEN = file.read() # write your GitHub API key here
HEADERS={'Authorization':'token '+TOKEN}

## Section 3

### Question 1

#### Write an automated Python script to identify all GitHub bot actors in the list of contributors, by querying the GitHub REST API users endpoint and extracting the required information to make this decision. 

#### More information: You can search for the "type" key in the obtianed JSON result, If it is "Bot", then it is a bot actor, if it is "User", then it is an account.

In [3]:
# 1. Read the given <sample>.csv file into a pandas DataFrame (df).
# 2. Make sure there are 2 columns, one column name is "contributor" that has the name of the contributor, 
#    and another column name is "bothunter_type" that you will use later on in this assignment.
# 3. Display the DataFrame.

# YOUR CODE BELOW
df = pd.read_csv("Contributors/sample19.csv")
df

Unnamed: 0,contributor,bothunter_type
0,Carreau,Human
1,IanButterworth,Human
2,NilsKrattinger,Human
3,Sonja-Stockhaus,Human
4,Zybulon,Human
5,anilbey,Human
6,beckermr,Human
7,bioc-issue-bot,Bot
8,bwengals,Human
9,chainer-ci,Bot


In [4]:
# 1. Store the names of the contributors in a list
# 2. print the list

# YOUR CODE BELOW
contributors = df.contributor
contributors

0                                  Carreau
1                           IanButterworth
2                           NilsKrattinger
3                          Sonja-Stockhaus
4                                  Zybulon
5                                  anilbey
6                                 beckermr
7                           bioc-issue-bot
8                                 bwengals
9                               chainer-ci
10                                commit-0
11                               coveralls
12                            danielvartan
13                            editorialbot
14                                effigies
15    galaxyproject-sentryintegration[bot]
16                                gbaraldi
17                     github-actions[bot]
18           github-advanced-security[bot]
19                         hackmd-hub[bot]
20                                  hrhotz
21                             imagesc-bot
22                                  jdavcs
23         

In [5]:
# 1. Using a for loop, iterate over each contributor in the list and query the GitHub Users API.
#    You can use "query = f'{QUERY_ROOT}/users/{contributor}'", where QUERY_ROOT is defined at the beginning of this notebook 
#    and 'contributor' is each individual contributor from the list
# 2. Get the response using 'response = requests.get(query, headers=HEADERS)'.
# 3. convert the response to JSON using 'json_response = response.json()'.
# 4. Iterate over each JSON response and get the value of the 'type' key. If it is "Bot" then the contributor is a bot actor, 
#    if "User" then the contributor is an account. You should boolean values to indicate if the contributor is a bot actor (True) or User/Organisation (False)
# 5. Save these results in list of dictionary of the form [{'contributor': <contributor name>, 'app': <boolean value>}, {...}, {...}, {...}].
#    Lets call this list as "app_dict"
# 6. Finally convert this list of dictionary to DataFrame by writing 'pd.DataFrame.from_dict(app_dict)'
# 7. Display the DataFrame. This should have two columns - contributor and app

# YOUR CODE BELOW
app_dict = list()

for contributor in contributors:
    query = f'{QUERY_ROOT}/users/{contributor}'
    response = requests.get(query, headers=HEADERS)
    json_response = response.json()
    app_dict.append({'contributor': contributor, 'app': json_response['type'] == 'Bot'})

app_df = pd.DataFrame.from_dict(app_dict)
app_df



Unnamed: 0,contributor,app
0,Carreau,False
1,IanButterworth,False
2,NilsKrattinger,False
3,Sonja-Stockhaus,False
4,Zybulon,False
5,anilbey,False
6,beckermr,False
7,bioc-issue-bot,False
8,bwengals,False
9,chainer-ci,False


### Question 2

#### Add a new 'actor' column in the CSV file to store this information. You can write True if it is a bot actor and False if it is not.

#### Report on the total number of GitHub bot actors, and User accounts present in the list of accounts of your dataset.

In [6]:
# Merge the app DataFrame to df by writing 'pd.merge(df, <app df>, on='contributor'). This is similar to SQL join on primary key 'contributor'.
# The resultant df should have 3 columns - contributor, bothunter_type and user_type.

# YOUR CODE BELOW
df_merge = pd.merge(df,app_df, on='contributor')
df_merge.rename(columns={'app':'user_type'}, inplace=True)
df_merge

Unnamed: 0,contributor,bothunter_type,user_type
0,Carreau,Human,False
1,IanButterworth,Human,False
2,NilsKrattinger,Human,False
3,Sonja-Stockhaus,Human,False
4,Zybulon,Human,False
5,anilbey,Human,False
6,beckermr,Human,False
7,bioc-issue-bot,Bot,False
8,bwengals,Human,False
9,chainer-ci,Bot,False


### Question 3

#### For each bot actor, write their purpose, the task that they automate by looking at their GitHub profile, homepage, information on the GitHub Marketplace, or other documentation that you can find online.


In [7]:
bots = df_merge[df_merge['user_type']]
bots

Unnamed: 0,contributor,bothunter_type,user_type
15,galaxyproject-sentryintegration[bot],Bot,True
17,github-actions[bot],Bot,True
18,github-advanced-security[bot],Bot,True
19,hackmd-hub[bot],Bot,True
26,lumberbot-app[bot],Bot,True
34,review-notebook-app[bot],Bot,True
37,transifex-integration[bot],Bot,True


| Bot Actor                   | Purpose                                                                 | Automated Tasks |
|-----------------------------|-------------------------------------------------------------------------|----------------|
| **galaxyproject-sentryintegration** | Integrates Sentry's error tracking capabilities into the Galaxy Project, a platform for data-intensive biomedical research. | Automatically reports errors and exceptions from the Galaxy Project to Sentry, facilitating real-time monitoring and debugging. |
| **github-actions**          | Automates workflows directly within GitHub repositories.              | Enables continuous integration and continuous deployment (CI/CD) by automating tasks such as testing, building, and deploying code. |
| **github-advanced-security** | Enhances the security of codebases hosted on GitHub.                  | Provides features like code scanning, secret scanning, and dependency reviews to identify and mitigate security vulnerabilities within repositories. [docs.github.com](https://docs.github.com) |
| **hackmd-hub**              | Integrates HackMD, a collaborative markdown editor, with GitHub.      | Synchronizes markdown documents between HackMD and GitHub repositories, allowing collaborative editing and version control. |
| **review-notebook-app**     | Facilitates the review of Jupyter Notebooks within GitHub pull requests. | Renders Jupyter Notebooks in pull requests, enabling reviewers to view rich notebook content without leaving GitHub. |
| **transifex-integration**   | Connects GitHub repositories with Transifex, a localization platform. | Automates the synchronization of localization files between GitHub and Transifex, ensuring that new content is translated and updated translations are pushed back to the repository. |


## Section 4

### Question 1

#### Write an automated script to use BIMBAS (Bot Identification Model Based on Activity Sequences) to obtain a prediction of the GitHub contributor type (Bot or Human) for the contributors present in your dataset.

#### Folow the steps given in each cell to use BIMBAS and obtain predictions.

### Read all the events and select the events performed by the contributors present in the given set

In [8]:
# 1. Read the csv file of events (all_events.csv) provided along with this notebook
# 2. Convert created_at column to datetime format 
#    One possible way is to use lambda function: "events.assign(created_at=lambda d: pd.to_datetime(d.created_at, unit='ms'))"
# 3. Get the contributors provided to you in a list or use the contributors list that you created in Section 3 Question 1. 
#    e.g., ['contributor1', 'contributor2', 'contributor3',...]
# 4. Select the events performed by these contributors alone. Do not consider all the events. You should consider only the events
#    performed by the contributors provided to you. Hint: you can use the df.query function
# 5. Display the considered events.

# YOUR CODE BELOW
events = pd.read_csv('./all_events.csv')
events = events.assign(created_at=lambda d: pd.to_datetime(d.created_at, unit='ms'))
filtered_events = events.query('login in @contributors')
filtered_events

Unnamed: 0.1,Unnamed: 0,event_id,event_type,login,repository,created_at,action,PR_number,state,PR_node_id,...,comment_node_id,merged,ref,ref_type,issue_number,issue_node_id,issue_closed_at,tag_name,release_node_id,org
6,6,39747396929,CommitCommentEvent,github-actions[bot],QuantEcon/lecture-python-intro,2024-07-01 00:01:47,,,,,...,CC_kwDOIy5DFs4IkKwX,,,,,,,,,QuantEcon
15,15,39747413489,CommitCommentEvent,github-actions[bot],QuantEcon/lecture-python-intro,2024-07-01 00:03:24,,,,,...,CC_kwDOIy5DFs4IkKw0,,,,,,,,,QuantEcon
16,16,39747416135,PushEvent,github-actions[bot],napari/npe2api,2024-07-01 00:03:40,,,,,...,,,,,,,,,,napari
21,21,39747427332,CreateEvent,github-actions[bot],SciML/PubChem.jl,2024-07-01 00:04:44,,,,,...,,,compathelper/new_version/2024-07-01-00-04-42-9...,branch,,,,,,SciML
30,30,39747452343,IssueCommentEvent,github-actions[bot],pandas-dev/pandas,2024-07-01 00:07:06,created,58814.0,open,PR_kwDOAA0YD85wSOEO,...,IC_kwDOAA0YD86DD0Tu,,,,,,,,,pandas-dev
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358374,358374,42415166170,IssueCommentEvent,editorialbot,openjournals/joss-reviews,2024-09-30 23:15:43,created,,open,,...,IC_kwDOAx6EUs6OHpbv,,,,7256.0,I_kwDOAx6EUs6XbOF-,,,,openjournals
358391,358391,42415303969,PushEvent,github-actions[bot],SciML/LineSearch.jl,2024-09-30 23:24:21,,,,,...,,,,,,,,,,SciML
358401,358401,42415474205,PushEvent,github-actions[bot],Bioconductor/gha-build-jupyter-release-amd64,2024-09-30 23:34:53,,,,,...,,,,,,,,,,Bioconductor
358403,358403,42415512378,PushEvent,github-actions[bot],napari/npe2api,2024-09-30 23:37:12,,,,,...,,,,,,,,,,napari


events = pd.read_csv('./all_events.csv')
events = events.assign(created_at=lambda d: pd.to_datetime(d.created_at, unit='ms'))
filtered_events = events.query('login in @contributors')
filtered_events### Execute BIMBAS to obtain the predictions

In [9]:
# Do not modify this cell
# Use the following function the code below to obtain your predictions

def execute_bimbas(contributors_list, selected_events):
    '''
    args: contributors_list (list) - list of contributors in the dataset provided to you
          selected_events (DataFrame) - DataFrame of events performed by the considered contributors
    return: bimbas_prediction (DataFrame) - DataFrame of preditions along with the confidence in prediction for each contributor
    '''
    result=pd.DataFrame()
    temp_list = []
    bimbas = joblib.load('bimbas.joblib')
    date_limit = pd.to_datetime(selected_events.created_at.max()) + pd.DateOffset(-90)
    
    for contributor in tqdm(contributors_list):
        # c = contributors[contributor]
        activities = gat.activity_identification(selected_events.query('login==@contributor and created_at>=@date_limit'))
        activity_features = (
                            imf.extract_features(activities)
                            .set_index([[contributor]])
                            )
        # features = pd.concat([features,activity_features])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            probability = bimbas.predict_proba(activity_features)[0][1]
        if(probability <= 0.5):
            contributor_type = 'Human'
        else:
            contributor_type = 'Bot'
        confidence = (abs(probability - 0.5)*2).round(3)
    
        pred = activity_features.set_index([[contributor]]).assign(
            prediction=contributor_type,
            confidence = confidence,
            )
        temp_list.extend([{'contributor':contributor,'bimbas_type':contributor_type,'confidence':confidence}])
    
    bimbas_prediction = pd.DataFrame.from_dict(temp_list)

    return(bimbas_prediction)

In [10]:
# Write your code here to call execute_bimbas function. 
# Pass the contributors list and the filtered events DataFrame as argument to "execute_bimbas" function
# Prediction returned by BIMBAS will have "contributor", "bimbas_type" and "confidence"
# Note: Print the prediction provided by BIMBAS - else this cell will not be graded.

# YOUR CODE BELOW
bimbas_prediction = execute_bimbas(contributors, filtered_events)
bimbas_prediction

100%|██████████| 40/40 [00:09<00:00,  4.41it/s]


Unnamed: 0,contributor,bimbas_type,confidence
0,Carreau,Human,0.923
1,IanButterworth,Human,0.641
2,NilsKrattinger,Human,0.834
3,Sonja-Stockhaus,Human,0.92
4,Zybulon,Human,0.94
5,anilbey,Bot,0.731
6,beckermr,Human,0.875
7,bioc-issue-bot,Bot,0.548
8,bwengals,Human,0.858
9,chainer-ci,Bot,0.723


### Create a column in your predictions DataFrame to have predictions provided by BIMBAS. 
### Now your DataFrame should have the following columns - contributor, bothunter_type, app, bimbas_type, confidence 

### Merging bothunter and bimbas predictions

In [11]:
# Merge the predictions provided by BIMBAS to the resultant dataset of Section 3. This final DataFrame should have the following columns - 
# contributor, bothunter_type, user_type, bimbas_type, and confidence.
merged_prediction_df = pd.merge(df_merge, bimbas_prediction, on='contributor')
merged_prediction_df

Unnamed: 0,contributor,bothunter_type,user_type,bimbas_type,confidence
0,Carreau,Human,False,Human,0.923
1,IanButterworth,Human,False,Human,0.641
2,NilsKrattinger,Human,False,Human,0.834
3,Sonja-Stockhaus,Human,False,Human,0.92
4,Zybulon,Human,False,Human,0.94
5,anilbey,Human,False,Bot,0.731
6,beckermr,Human,False,Human,0.875
7,bioc-issue-bot,Bot,False,Bot,0.548
8,bwengals,Human,False,Human,0.858
9,chainer-ci,Bot,False,Bot,0.723


### Question 2
#### Using Cohen's Kappa compute and report the interrater agreement score between the labels computed by both the bot identification approaches. Mention your interpretation of Cohen's Kappa

In [14]:
# Hint: use the cohen_kappa_score library
# YOUR CODE BELOW
from sklearn.metrics import cohen_kappa_score

kappa = cohen_kappa_score(merged_prediction_df['bothunter_type'], merged_prediction_df['bimbas_type'])
kappa

np.float64(0.7222222222222222)

### Question 3
#### Determine the final type of each contributor. Whenever user_type column has the value "User" check if both bimbas_type and bothunter_type give the same prediction, then consider it as your final prediction in "acc_type" column.
#### For the contributors that have different predictions, i.e. bimbas_type is not same as bothunter_type, then make a manual verification and consider that as the type for that contributor. Add an extra column to the DataFrame ﬁle called 'manual' that has the label determined by you. 
#### For manual verification you can make use the GitHub UI and the GitHub API https://api.github.com/users/username

#### Whenever user_type column has the value "Bot", directly write your final acc_type as "Bot Actor"

#### More information:
For each contributor, you can look at their activities in GitHub UI, look at their latest events using GitHub Events API - https://api.github.com/users/<contributor>/events, to make a decision on their type.

In addition, you can also write a very small reason why do you think they are bot or human (e.g., same activity at regular interval so Bot, files committed and code modified looks like Human, comments look like Human, comment look like Bot, test report looks automated so Bot, and so on....). This will be useful to answer your next question.

In [15]:
# Write the code for your manual decision here
# example : 
# manual_list_dict = [{'contributor': '<contributor name>', 'manual': '<your prediction>'}, {....}, {....}, {....}]
# pd.DataFrame.from_dict(manual_list_dict)

# YOUR CODE BELOW
bot_deter_df = merged_prediction_df.copy()
bot_deter_df['acc_type'] = bot_deter_df.apply(lambda row: "Manual" if row['bimbas_type']!= row['bothunter_type'] else row['bimbas_type'], axis=1)
bot_deter_df['acc_type'] = bot_deter_df.apply(lambda row: "Bot" if 'bot' in row['contributor'].lower() else row["acc_type"], axis=1)
bot_deter_df

Unnamed: 0,contributor,bothunter_type,user_type,bimbas_type,confidence,acc_type
0,Carreau,Human,False,Human,0.923,Human
1,IanButterworth,Human,False,Human,0.641,Human
2,NilsKrattinger,Human,False,Human,0.834,Human
3,Sonja-Stockhaus,Human,False,Human,0.92,Human
4,Zybulon,Human,False,Human,0.94,Human
5,anilbey,Human,False,Bot,0.731,Manual
6,beckermr,Human,False,Human,0.875,Human
7,bioc-issue-bot,Bot,False,Bot,0.548,Bot
8,bwengals,Human,False,Human,0.858,Human
9,chainer-ci,Bot,False,Bot,0.723,Bot


In [21]:
# Write your code below here to determine the final type of contributor.
# add column named 'acc_type' and write your final decision in it. Final decision is majority of three types - bimbas_type, bothunter_type, manual.

manual = bot_deter_df[bot_deter_df['acc_type'] == 'Manual']

print("anilbey is not a bot because he have a correct pseudo")
print("commit-0 is a bot he commits in several library, he have a pseudo think he is a bot, he doesnt have picture in profile, etc ...")
print("nanosoldier is not a bot because he tell in description of the user")

to_humain = ['anilbey']
to_bot = ['commit-0','nanosoldier']

# Replace acc_type of theseusers according to the manual researches.
bot_deter_df['Manual'] = bot_deter_df.apply(
    lambda row: "Bot" if 'mirror-dump' == row['contributor'] else row["acc_type"], axis=1)
bot_deter_df['Manual'] = bot_deter_df.apply(lambda row: "Human" if row['contributor'] in to_humain else row["Manual"], axis=1)
bot_deter_df['Manual'] = bot_deter_df.apply(lambda row: "Bot" if row['contributor'] in to_bot else row["Manual"], axis=1)
bot_deter_df

anilbey is not a bot because he have a correct pseudo
commit-0 is a bot he commits in several library, he have a pseudo think he is a bot, he doesnt have picture in profile, etc ...
nanosoldier is not a bot because he tell in description of the user


Unnamed: 0,contributor,bothunter_type,user_type,bimbas_type,confidence,acc_type,Manual
0,Carreau,Human,False,Human,0.923,Human,Human
1,IanButterworth,Human,False,Human,0.641,Human,Human
2,NilsKrattinger,Human,False,Human,0.834,Human,Human
3,Sonja-Stockhaus,Human,False,Human,0.92,Human,Human
4,Zybulon,Human,False,Human,0.94,Human,Human
5,anilbey,Human,False,Bot,0.731,Manual,Human
6,beckermr,Human,False,Human,0.875,Human,Human
7,bioc-issue-bot,Bot,False,Bot,0.548,Bot,Bot
8,bwengals,Human,False,Human,0.858,Human,Human
9,chainer-ci,Bot,False,Bot,0.723,Bot,Bot


### Question 4
#### Study and report the purpose of these identified bots and humans in the repository assigned to you.
#### For example, the purpose can be based on but not limited to - 
##### 1) the type of activities that they are performing (releasing a version on every Sunday, updating the documentation), 
##### 2) type of comments they are posting (reviewing code, test summary report), 
##### 3) when are they triggered (when a new PR is created, when someone in the project ask the bot to merge the code) and so on.  
#### Note: 
##### The purpose can be other than what is provided in examples above.

## Section 5

### Question 1
#### Use the filtered events file that has the events performed by the contributors provided to you
#### Group each event into the following four categories

Issues: IssueCommentEvent, IssuesEvent  
Pull Requests: PullRequestEvent, PullRequestReviewCommentEvent  
Commits: CommitCommentEvent, PushEvent  
Repository: CreateEvent, DeleteEvent, ForkEvent, GollumEvent, MemberEvent, PublicEvent, ReleaseEvent, SponsorshipEvent, WatchEvent  

#### Hint:
1. Add a column called event_group that mentions which event group does that event belong to. Each event (row) should correspond to an event group.  
2. Then perform groupby on ['login','event_group'],  
3. use .agg (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html) to count the number of events performed by each contributor in each group,  
4. use pivot with the required arguments (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html). An example is given [here](#pivot_example) in question 2(b), and
5. Reset index and rename axis with None
7. Finally merge it with your 'acc_type' field from the DataFrame you created in Section4, drop contributor, and fillna with 0
8. The final DataFrame should have the following columns - login, PR, commit, issue, repo, acc_type

The final DataFrame should be looking like the following  
![](event_group.png)

In [32]:
# In the resultant DataFrame - each row should corespond to a contributor, and the columns should have all the event groups  
# and the type of the contributor (that you decided in the previous DataFrame) and the values should be the number 
# of events of that event event group the contriubutor has performed.

issue_group = ['IssueCommentEvent', 'IssuesEvent']
pr_group = ['PullRequestEvent', 'PullRequestReviewCommentEvent']
commit_group = ['CommitCommentEvent', 'PushEvent']
repo_group = ['CreateEvent', 'DeleteEvent', 'ForkEvent', 'GollumEvent', 'MemberEvent', 'PublicEvent', 'ReleaseEvent', 'SponsorshipEvent', 'WatchEvent']

In [19]:
# YOUR CODE BELOW
filtered_events['event_group'] = filtered_events['type'].apply(
    lambda x: 'issue' if x in issue_group else
              'PR' if x in pr_group else
              'commit' if x in commit_group else
              'repo' if x in repo_group else
              'other'
)

In [20]:
# YOUR CODE BELOW


### Question 2 (a)

#### Compute the median number of events per event group for Bot+Bot actors and Humans and write in DataFrame.

Row should correspond to type (Bot_BotActor and Human), Column should have Event group name and the values should be the median value of Bot_BotActor or Human for that particular event group. An example is given below

In [15]:
# For example:
medians = [{'event_group': 'event_group1', 'median': 'val1', 'acc_type': 'Bot_app'}, 
           {'event_group': 'event_group1', 'median': 'val2', 'acc_type': 'Human'},
           {'event_group': 'event_group2', 'median': 'val3', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group2', 'median': 'val4', 'acc_type': 'Human'},
           {'event_group': 'event_group3', 'median': 'val5', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group3', 'median': 'val6', 'acc_type': 'Human'},
           {'event_group': 'event_group4', 'median': 'val7', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group5', 'median': 'val8', 'acc_type': 'Human'}]
df_medians = pd.DataFrame.from_dict(medians)
df_medians

Unnamed: 0,event_group,median,acc_type
0,event_group1,val1,Bot_app
1,event_group1,val2,Human
2,event_group2,val3,Bot_app
3,event_group2,val4,Human
4,event_group3,val5,Bot_app
5,event_group3,val6,Human
6,event_group4,val7,Bot_app
7,event_group5,val8,Human


In [22]:
# YOUR CODE BELOW

### Question 2 (b)

Plot a heatmap of the DataFrame using seaborn - 
1. First convert the dataframe to the required format using pivot, example is given below
2. plot using seaborn - sns.heatmap(df_medians, annot=True, vmin=0, vmax=300, cmap="crest"). More details: https://seaborn.pydata.org/generated/seaborn.heatmap.html)

#### pd.pivot example:
<a id='pivot_example'></a>

In [17]:
# Main DataFrame
medians = [{'event_group': 'event_group1', 'median': 'val1', 'acc_type': 'Bot'}, 
           {'event_group': 'event_group1', 'median': 'val2', 'acc_type': 'Human'},
           {'event_group': 'event_group2', 'median': 'val3', 'acc_type': 'Bot'},
           {'event_group': 'event_group2', 'median': 'val4', 'acc_type': 'Human'},
           {'event_group': 'event_group3', 'median': 'val5', 'acc_type': 'Bot'},
           {'event_group': 'event_group3', 'median': 'val6', 'acc_type': 'Human'},
           {'event_group': 'event_group4', 'median': 'val7', 'acc_type': 'Bot'},
           {'event_group': 'event_group5', 'median': 'val8', 'acc_type': 'Human'}]
df_medians = pd.DataFrame.from_dict(medians)
df_medians

Unnamed: 0,event_group,median,acc_type
0,event_group1,val1,Bot
1,event_group1,val2,Human
2,event_group2,val3,Bot
3,event_group2,val4,Human
4,event_group3,val5,Bot
5,event_group3,val6,Human
6,event_group4,val7,Bot
7,event_group5,val8,Human


In [18]:
# pivot the main DataFrame
df_medians.pivot(index='acc_type', columns='event_group', values='median')#.reset_index().rename_axis(None,axis=1)

event_group,event_group1,event_group2,event_group3,event_group4,event_group5
acc_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bot,val1,val3,val5,val7,
Human,val2,val4,val6,,val8


In [25]:
# YOUR CODE BELOW

In [26]:
# YOUR CODE BELOW


#### What is the difference that you observe between Bots+Bot actors and Humans?

#### What is the differnce that you observe between Event groups?

#### What is the difference that you observe between Bots+Bot actors and Humans and Event groups all considering at the same time?

### Question 3

#### Create boxen plots to visualise the distribution of number of events in each event group. 
#### For more information you can visit - https://seaborn.pydata.org/generated/seaborn.boxenplot.html#seaborn.boxenplot 
#### You should highlight the data points that correspond to bots using a stripplot in seaborn. https://seaborn.pydata.org/tutorial/categorical.html#categorical-tutorial  
#### Interpret the results of the visualisation.

In [27]:
# YOUR CODE BELOW - Visualize number of events from pull request event group


In [28]:
# YOUR CODE BELOW - Visualise number of events from issue event group


In [29]:
# YOUR CODE BELOW - Visualise number of events from commit event group


In [30]:
# YOUR CODE BELOW - Visualise number of events from repo event group


### Question 4.1

#### Statistical identify whether number of events in each event group is normally distributed or not.
#### Null hypothesis - $H_0$: Sample comes from the data that has normal distribution.
#### Use Shapiro-Wilk test for this purpose. Use the p-value with a threshold of 0.05 to determine whether $H_0$ can be rejected with statistical significance or not. 

#### Use shapiro from scipy.stats to perform this test (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html)

In [None]:
# YOUR CODE BELOW

### Question 4.2
#### Determine if there is any statistical diﬀerence in number of events between the identified event groups. Perform this test on all pairs of event groups. 
#### Null hypothesis - $H_0$: Any two event group come from the same population.
#### If $H_0$ is rejected in Shapiro-Wilk test (at least for one of the two event groups considered for test), use the Mann-Whitney U statistical test for this purpose. If $H_0$ is not rejected in Shapiro-Wilk test, use the independent t-test for this purpose.  
#### In any case, use the p-value with a threshold of 0.01 to determine whether $H_0$ can be rejected with statistical significance. 

#### Use mannwhitneyu from scipy.stats to perform Mann-Whitney U test (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html) 
#### or 
#### ttest_ind from scipy.stats to perform independent t-test https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html.

#### More information:

You can pass the following arguments to mannwhitneyu from scipy.stats - (method='exact', nan_policy='omit'). For ttest_ind you can use - (nan_policy='omit')

In [31]:
# YOUR CODE BELOW

### Question 4.3
#### Each time you reject the null hypothesis $H_0$, quantify the effect size of the diﬀerence between the groups using cliﬀ’s delta ($\delta$). 
#### To calculate cliﬀ’s delta, you can pass the list of values to cliﬀ delta.py file given in the repository. E.g., cliffsDelta.cliffsDelta(list of values, list of values). This will return the effect size.
#### Refer to the table given in the TP document and mention your interpretation (negligible, small, medium, large).

In [None]:
# YOU CODE BELOW