In [2]:
import requests
import joblib
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from pandas import option_context
from pywin.framework.app import contributors
from tqdm import tqdm
from statsmodels.stats.inter_rater import aggregate_raters
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mannwhitneyu,shapiro,ttest_ind

import cliffsDelta as cd
import GenerateActivities as gat
import important_features as imf

In [3]:
QUERY_ROOT = "https://api.github.com"
TOKEN = '' # write your GitHub API key here
HEADERS={'Authorization':'token '+TOKEN}

## Section 3

### Question 1

#### Write an automated Python script to identify all GitHub bot actors in the list of contributors, by querying the GitHub REST API users endpoint and extracting the required information to make this decision. 

#### More information: You can search for the "type" key in the obtianed JSON result, If it is "Bot", then it is a bot actor, if it is "User", then it is an account.

In [5]:
# 1. Read the given <sample>.csv file into a pandas DataFrame (df).
# 2. Make sure there are 2 columns, one column name is "contributor" that has the name of the contributor, 
#    and another column name is "bothunter_type" that you will use later on in this assignment.
# 3. Display the DataFrame.

# YOUR CODE BELOW
df = pd.read_csv("Contributors/sample8.csv")
df.head()

Unnamed: 0,contributor,bothunter_type
0,Armavica,Human
1,EmilyXinyi,Human
2,GPUtester,Human
3,JuliaTagBot,Bot
4,MarcBerliner,Human


In [6]:
# 1. Store the names of the contributors in a list
# 2. print the list

# YOUR CODE BELOW
contributors = df['contributor'].tolist()
contributors

['Armavica',
 'EmilyXinyi',
 'GPUtester',
 'JuliaTagBot',
 'MarcBerliner',
 'SaimMomin12',
 'ViralBShah',
 'allcontributors[bot]',
 'alxbnct',
 'avik-pal',
 'bioc-workshop-dev',
 'bpinsard',
 'codecov-commenter',
 'cproctor',
 'dependabot[bot]',
 'drammock',
 'fjetter',
 'github-merge-queue[bot]',
 'hayesla',
 'jacobtomlinson',
 'jni',
 'kanishkan91',
 'leofang',
 'lumberbot-app[bot]',
 'martey',
 'mergify[bot]',
 'minhqdao',
 'natefoo',
 'netlify[bot]',
 'oscardssmith',
 'pmeier',
 'rwegener2',
 'sklearn-ci',
 'soupault',
 'stan-buildbot',
 'tardis-bot',
 'timtreis',
 'welcome[bot]',
 'wm75',
 'zeptodoctor']

In [8]:
# 1. Using a for loop, iterate over each contributor in the list and query the GitHub Users API.
#    You can use "query = f'{QUERY_ROOT}/users/{contributor}'", where QUERY_ROOT is defined at the beginning of this notebook 
#    and 'contributor' is each individual contributor from the list
# 2. Get the response using 'response = requests.get(query, headers=HEADERS)'.
# 3. convert the response to JSON using 'json_response = response.json()'.
# 4. Iterate over each JSON response and get the value of the 'type' key. If it is "Bot" then the contributor is a bot actor, 
#    if "User" then the contributor is an account. You should boolean values to indicate if the contributor is a bot actor (True) or User/Organisation (False)
# 5. Save these results in list of dictionary of the form [{'contributor': <contributor name>, 'app': <boolean value>}, {...}, {...}, {...}].
#    Lets call this list as "app_dict"
# 6. Finally convert this list of dictionary to DataFrame by writing 'pd.DataFrame.from_dict(app_dict)'
# 7. Display the DataFrame. This should have two columns - contributor and app

# YOUR CODE BELOW
app_dict = []
for contributor in contributors:
    query = f'{QUERY_ROOT}/users/{contributor}'
    response = requests.get(query, headers=HEADERS)
    json_response = response.json()
    if json_response['type'] == 'Bot':
        app_dict.append({'contributor': contributor, 'app': True})
    else:
        app_dict.append({'contributor': contributor, 'app': False})
app_df = pd.DataFrame.from_dict(app_dict)
app_df

Unnamed: 0,contributor,app
0,Armavica,False
1,EmilyXinyi,False
2,GPUtester,False
3,JuliaTagBot,False
4,MarcBerliner,False
5,SaimMomin12,False
6,ViralBShah,False
7,allcontributors[bot],True
8,alxbnct,False
9,avik-pal,False


### Question 2

#### Add a new 'actor' column in the CSV file to store this information. You can write True if it is a bot actor and False if it is not.

#### Report on the total number of GitHub bot actors, and User accounts present in the list of accounts of your dataset.

In [9]:
# Merge the app DataFrame to df by writing 'pd.merge(df, <app df>, on='contributor'). This is similar to SQL join on primary key 'contributor'.
# The resultant df should have 3 columns - contributor, bothunter_type and user_type.

# YOUR CODE BELOW
df = pd.merge(df, app_df, on='contributor')
df

Unnamed: 0,contributor,bothunter_type,app
0,Armavica,Human,False
1,EmilyXinyi,Human,False
2,GPUtester,Human,False
3,JuliaTagBot,Bot,False
4,MarcBerliner,Human,False
5,SaimMomin12,Human,False
6,ViralBShah,Human,False
7,allcontributors[bot],Bot,True
8,alxbnct,Human,False
9,avik-pal,Human,False


In [10]:
#change the column name to user_type
df.rename(columns={'app':'user_type'}, inplace=True)

In [11]:
df

Unnamed: 0,contributor,bothunter_type,user_type
0,Armavica,Human,False
1,EmilyXinyi,Human,False
2,GPUtester,Human,False
3,JuliaTagBot,Bot,False
4,MarcBerliner,Human,False
5,SaimMomin12,Human,False
6,ViralBShah,Human,False
7,allcontributors[bot],Bot,True
8,alxbnct,Human,False
9,avik-pal,Human,False


### Question 3

#### For each bot actor, write their purpose, the task that they automate by looking at their GitHub profile, homepage, information on the GitHub Marketplace, or other documentation that you can find online.


In [12]:
#For each bot actor, write their purpose, the task that they automate by looking at their GitHub profile, homepage, information on the GitHub Marketplace, or other documentation that you can find online
# YOUR CODE BELOW
#Show the bot actor
df[df['user_type']==True]

Unnamed: 0,contributor,bothunter_type,user_type
7,allcontributors[bot],Bot,True
14,dependabot[bot],Bot,True
17,github-merge-queue[bot],Bot,True
23,lumberbot-app[bot],Bot,True
25,mergify[bot],Bot,True
28,netlify[bot],Bot,True
37,welcome[bot],Bot,True


allcontributors[bot]: a bot to automate the tedious stuff for adding project contributors

dependabot[bot]: Inform you about vulnerabilities in the dependencies that you use in your repository

github-merge-queue[bot]: helps increase velocity by automating pull request merges into a busy branch and ensuring the branch is never broken by incompatible changes

lumberbot-app[bot]: @scientific-python app bot that responds to commands, backport Pull-Request, give fine grained permissions to users.

mergify[bot]: a powerful, innovative tool designed to automate your GitHub workflows

netlify[bot]: detects your changes and triggers automated deploys, functions, and more across our global CDN

welcome[bot]: a GitHub App that welcomes new users to your repository



## Section 4

### Question 1

#### Write an automated script to use BIMBAS (Bot Identification Model Based on Activity Sequences) to obtain a prediction of the GitHub contributor type (Bot or Human) for the contributors present in your dataset.

#### Folow the steps given in each cell to use BIMBAS and obtain predictions.

### Read all the events and select the events performed by the contributors present in the given set

In [14]:
# 1. Read the csv file of events (all_events.csv) provided along with this notebook
# 2. Convert created_at column to datetime format 
#    One possible way is to use lambda function: "events.assign(created_at=lambda d: pd.to_datetime(d.created_at, unit='ms'))"
# 3. Get the contributors provided to you in a list or use the contributors list that you created in Section 3 Question 1. 
#    e.g., ['contributor1', 'contributor2', 'contributor3',...]
# 4. Select the events performed by these contributors alone. Do not consider all the events. You should consider only the events
#    performed by the contributors provided to you. Hint: you can use the df.query function
# 5. Display the considered events.

# YOUR CODE BELOW
events = pd.read_csv("all_events.csv")
events = events.assign(created_at=lambda d: pd.to_datetime(d.created_at, unit='ms'))
selected_events = events[events['login'].isin(contributors)]
selected_events


Unnamed: 0.1,Unnamed: 0,event_id,event_type,login,repository,created_at,action,PR_number,state,PR_node_id,...,comment_node_id,merged,ref,ref_type,issue_number,issue_node_id,issue_closed_at,tag_name,release_node_id,org
29,29,39747452010,IssueCommentEvent,JuliaTagBot,SciML/CommonWorldInvalidations.jl,2024-07-01 00:07:04,created,,closed,,...,IC_kwDOMOEgaM6DD0Tk,,,,5.0,I_kwDOMOEgaM6N6wg8,2024-06-28T18:51:28Z,,,SciML
49,49,39747566230,PullRequestEvent,jni,napari/napari,2024-07-01 00:17:47,closed,7041.0,closed,PR_kwDOCJ0aI85z-iY3,...,,True,,,,,,,,napari
50,50,39747566306,PushEvent,jni,napari/napari,2024-07-01 00:17:47,,,,,...,,,,,,,,,,napari
64,64,39747593861,IssueCommentEvent,netlify[bot],QuantEcon/lecture-python-intro,2024-07-01 00:20:10,created,493.0,open,PR_kwDOIy5DFs50AAal,...,IC_kwDOIy5DFs6DD2NY,,,,,,,,,QuantEcon
163,163,39747766396,CreateEvent,dependabot[bot],SciML/EasyModelAnalysis.jl,2024-07-01 00:34:53,,,,,...,,,dependabot/github_actions/actions/add-to-proje...,branch,,,,,,SciML
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358380,358380,42415246599,PushEvent,avik-pal,SciML/LineSearch.jl,2024-09-30 23:20:39,,,,,...,,,,,,,,,,SciML
358381,358381,42415246464,PullRequestEvent,avik-pal,SciML/LineSearch.jl,2024-09-30 23:20:39,closed,7.0,closed,PR_kwDOLCAB0M579DAU,...,,True,,,,,,,,SciML
358382,358382,42415246730,DeleteEvent,avik-pal,SciML/LineSearch.jl,2024-09-30 23:20:40,,,,,...,,,ap/start,branch,,,,,,SciML
358383,358383,42415258702,CommitCommentEvent,avik-pal,SciML/LineSearch.jl,2024-09-30 23:21:23,,,,,...,CC_kwDOLCAB0M4IyT98,,,,,,,,,SciML


### Execute BIMBAS to obtain the predictions

In [15]:
# Do not modify this cell
# Use the following function the code below to obtain your predictions

def execute_bimbas(contributors_list, selected_events):
    '''
    args: contributors_list (list) - list of contributors in the dataset provided to you
          selected_events (DataFrame) - DataFrame of events performed by the considered contributors
    return: bimbas_prediction (DataFrame) - DataFrame of preditions along with the confidence in prediction for each contributor
    '''
    result=pd.DataFrame()
    temp_list = []
    bimbas = joblib.load('bimbas.joblib')
    date_limit = pd.to_datetime(selected_events.created_at.max()) + pd.DateOffset(-90)
    
    for contributor in tqdm(contributors_list):
        # c = contributors[contributor]
        activities = gat.activity_identification(selected_events.query('login==@contributor and created_at>=@date_limit'))
        activity_features = (
                            imf.extract_features(activities)
                            .set_index([[contributor]])
                            )
        # features = pd.concat([features,activity_features])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            probability = bimbas.predict_proba(activity_features)[0][1]
        if(probability <= 0.5):
            contributor_type = 'Human'
        else:
            contributor_type = 'Bot'
        confidence = (abs(probability - 0.5)*2).round(3)
    
        pred = activity_features.set_index([[contributor]]).assign(
            prediction=contributor_type,
            confidence = confidence,
            )
        temp_list.extend([{'contributor':contributor,'bimbas_type':contributor_type,'confidence':confidence}])
    
    bimbas_prediction = pd.DataFrame.from_dict(temp_list)

    return(bimbas_prediction)

In [16]:
# Write your code here to call execute_bimbas function. 
# Pass the contributors list and the filtered events DataFrame as argument to "execute_bimbas" function
# Prediction returned by BIMBAS will have "contributor", "bimbas_type" and "confidence"
# Note: Print the prediction provided by BIMBAS - else this cell will not be graded.

# YOUR CODE BELOW
bimbas_predictions = execute_bimbas(contributors, selected_events)
bimbas_predictions

100%|██████████| 40/40 [00:06<00:00,  5.72it/s]


Unnamed: 0,contributor,bimbas_type,confidence
0,Armavica,Human,0.958
1,EmilyXinyi,Human,0.817
2,GPUtester,Bot,0.841
3,JuliaTagBot,Bot,0.345
4,MarcBerliner,Human,0.85
5,SaimMomin12,Human,0.771
6,ViralBShah,Human,0.903
7,allcontributors[bot],Bot,0.567
8,alxbnct,Bot,0.807
9,avik-pal,Human,0.648


### Create a column in your predictions DataFrame to have predictions provided by BIMBAS. 
### Now your DataFrame should have the following columns - contributor, bothunter_type, app, bimbas_type, confidence 

### Merging bothunter and bimbas predictions

In [17]:
# Merge the predictions provided by BIMBAS to the resultant dataset of Section 3. This final DataFrame should have the following columns - 
# contributor, bothunter_type, user_type, bimbas_type, and confidence.
# YOUR CODE BELOW
df = pd.merge(df, bimbas_predictions, on='contributor')
df

Unnamed: 0,contributor,bothunter_type,user_type,bimbas_type,confidence
0,Armavica,Human,False,Human,0.958
1,EmilyXinyi,Human,False,Human,0.817
2,GPUtester,Human,False,Bot,0.841
3,JuliaTagBot,Bot,False,Bot,0.345
4,MarcBerliner,Human,False,Human,0.85
5,SaimMomin12,Human,False,Human,0.771
6,ViralBShah,Human,False,Human,0.903
7,allcontributors[bot],Bot,True,Bot,0.567
8,alxbnct,Human,False,Bot,0.807
9,avik-pal,Human,False,Human,0.648


### Question 2
#### Using Cohen's Kappa compute and report the interrater agreement score between the labels computed by both the bot identification approaches. Mention your interpretation of Cohen's Kappa

In [18]:
# Hint: use the cohen_kappa_score library
# YOUR CODE BELOW
cohen_kappa_score(df['bothunter_type'], df['bimbas_type'])

np.float64(0.46380697050938335)

0.46: Moderate agreement

### Question 3
#### Determine the final type of each contributor. Whenever user_type column has the value "User" check if both bimbas_type and bothunter_type give the same prediction, then consider it as your final prediction in "acc_type" column.
#### For the contributors that have different predictions, i.e. bimbas_type is not same as bothunter_type, then make a manual verification and consider that as the type for that contributor. Add an extra column to the DataFrame ﬁle called 'manual' that has the label determined by you. 
#### For manual verification you can make use the GitHub UI and the GitHub API https://api.github.com/users/username

#### Whenever user_type column has the value "Bot", directly write your final acc_type as "Bot Actor"

#### More information:
For each contributor, you can look at their activities in GitHub UI, look at their latest events using GitHub Events API - https://api.github.com/users/<contributor>/events, to make a decision on their type.

In addition, you can also write a very small reason why do you think they are bot or human (e.g., same activity at regular interval so Bot, files committed and code modified looks like Human, comments look like Human, comment look like Bot, test report looks automated so Bot, and so on....). This will be useful to answer your next question.

In [22]:
# Write the code for your manual decision here
# example : 
# manual_list_dict = [{'contributor': '<contributor name>', 'manual': '<your prediction>'}, {....}, {....}, {....}]
# pd.DataFrame.from_dict(manual_list_dict)

# YOUR CODE BELOW
manual_list_dict = []
for index, row in df.iterrows():
    if row['user_type']:
        manual_list_dict.append({'contributor': row['contributor'], 'manual': 'Bot Actor'})
    elif row['bothunter_type'] == row['bimbas_type']:
        manual_list_dict.append({'contributor': row['contributor'], 'manual': row['bothunter_type']})
    else:
        manual_list_dict.append({'contributor': row['contributor'], 'manual': 'verification'})
manual_df = pd.DataFrame.from_dict(manual_list_dict)
#show all verification
manual_df[manual_df['manual']=='verification']


Unnamed: 0,contributor,manual
2,GPUtester,verification
8,alxbnct,verification
10,bioc-workshop-dev,verification
13,cproctor,verification
21,kanishkan91,verification
31,rwegener2,verification
34,stan-buildbot,verification
35,tardis-bot,verification
39,zeptodoctor,verification


In [28]:
#appel Api pour verifier les utilisateurs
def check_user_type(contributor):
    query = f'{QUERY_ROOT}/users/{contributor}'
    response = requests.get(query, headers=HEADERS)
    json_response = response.json()
    return json_response

verification_list = manual_df[manual_df['manual']=='verification']['contributor'].tolist()
verification_list


['GPUtester',
 'alxbnct',
 'bioc-workshop-dev',
 'cproctor',
 'kanishkan91',
 'rwegener2',
 'stan-buildbot',
 'tardis-bot',
 'zeptodoctor']

In [32]:
for contributor in verification_list:
	user_type = check_user_type(contributor)
	print(contributor, user_type)

GPUtester {'login': 'GPUtester', 'id': 38199262, 'node_id': 'MDQ6VXNlcjM4MTk5MjYy', 'avatar_url': 'https://avatars.githubusercontent.com/u/38199262?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/GPUtester', 'html_url': 'https://github.com/GPUtester', 'followers_url': 'https://api.github.com/users/GPUtester/followers', 'following_url': 'https://api.github.com/users/GPUtester/following{/other_user}', 'gists_url': 'https://api.github.com/users/GPUtester/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/GPUtester/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/GPUtester/subscriptions', 'organizations_url': 'https://api.github.com/users/GPUtester/orgs', 'repos_url': 'https://api.github.com/users/GPUtester/repos', 'events_url': 'https://api.github.com/users/GPUtester/events{/privacy}', 'received_events_url': 'https://api.github.com/users/GPUtester/received_events', 'type': 'User', 'user_view_type': 'public', 'site_admin': False, 'name':

GPUtester is a bot because it has a lot of activity in a short period of time and the activity is repetitive

alxbnct isn't a bot because he has his name and surname on his profile and he also has public repositories about exercises

bioc-workshop-dev is a bot because all his action begins with "DO NOT INCLUDE REQUESTS IN THE FIRST COMMENT.
PLEASE POST THIS TEMPLATE UNCHANGED THEN FOLLOW ITS INSTRUCTIONS IN A NEW
COMMENT"

cproctor isn't a bot because in his profil bio, we have "Assistant Professor of Learning Sciences @ University at Buffalo. Studying critical computational literacies and K-12 CS education."

kanishkan91 isn't a bot because in his profil bio, we have "Computational Scientist. Quantitative analyst, statistical modeler and web app developer. Python, R, D3 user."

rwegener2 isn't a bot because she has her name and surname on her profile and she also put her face on her profile picture

stan-buildbot is a bot it's in its name

tardis-bot is a bot it's in its name

zeptodoctor is a bot because there isn't any activity in his profile

In [35]:
manual_df.loc[manual_df['contributor']=='GPUtester','manual'] = 'Bot'
manual_df.loc[manual_df['contributor']=='alxbnct','manual'] = 'Human'
manual_df.loc[manual_df['contributor']=='bioc-workshop-dev','manual'] = 'Bot'
manual_df.loc[manual_df['contributor']=='cproctor','manual'] = 'Human'
manual_df.loc[manual_df['contributor']=='kanishkan91','manual'] = 'Human'
manual_df.loc[manual_df['contributor']=='rwegener2','manual'] = 'Human'
manual_df.loc[manual_df['contributor']=='stan-buildbot','manual'] = 'Bot'
manual_df.loc[manual_df['contributor']=='tardis-bot','manual'] = 'Bot'
manual_df.loc[manual_df['contributor']=='zeptodoctor','manual'] = 'Bot'

In [36]:
manual_df

Unnamed: 0,contributor,manual
0,Armavica,Human
1,EmilyXinyi,Human
2,GPUtester,Bot
3,JuliaTagBot,Bot
4,MarcBerliner,Human
5,SaimMomin12,Human
6,ViralBShah,Human
7,allcontributors[bot],Bot Actor
8,alxbnct,Human
9,avik-pal,Human


In [37]:
# Write your code below here to determine the final type of contributor.
# add column named 'acc_type' and write your final decision in it. Final decision is majority of three types - bimbas_type, bothunter_type, manual.
# YOUR CODE BELOW

df = pd.merge(df, manual_df, on='contributor')
df['acc_type'] = df[['bothunter_type', 'bimbas_type', 'manual']].mode(axis=1)[0]
df


Unnamed: 0,contributor,bothunter_type,user_type,bimbas_type,confidence,manual,acc_type
0,Armavica,Human,False,Human,0.958,Human,Human
1,EmilyXinyi,Human,False,Human,0.817,Human,Human
2,GPUtester,Human,False,Bot,0.841,Bot,Bot
3,JuliaTagBot,Bot,False,Bot,0.345,Bot,Bot
4,MarcBerliner,Human,False,Human,0.85,Human,Human
5,SaimMomin12,Human,False,Human,0.771,Human,Human
6,ViralBShah,Human,False,Human,0.903,Human,Human
7,allcontributors[bot],Bot,True,Bot,0.567,Bot Actor,Bot
8,alxbnct,Human,False,Bot,0.807,Human,Human
9,avik-pal,Human,False,Human,0.648,Human,Human


### Question 4
#### Study and report the purpose of these identified bots and humans in the repository assigned to you.
#### For example, the purpose can be based on but not limited to - 
##### 1) the type of activities that they are performing (releasing a version on every Sunday, updating the documentation), 
##### 2) type of comments they are posting (reviewing code, test summary report), 
##### 3) when are they triggered (when a new PR is created, when someone in the project ask the bot to merge the code) and so on.  
#### Note: 
##### The purpose can be other than what is provided in examples above.

In [38]:
identified_list = df['contributor'].tolist()
for contributor in identified_list:
	user_type = check_user_type(contributor)
	print(contributor, user_type)

Armavica {'login': 'Armavica', 'id': 5855503, 'node_id': 'MDQ6VXNlcjU4NTU1MDM=', 'avatar_url': 'https://avatars.githubusercontent.com/u/5855503?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/Armavica', 'html_url': 'https://github.com/Armavica', 'followers_url': 'https://api.github.com/users/Armavica/followers', 'following_url': 'https://api.github.com/users/Armavica/following{/other_user}', 'gists_url': 'https://api.github.com/users/Armavica/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/Armavica/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/Armavica/subscriptions', 'organizations_url': 'https://api.github.com/users/Armavica/orgs', 'repos_url': 'https://api.github.com/users/Armavica/repos', 'events_url': 'https://api.github.com/users/Armavica/events{/privacy}', 'received_events_url': 'https://api.github.com/users/Armavica/received_events', 'type': 'User', 'user_view_type': 'public', 'site_admin': False, 'name': 'Virgile Andre

## Section 5

### Question 1
#### Use the filtered events file that has the events performed by the contributors provided to you
#### Group each event into the following four categories

Issues: IssueCommentEvent, IssuesEvent  
Pull Requests: PullRequestEvent, PullRequestReviewCommentEvent  
Commits: CommitCommentEvent, PushEvent  
Repository: CreateEvent, DeleteEvent, ForkEvent, GollumEvent, MemberEvent, PublicEvent, ReleaseEvent, SponsorshipEvent, WatchEvent  

#### Hint:
1. Add a column called event_group that mentions which event group does that event belong to. Each event (row) should correspond to an event group.  
2. Then perform groupby on ['login','event_group'],  
3. use .agg (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html) to count the number of events performed by each contributor in each group,  
4. use pivot with the required arguments (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html). An example is given [here](#pivot_example) in question 2(b), and
5. Reset index and rename axis with None
7. Finally merge it with your 'acc_type' field from the DataFrame you created in Section4, drop contributor, and fillna with 0
8. The final DataFrame should have the following columns - login, PR, commit, issue, repo, acc_type

The final DataFrame should be looking like the following  
![](event_group.png)

In [18]:
# In the resultant DataFrame - each row should corespond to a contributor, and the columns should have all the event groups  
# and the type of the contributor (that you decided in the previous DataFrame) and the values should be the number 
# of events of that event event group the contriubutor has performed.

issue_group = ['IssueCommentEvent', 'IssuesEvent']
pr_group = ['PullRequestEvent', 'PullRequestReviewCommentEvent']
commit_group = ['CommitCommentEvent', 'PushEvent']
repo_group = ['CreateEvent', 'DeleteEvent', 'ForkEvent', 'GollumEvent', 'MemberEvent', 'PublicEvent', 'ReleaseEvent', 'SponsorshipEvent', 'WatchEvent']

In [19]:
# YOUR CODE BELOW


In [20]:
# YOUR CODE BELOW


### Question 2 (a)

#### Compute the median number of events per event group for Bot+Bot actors and Humans and write in DataFrame.

Row should correspond to type (Bot_BotActor and Human), Column should have Event group name and the values should be the median value of Bot_BotActor or Human for that particular event group. An example is given below

In [15]:
# For example:
medians = [{'event_group': 'event_group1', 'median': 'val1', 'acc_type': 'Bot_app'}, 
           {'event_group': 'event_group1', 'median': 'val2', 'acc_type': 'Human'},
           {'event_group': 'event_group2', 'median': 'val3', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group2', 'median': 'val4', 'acc_type': 'Human'},
           {'event_group': 'event_group3', 'median': 'val5', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group3', 'median': 'val6', 'acc_type': 'Human'},
           {'event_group': 'event_group4', 'median': 'val7', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group5', 'median': 'val8', 'acc_type': 'Human'}]
df_medians = pd.DataFrame.from_dict(medians)
df_medians

Unnamed: 0,event_group,median,acc_type
0,event_group1,val1,Bot_app
1,event_group1,val2,Human
2,event_group2,val3,Bot_app
3,event_group2,val4,Human
4,event_group3,val5,Bot_app
5,event_group3,val6,Human
6,event_group4,val7,Bot_app
7,event_group5,val8,Human


In [22]:
# YOUR CODE BELOW

### Question 2 (b)

Plot a heatmap of the DataFrame using seaborn - 
1. First convert the dataframe to the required format using pivot, example is given below
2. plot using seaborn - sns.heatmap(df_medians, annot=True, vmin=0, vmax=300, cmap="crest"). More details: https://seaborn.pydata.org/generated/seaborn.heatmap.html)

#### pd.pivot example:
<a id='pivot_example'></a>

In [17]:
# Main DataFrame
medians = [{'event_group': 'event_group1', 'median': 'val1', 'acc_type': 'Bot'}, 
           {'event_group': 'event_group1', 'median': 'val2', 'acc_type': 'Human'},
           {'event_group': 'event_group2', 'median': 'val3', 'acc_type': 'Bot'},
           {'event_group': 'event_group2', 'median': 'val4', 'acc_type': 'Human'},
           {'event_group': 'event_group3', 'median': 'val5', 'acc_type': 'Bot'},
           {'event_group': 'event_group3', 'median': 'val6', 'acc_type': 'Human'},
           {'event_group': 'event_group4', 'median': 'val7', 'acc_type': 'Bot'},
           {'event_group': 'event_group5', 'median': 'val8', 'acc_type': 'Human'}]
df_medians = pd.DataFrame.from_dict(medians)
df_medians

Unnamed: 0,event_group,median,acc_type
0,event_group1,val1,Bot
1,event_group1,val2,Human
2,event_group2,val3,Bot
3,event_group2,val4,Human
4,event_group3,val5,Bot
5,event_group3,val6,Human
6,event_group4,val7,Bot
7,event_group5,val8,Human


In [18]:
# pivot the main DataFrame
df_medians.pivot(index='acc_type', columns='event_group', values='median')#.reset_index().rename_axis(None,axis=1)

event_group,event_group1,event_group2,event_group3,event_group4,event_group5
acc_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bot,val1,val3,val5,val7,
Human,val2,val4,val6,,val8


In [25]:
# YOUR CODE BELOW

In [26]:
# YOUR CODE BELOW


#### What is the difference that you observe between Bots+Bot actors and Humans?

#### What is the differnce that you observe between Event groups?

#### What is the difference that you observe between Bots+Bot actors and Humans and Event groups all considering at the same time?

### Question 3

#### Create boxen plots to visualise the distribution of number of events in each event group. 
#### For more information you can visit - https://seaborn.pydata.org/generated/seaborn.boxenplot.html#seaborn.boxenplot 
#### You should highlight the data points that correspond to bots using a stripplot in seaborn. https://seaborn.pydata.org/tutorial/categorical.html#categorical-tutorial  
#### Interpret the results of the visualisation.

In [27]:
# YOUR CODE BELOW - Visualize number of events from pull request event group


In [28]:
# YOUR CODE BELOW - Visualise number of events from issue event group


In [29]:
# YOUR CODE BELOW - Visualise number of events from commit event group


In [30]:
# YOUR CODE BELOW - Visualise number of events from repo event group


### Question 4.1

#### Statistical identify whether number of events in each event group is normally distributed or not.
#### Null hypothesis - $H_0$: Sample comes from the data that has normal distribution.
#### Use Shapiro-Wilk test for this purpose. Use the p-value with a threshold of 0.05 to determine whether $H_0$ can be rejected with statistical significance or not. 

#### Use shapiro from scipy.stats to perform this test (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html)

In [None]:
# YOUR CODE BELOW

### Question 4.2
#### Determine if there is any statistical diﬀerence in number of events between the identified event groups. Perform this test on all pairs of event groups. 
#### Null hypothesis - $H_0$: Any two event group come from the same population.
#### If $H_0$ is rejected in Shapiro-Wilk test (at least for one of the two event groups considered for test), use the Mann-Whitney U statistical test for this purpose. If $H_0$ is not rejected in Shapiro-Wilk test, use the independent t-test for this purpose.  
#### In any case, use the p-value with a threshold of 0.01 to determine whether $H_0$ can be rejected with statistical significance. 

#### Use mannwhitneyu from scipy.stats to perform Mann-Whitney U test (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html) 
#### or 
#### ttest_ind from scipy.stats to perform independent t-test https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html.

#### More information:

You can pass the following arguments to mannwhitneyu from scipy.stats - (method='exact', nan_policy='omit'). For ttest_ind you can use - (nan_policy='omit')

In [31]:
# YOUR CODE BELOW

### Question 4.3
#### Each time you reject the null hypothesis $H_0$, quantify the effect size of the diﬀerence between the groups using cliﬀ’s delta ($\delta$). 
#### To calculate cliﬀ’s delta, you can pass the list of values to cliﬀ delta.py file given in the repository. E.g., cliffsDelta.cliffsDelta(list of values, list of values). This will return the effect size.
#### Refer to the table given in the TP document and mention your interpretation (negligible, small, medium, large).

In [None]:
# YOU CODE BELOW