In [299]:
import requests
import joblib
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from pandas import option_context
from tqdm import tqdm
from statsmodels.stats.inter_rater import aggregate_raters
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mannwhitneyu,shapiro,ttest_ind

import cliffsDelta as cd
import GenerateActivities as gat
import important_features as imf

In [None]:
QUERY_ROOT = "https://api.github.com"
TOKEN = '' # write your GitHub API key here
HEADERS={'Authorization':'token '+TOKEN}

## Section 3

### Question 1

#### Write an automated Python script to identify all GitHub bot actors in the list of contributors, by querying the GitHub REST API users endpoint and extracting the required information to make this decision. 

#### More information: You can search for the "type" key in the obtianed JSON result, If it is "Bot", then it is a bot actor, if it is "User", then it is an account.

In [301]:
# 1. Read the given <sample>.csv file into a pandas DataFrame (df).
# 2. Make sure there are 2 columns, one column name is "contributor" that has the name of the contributor, 
#    and another column name is "bothunter_type" that you will use later on in this assignment.
# 3. Display the DataFrame.

# YOUR CODE BELOW
df = pd.read_csv('Contributors/sample11.csv')
if len(df.columns) != 2:
    raise ValueError("There should be 2 columns")
df


Unnamed: 0,contributor,bothunter_type
0,Azaya89,Human
1,DilumAluthgeBot,Human
2,ForgottenProgramme,Human
3,JuliaRegistrator,Bot
4,MarcoGorelli,Human
5,Sch-Da,Human
6,Wesley-yang,Human
7,amontoison,Human
8,ax3l,Human
9,bioc-issue-bot,Bot


In [302]:
# 1. Store the names of the contributors in a list
contributors = []
for c in df["contributor"]:
    contributors.append(c)
# 2. print the list
# YOUR CODE BELOW
print(contributors)

['Azaya89', 'DilumAluthgeBot', 'ForgottenProgramme', 'JuliaRegistrator', 'MarcoGorelli', 'Sch-Da', 'Wesley-yang', 'amontoison', 'ax3l', 'bioc-issue-bot', 'brendan-ward', 'chainer-ci', 'codspeed-hq[bot]', 'commit-0', 'coveralls', 'csoneson', 'drvinceknight', 'fonnesbeck', 'galaxyproject-sentryintegration[bot]', 'github-actions[bot]', 'github-advanced-security[bot]', 'hendrikmakait', 'jakirkham', 'jondo', 'kbevers', 'lithomas1', 'martindurant', 'miriamkw', 'nedtaylor', 'owenlittlejohns', 'pr0m1th3as', 'pre-commit-ci[bot]', 'renovate[bot]', 'review-notebook-app[bot]', 'saldanhad', 'spholmes', 'sympy-bot', 'tomdonaldson', 'weblate', 'wsmoses']


In [303]:
app_dict = []

for contributor in contributors:
    # 1. Using a for loop, iterate over each contributor in the list and query the GitHub Users API.
    #    You can use "query = f'{QUERY_ROOT}/users/{contributor}'", where QUERY_ROOT is defined at the beginning of this notebook 
    #    and 'contributor' is each individual contributor from the list
    query = f'{QUERY_ROOT}/users/{contributor}'
    # 2. Get the response using 'response = requests.get(query, headers=HEADERS)'.
    response = requests.get(query, headers=HEADERS)
    # 3. convert the response to JSON using 'json_response = response.json()'.
    json_response = response.json()
    # 4. Iterate over each JSON response and get the value of the 'type' key. If it is "Bot" then the contributor is a bot actor, 
    #    if "User" then the contributor is an account. You should boolean values to indicate if the contributor is a bot actor (True) or User/Organisation (False)
    is_bot = json_response["type"] == "Bot"
    # 5. Save these results in list of dictionary of the form [{'contributor': <contributor name>, 'app': <boolean value>}, {...}, {...}, {...}].
    #    Lets call this list as "app_dict"
    app_dict.append({'contributor': contributor, "app": is_bot, "user_type": json_response["type"]})

# 6. Finally convert this list of dictionary to DataFrame by writing 'pd.DataFrame.from_dict(app_dict)'
pd_app_dict = pd.DataFrame.from_dict(app_dict)
# 7. Display the DataFrame. This should have two columns - contributor and app
pd_app_dict

Unnamed: 0,contributor,app,user_type
0,Azaya89,False,User
1,DilumAluthgeBot,False,User
2,ForgottenProgramme,False,User
3,JuliaRegistrator,False,User
4,MarcoGorelli,False,User
5,Sch-Da,False,User
6,Wesley-yang,False,User
7,amontoison,False,User
8,ax3l,False,User
9,bioc-issue-bot,False,User


### Question 2

#### Add a new 'actor' column in the CSV file to store this information. You can write True if it is a bot actor and False if it is not.

#### Report on the total number of GitHub bot actors, and User accounts present in the list of accounts of your dataset.

In [304]:
# Merge the app DataFrame to df by writing 'pd.merge(df, <app df>, on='contributor'). This is similar to SQL join on primary key 'contributor'.
# The resultant df should have 3 columns - contributor, bothunter_type and user_type.
contributors_pd = pd.merge(df, pd_app_dict, on='contributor')
# YOUR CODE BELOW
bots = contributors_pd.loc[contributors_pd["app"] == True]
print("Nb of bots: " + str(len(bots)))
contributors_pd = contributors_pd.drop(columns="app")
contributors_pd

Nb of bots: 7


Unnamed: 0,contributor,bothunter_type,user_type
0,Azaya89,Human,User
1,DilumAluthgeBot,Human,User
2,ForgottenProgramme,Human,User
3,JuliaRegistrator,Bot,User
4,MarcoGorelli,Human,User
5,Sch-Da,Human,User
6,Wesley-yang,Human,User
7,amontoison,Human,User
8,ax3l,Human,User
9,bioc-issue-bot,Bot,User


### Question 3

#### For each bot actor, write their purpose, the task that they automate by looking at their GitHub profile, homepage, information on the GitHub Marketplace, or other documentation that you can find online.


* codspeed-hq[bot] :
    - CodSpeed integrates into dev and CI workflows to measure performance, detect regressions, and enable actionable optimizations.
* galaxyproject-sentryintegration[bot] :
    - Command-line utilities to assist in developing Galaxy and Common Workflow Language artifacts - including tools, workflows, and training materials.
* github-actions[bot] :
    - Automate, customize, and execute your software development workflows right in your repository with GitHub Actions.
* github-advanced-security[bot] :
    - GitHub makes extra security features available to customers under an Advanced Security license. These features are also enabled for public repositories.
* pre-commit-ci[bot] :
    - Configure AWS credential environment variables for use in other GitHub Actions. 
* renovate[bot] :
    - Renovate is an automated dependency update tool. It helps to update dependencies in your code without needing to do it manually.
* review-notebook-app[bot] :
    - ReviewNB provides a quick way to view notebook diffs so reviewers can focus on the changes that matter.

## Section 4

### Question 1

#### Write an automated script to use BIMBAS (Bot Identification Model Based on Activity Sequences) to obtain a prediction of the GitHub contributor type (Bot or Human) for the contributors present in your dataset.

#### Folow the steps given in each cell to use BIMBAS and obtain predictions.

### Read all the events and select the events performed by the contributors present in the given set

In [305]:
# 1. Read the csv file of events (all_events.csv) provided along with this notebook
events = pd.read_csv('all_events.csv')
# 2. Convert created_at column to datetime format 
#    One possible way is to use lambda function: "events.assign(created_at=lambda d: pd.to_datetime(d.created_at, unit='ms'))"
# 3. Get the contributors provided to you in a list or use the contributors list that you created in Section 3 Question 1. 
events = events.assign(created_at=lambda d: pd.to_datetime(d.created_at, unit='ms'))
#    e.g., ['contributor1', 'contributor2', 'contributor3',...]
print(contributors)
# 4. Select the events performed by these contributors alone. Do not consider all the events. You should consider only the events
#    performed by the contributors provided to you. Hint: you can use the df.query function
events = events.loc[events["login"].isin(contributors)]
events = events.assign(created_at=lambda d: pd.to_datetime(d.created_at, unit='ms'))
# 5. Display the considered events.
events["event_type"]


['Azaya89', 'DilumAluthgeBot', 'ForgottenProgramme', 'JuliaRegistrator', 'MarcoGorelli', 'Sch-Da', 'Wesley-yang', 'amontoison', 'ax3l', 'bioc-issue-bot', 'brendan-ward', 'chainer-ci', 'codspeed-hq[bot]', 'commit-0', 'coveralls', 'csoneson', 'drvinceknight', 'fonnesbeck', 'galaxyproject-sentryintegration[bot]', 'github-actions[bot]', 'github-advanced-security[bot]', 'hendrikmakait', 'jakirkham', 'jondo', 'kbevers', 'lithomas1', 'martindurant', 'miriamkw', 'nedtaylor', 'owenlittlejohns', 'pr0m1th3as', 'pre-commit-ci[bot]', 'renovate[bot]', 'review-notebook-app[bot]', 'saldanhad', 'spholmes', 'sympy-bot', 'tomdonaldson', 'weblate', 'wsmoses']


6         CommitCommentEvent
15        CommitCommentEvent
16                 PushEvent
21               CreateEvent
30         IssueCommentEvent
                 ...        
358391             PushEvent
358397     IssueCommentEvent
358401             PushEvent
358403             PushEvent
358428             PushEvent
Name: event_type, Length: 96414, dtype: object

### Execute BIMBAS to obtain the predictions

In [306]:
# Do not modify this cell
# Use the following function the code below to obtain your predictions

def execute_bimbas(contributors_list, selected_events):
    '''
    args: contributors_list (list) - list of contributors in the dataset provided to you
          selected_events (DataFrame) - DataFrame of events performed by the considered contributors
    return: bimbas_prediction (DataFrame) - DataFrame of preditions along with the confidence in prediction for each contributor
    '''
    result=pd.DataFrame()
    temp_list = []
    bimbas = joblib.load('bimbas.joblib')
    date_limit = pd.to_datetime(selected_events.created_at.max()) + pd.DateOffset(-90)
    
    for contributor in tqdm(contributors_list):
        # c = contributors[contributor]
        activities = gat.activity_identification(selected_events.query('login==@contributor and created_at>=@date_limit'))
        activity_features = (
                            imf.extract_features(activities)
                            .set_index([[contributor]])
                            )
        # features = pd.concat([features,activity_features])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            probability = bimbas.predict_proba(activity_features)[0][1]
        if(probability <= 0.5):
            contributor_type = 'Human'
        else:
            contributor_type = 'Bot'
        confidence = (abs(probability - 0.5)*2).round(3)
    
        pred = activity_features.set_index([[contributor]]).assign(
            prediction=contributor_type,
            confidence = confidence,
            )
        temp_list.extend([{'contributor':contributor,'bimbas_type':contributor_type,'confidence':confidence}])
    
    bimbas_prediction = pd.DataFrame.from_dict(temp_list)

    return(bimbas_prediction)

In [307]:
# Write your code here to call execute_bimbas function. 
# Pass the contributors list and the filtered events DataFrame as argument to "execute_bimbas" function
# Prediction returned by BIMBAS will have "contributor", "bimbas_type" and "confidence"
# Note: Print the prediction provided by BIMBAS - else this cell will not be graded.
pred = execute_bimbas(contributors, events)
pred

100%|██████████| 40/40 [00:08<00:00,  4.79it/s]


Unnamed: 0,contributor,bimbas_type,confidence
0,Azaya89,Human,0.88
1,DilumAluthgeBot,Bot,0.894
2,ForgottenProgramme,Human,0.924
3,JuliaRegistrator,Bot,0.886
4,MarcoGorelli,Human,0.945
5,Sch-Da,Human,0.872
6,Wesley-yang,Bot,0.823
7,amontoison,Human,0.9
8,ax3l,Human,0.915
9,bioc-issue-bot,Bot,0.548


### Create a column in your predictions DataFrame to have predictions provided by BIMBAS. 
### Now your DataFrame should have the following columns - contributor, bothunter_type, app, bimbas_type, confidence 

### Merging bothunter and bimbas predictions

In [308]:
# Merge the predictions provided by BIMBAS to the resultant dataset of Section 3. This final DataFrame should have the following columns - 
# contributor, bothunter_type, user_type, bimbas_type, and confidence.
contributors_pd = pd.merge(contributors_pd, pred, on='contributor')
contributors_pd

Unnamed: 0,contributor,bothunter_type,user_type,bimbas_type,confidence
0,Azaya89,Human,User,Human,0.88
1,DilumAluthgeBot,Human,User,Bot,0.894
2,ForgottenProgramme,Human,User,Human,0.924
3,JuliaRegistrator,Bot,User,Bot,0.886
4,MarcoGorelli,Human,User,Human,0.945
5,Sch-Da,Human,User,Human,0.872
6,Wesley-yang,Human,User,Bot,0.823
7,amontoison,Human,User,Human,0.9
8,ax3l,Human,User,Human,0.915
9,bioc-issue-bot,Bot,User,Bot,0.548


### Question 2
#### Using Cohen's Kappa compute and report the interrater agreement score between the labels computed by both the bot identification approaches. Mention your interpretation of Cohen's Kappa

In [309]:
# Hint: use the cohen_kappa_score library

md = cohen_kappa_score(contributors_pd["bothunter_type"], contributors_pd["bimbas_type"])
print("Interrater Agreement score: " + str(md))
# YOUR CODE BELOW

Interrater Agreement score: 0.6946564885496183


### Question 3
#### Determine the final type of each contributor. Whenever user_type column has the value "User" check if both bimbas_type and bothunter_type give the same prediction, then consider it as your final prediction in "acc_type" column.
#### For the contributors that have different predictions, i.e. bimbas_type is not same as bothunter_type, then make a manual verification and consider that as the type for that contributor. Add an extra column to the DataFrame ﬁle called 'manual' that has the label determined by you. 
#### For manual verification you can make use the GitHub UI and the GitHub API https://api.github.com/users/username

#### Whenever user_type column has the value "Bot", directly write your final acc_type as "Bot Actor"

#### More information:
For each contributor, you can look at their activities in GitHub UI, look at their latest events using GitHub Events API - https://api.github.com/users/<contributor>/events, to make a decision on their type.

In addition, you can also write a very small reason why do you think they are bot or human (e.g., same activity at regular interval so Bot, files committed and code modified looks like Human, comments look like Human, comment look like Bot, test report looks automated so Bot, and so on....). This will be useful to answer your next question.

In [310]:
# Write the code for your manual decision here
# example : 
# manual_list_dict = [{'contributor': '<contributor name>', 'manual': '<your prediction>'}, {....}, {....}, {....}]
# pd.DataFrame.from_dict(manual_list_dict)

# YOUR CODE BELOW

agree = contributors_pd.query("user_type == 'User' and bothunter_type == bimbas_type")
bot_sure = contributors_pd.query("user_type == 'Bot'")

man_to_review = contributors_pd.query("user_type == 'User' and bothunter_type != bimbas_type")

man_to_review

manual_list_dict = []
def add_user(contributor, prediction):
    manual_list_dict.append({"contributor": contributor, "manual": prediction})


add_user("DilumAluthgeBot", "Bot") # Uses bot emoji in front of each commit + has bots in this name
add_user("Wesley-yang", "Bot") # Has 1.7k repositories
add_user("drvinceknight", "Human") # Has a bio and pfp showing his face
add_user("miriamkw", "Human") # Has a pfp with a picture of herself and as a human behavior on her repositories
add_user("spholmes", "Human") # Has a description of his job, where he works

man_to_review = pd.merge(man_to_review, pd.DataFrame.from_dict(manual_list_dict), on='contributor')

agree

Unnamed: 0,contributor,bothunter_type,user_type,bimbas_type,confidence
0,Azaya89,Human,User,Human,0.88
2,ForgottenProgramme,Human,User,Human,0.924
3,JuliaRegistrator,Bot,User,Bot,0.886
4,MarcoGorelli,Human,User,Human,0.945
5,Sch-Da,Human,User,Human,0.872
7,amontoison,Human,User,Human,0.9
8,ax3l,Human,User,Human,0.915
9,bioc-issue-bot,Bot,User,Bot,0.548
10,brendan-ward,Human,User,Human,0.897
11,chainer-ci,Bot,User,Bot,0.723


In [311]:
# Write your code below here to determine the final type of contributor.
# add column named 'acc_type' and write your final decision in it. Final decision is majority of three types - bimbas_type, bothunter_type, manual.
# Rename column
man_to_review = man_to_review.rename(columns={"manual": "acc_type"})

agree = agree.assign(acc_type=lambda x : x["bothunter_type"] == "Human")
agree["acc_type"] = np.where(agree["acc_type"] == True, "Human", "Both")
agree

bot_sure = bot_sure.assign(acc_type=lambda x : "Bot")
bot_sure

contributors_pd = pd.concat([agree, bot_sure, man_to_review])
contributors_pd

Unnamed: 0,contributor,bothunter_type,user_type,bimbas_type,confidence,acc_type
0,Azaya89,Human,User,Human,0.88,Human
2,ForgottenProgramme,Human,User,Human,0.924,Human
3,JuliaRegistrator,Bot,User,Bot,0.886,Both
4,MarcoGorelli,Human,User,Human,0.945,Human
5,Sch-Da,Human,User,Human,0.872,Human
7,amontoison,Human,User,Human,0.9,Human
8,ax3l,Human,User,Human,0.915,Human
9,bioc-issue-bot,Bot,User,Bot,0.548,Both
10,brendan-ward,Human,User,Human,0.897,Human
11,chainer-ci,Bot,User,Bot,0.723,Both


### Question 4
#### Study and report the purpose of these identified bots and humans in the repository assigned to you.
#### For example, the purpose can be based on but not limited to - 
##### 1) the type of activities that they are performing (releasing a version on every Sunday, updating the documentation), 
##### 2) type of comments they are posting (reviewing code, test summary report), 
##### 3) when are they triggered (when a new PR is created, when someone in the project ask the bot to merge the code) and so on.  
#### Note: 
##### The purpose can be other than what is provided in examples above.

## Section 5

### Question 1
#### Use the filtered events file that has the events performed by the contributors provided to you
#### Group each event into the following four categories

Issues: IssueCommentEvent, IssuesEvent  
Pull Requests: PullRequestEvent, PullRequestReviewCommentEvent  
Commits: CommitCommentEvent, PushEvent  
Repository: CreateEvent, DeleteEvent, ForkEvent, GollumEvent, MemberEvent, PublicEvent, ReleaseEvent, SponsorshipEvent, WatchEvent  

#### Hint:
1. Add a column called event_group that mentions which event group does that event belong to. Each event (row) should correspond to an event group.  
2. Then perform groupby on ['login','event_group'],  
3. use .agg (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html) to count the number of events performed by each contributor in each group,  
4. use pivot with the required arguments (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html). An example is given [here](#pivot_example) in question 2(b), and
5. Reset index and rename axis with None
7. Finally merge it with your 'acc_type' field from the DataFrame you created in Section4, drop contributor, and fillna with 0
8. The final DataFrame should have the following columns - login, PR, commit, issue, repo, acc_type

The final DataFrame should be looking like the following  
![](event_group.png)

In [None]:
# In the resultant DataFrame - each row should corespond to a contributor, and the columns should have all the event groups  
# and the type of the contributor (that you decided in the previous DataFrame) and the values should be the number 
# of events of that event event group the contriubutor has performed.

issue_group = ['IssueCommentEvent', 'IssuesEvent']
pr_group = ['PullRequestEvent', 'PullRequestReviewCommentEvent']
commit_group = ['CommitCommentEvent', 'PushEvent']
repo_group = ['CreateEvent', 'DeleteEvent', 'ForkEvent', 'GollumEvent', 'MemberEvent', 'PublicEvent', 'ReleaseEvent', 'SponsorshipEvent', 'WatchEvent']

In [None]:
# YOUR CODE BELOW


In [None]:
# YOUR CODE BELOW


### Question 2 (a)

#### Compute the median number of events per event group for Bot+Bot actors and Humans and write in DataFrame.

Row should correspond to type (Bot_BotActor and Human), Column should have Event group name and the values should be the median value of Bot_BotActor or Human for that particular event group. An example is given below

In [None]:
# For example:
medians = [{'event_group': 'event_group1', 'median': 'val1', 'acc_type': 'Bot_app'}, 
           {'event_group': 'event_group1', 'median': 'val2', 'acc_type': 'Human'},
           {'event_group': 'event_group2', 'median': 'val3', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group2', 'median': 'val4', 'acc_type': 'Human'},
           {'event_group': 'event_group3', 'median': 'val5', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group3', 'median': 'val6', 'acc_type': 'Human'},
           {'event_group': 'event_group4', 'median': 'val7', 'acc_type': 'Bot_app'},
           {'event_group': 'event_group5', 'median': 'val8', 'acc_type': 'Human'}]
df_medians = pd.DataFrame.from_dict(medians)
df_medians

Unnamed: 0,event_group,median,acc_type
0,event_group1,val1,Bot_app
1,event_group1,val2,Human
2,event_group2,val3,Bot_app
3,event_group2,val4,Human
4,event_group3,val5,Bot_app
5,event_group3,val6,Human
6,event_group4,val7,Bot_app
7,event_group5,val8,Human


In [None]:
# YOUR CODE BELOW

### Question 2 (b)

Plot a heatmap of the DataFrame using seaborn - 
1. First convert the dataframe to the required format using pivot, example is given below
2. plot using seaborn - sns.heatmap(df_medians, annot=True, vmin=0, vmax=300, cmap="crest"). More details: https://seaborn.pydata.org/generated/seaborn.heatmap.html)

#### pd.pivot example:
<a id='pivot_example'></a>

In [None]:
# Main DataFrame
medians = [{'event_group': 'event_group1', 'median': 'val1', 'acc_type': 'Bot'}, 
           {'event_group': 'event_group1', 'median': 'val2', 'acc_type': 'Human'},
           {'event_group': 'event_group2', 'median': 'val3', 'acc_type': 'Bot'},
           {'event_group': 'event_group2', 'median': 'val4', 'acc_type': 'Human'},
           {'event_group': 'event_group3', 'median': 'val5', 'acc_type': 'Bot'},
           {'event_group': 'event_group3', 'median': 'val6', 'acc_type': 'Human'},
           {'event_group': 'event_group4', 'median': 'val7', 'acc_type': 'Bot'},
           {'event_group': 'event_group5', 'median': 'val8', 'acc_type': 'Human'}]
df_medians = pd.DataFrame.from_dict(medians)
df_medians

Unnamed: 0,event_group,median,acc_type
0,event_group1,val1,Bot
1,event_group1,val2,Human
2,event_group2,val3,Bot
3,event_group2,val4,Human
4,event_group3,val5,Bot
5,event_group3,val6,Human
6,event_group4,val7,Bot
7,event_group5,val8,Human


In [None]:
# pivot the main DataFrame
df_medians.pivot(index='acc_type', columns='event_group', values='median')#.reset_index().rename_axis(None,axis=1)

event_group,event_group1,event_group2,event_group3,event_group4,event_group5
acc_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bot,val1,val3,val5,val7,
Human,val2,val4,val6,,val8


In [None]:
# YOUR CODE BELOW

In [None]:
# YOUR CODE BELOW


#### What is the difference that you observe between Bots+Bot actors and Humans?

#### What is the differnce that you observe between Event groups?

#### What is the difference that you observe between Bots+Bot actors and Humans and Event groups all considering at the same time?

### Question 3

#### Create boxen plots to visualise the distribution of number of events in each event group. 
#### For more information you can visit - https://seaborn.pydata.org/generated/seaborn.boxenplot.html#seaborn.boxenplot 
#### You should highlight the data points that correspond to bots using a stripplot in seaborn. https://seaborn.pydata.org/tutorial/categorical.html#categorical-tutorial  
#### Interpret the results of the visualisation.

In [None]:
# YOUR CODE BELOW - Visualize number of events from pull request event group


In [None]:
# YOUR CODE BELOW - Visualise number of events from issue event group


In [None]:
# YOUR CODE BELOW - Visualise number of events from commit event group


In [None]:
# YOUR CODE BELOW - Visualise number of events from repo event group


### Question 4.1

#### Statistical identify whether number of events in each event group is normally distributed or not.
#### Null hypothesis - $H_0$: Sample comes from the data that has normal distribution.
#### Use Shapiro-Wilk test for this purpose. Use the p-value with a threshold of 0.05 to determine whether $H_0$ can be rejected with statistical significance or not. 

#### Use shapiro from scipy.stats to perform this test (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html)

In [None]:
# YOUR CODE BELOW

### Question 4.2
#### Determine if there is any statistical diﬀerence in number of events between the identified event groups. Perform this test on all pairs of event groups. 
#### Null hypothesis - $H_0$: Any two event group come from the same population.
#### If $H_0$ is rejected in Shapiro-Wilk test (at least for one of the two event groups considered for test), use the Mann-Whitney U statistical test for this purpose. If $H_0$ is not rejected in Shapiro-Wilk test, use the independent t-test for this purpose.  
#### In any case, use the p-value with a threshold of 0.01 to determine whether $H_0$ can be rejected with statistical significance. 

#### Use mannwhitneyu from scipy.stats to perform Mann-Whitney U test (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html) 
#### or 
#### ttest_ind from scipy.stats to perform independent t-test https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html.

#### More information:

You can pass the following arguments to mannwhitneyu from scipy.stats - (method='exact', nan_policy='omit'). For ttest_ind you can use - (nan_policy='omit')

In [None]:
# YOUR CODE BELOW

### Question 4.3
#### Each time you reject the null hypothesis $H_0$, quantify the effect size of the diﬀerence between the groups using cliﬀ’s delta ($\delta$). 
#### To calculate cliﬀ’s delta, you can pass the list of values to cliﬀ delta.py file given in the repository. E.g., cliffsDelta.cliffsDelta(list of values, list of values). This will return the effect size.
#### Refer to the table given in the TP document and mention your interpretation (negligible, small, medium, large).

In [None]:
# YOU CODE BELOW