# Finding the Treatment Group

#### Libraries

In [1]:
import pandas as pd
import requests
import os
import sys
from config import config

In [2]:
access_token = config.ACCESS_TOKEN
github_token = config.GITHUB_TOKEN

#### Download the example file

In [4]:
# Ensure the 'data' folder exists
if not os.path.exists('data'):
    os.makedirs('data')

url = 'https://data.gharchive.org/2023-04-01-15.json.gz'
file_path = os.path.join('data', '2023-04-01-15.json.gz')

# Check if the file already exists
if os.path.exists(file_path):
    print(f"The file already exists at {file_path}. No need to download.")
else:
    response = requests.get(url, stream=True)
    # Check if the request was successful (HTTP Status Code 200)
    if response.status_code == 200:
        # Write the file
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=128):
                file.write(chunk)
        print(f"File downloaded successfully and saved to {file_path}")
    else:
        print("Failed to fetch the file")


The file already exists at data\2023-04-01-15.json.gz. No need to download.


In [5]:
#### read it in

In [9]:
df = pd.read_json("data/2023-04-01-15.json.gz", lines=True)
df.head()

Unnamed: 0,id,type,actor,repo,payload,public,created_at,org
0,28137501092,PushEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 568277185, 'name': 'stdlib-js/strided-b...","{'repository_id': 568277185, 'push_id': 131547...",True,2023-04-01 15:00:00+00:00,"{'id': 17805691, 'login': 'stdlib-js', 'gravat..."
1,28137501094,CreateEvent,"{'id': 115239975, 'login': 'ishuduwal', 'displ...","{'id': 622248753, 'name': 'ishuduwal/personal-...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,
2,28137501097,CreateEvent,"{'id': 50960481, 'login': 'bxbao87', 'display_...","{'id': 622248756, 'name': 'bxbao87/bloglist', ...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,
3,28137501098,PushEvent,"{'id': 52915358, 'login': 'alwaz-shahid', 'dis...","{'id': 622201481, 'name': 'alwaz-shahid/extens...","{'repository_id': 622201481, 'push_id': 131547...",True,2023-04-01 15:00:00+00:00,
4,28137501099,CreateEvent,"{'id': 101326737, 'login': 'HOVADOVOLE', 'disp...","{'id': 622248605, 'name': 'HOVADOVOLE/Serial-P...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,


### Getting rid of Github Bot commits

In [None]:
filtered_df = df[~df['actor'].apply(lambda x: x.get('login').endswith('[bot]'))]
filtered_df.reset_index(drop=True, inplace=True)
filtered_df.head()

## Email Analysis
I added a limit of 100 "df['actor'][:100]"
Then I didnt find any email :(

In [11]:
import json
from graphqlclient import GraphQLClient
from config import config

def get_client():
    client = GraphQLClient('https://api.github.com/graphql')
    client.inject_token(f'Bearer {github_token}')
    return client

def execute_query(client, query, variables=None):
    try:
        result = client.execute(query, variables)
        return json.loads(result)
    except Exception as e:
        print(f"Error executing query: {str(e)}")
        return None

def get_user_email(client, username):
    query = """
    query($username: String!) {
      user(login: $username) {
        email
      }
    }
    """
    variables = {"username": username}
    
    response_json = execute_query(client, query, variables)
    
    if response_json and "errors" in response_json:
        error_message = response_json.get("errors")[0].get("message")
        print(f"Error querying user {username}: {error_message}")
        return None
    
    return response_json["data"]["user"]["email"] if response_json else None

# Example usage:
if __name__ == "__main__":
    client = get_client()
    emails = []

    for row in df['actor'][:100]:
        login = row.get('login')
        if login:
            email = get_user_email(client, login)
            if email:
                emails.append(email)
            else:
                print(f"No email found for user {login}")

    print(emails)


Error querying user github-actions[bot]: Could not resolve to a User with the login of 'github-actions[bot]'.
No email found for user github-actions[bot]
No email found for user ishuduwal
No email found for user bxbao87
No email found for user alwaz-shahid
No email found for user HOVADOVOLE
No email found for user FlorBera
No email found for user LombiqBot
No email found for user thuanowa
Error querying user Ankityadavk: Could not resolve to a User with the login of 'Ankityadavk'.
No email found for user Ankityadavk
No email found for user gokaysatir
Error querying user dependabot[bot]: Could not resolve to a User with the login of 'dependabot[bot]'.
No email found for user dependabot[bot]
No email found for user pultho
No email found for user rswnsyh
Error querying user shopify[bot]: Could not resolve to a User with the login of 'shopify[bot]'.
No email found for user shopify[bot]
No email found for user Rindraniaina-28
Error querying user renovate[bot]: Could not resolve to a User wi

In [14]:
emails

[]

In [16]:
it_emails = []

for email in emails:
    if email.endswith(".ca"):
        it_emails.append(email)

print("Emails ending with '.it':")
for it_email in it_emails:
    print(it_email)

count_it_emails = len(it_emails)
print(f"Number of emails ending with '.it': {count_it_emails}")

Emails ending with '.it':
Number of emails ending with '.it': 0
