# Finding the Treatment Group

Pre requisites
1. config.py in the config folder in the following format
ACCESS_TOKEN = "ghp_xxxx"
GITHUB_TOKEN = "github_pat_xxxxx"
2. data folder where all data will be stored

#### Libraries

In [1]:
import pandas as pd
import requests
import os
import sys
from config import config

In [2]:
access_token = config.ACCESS_TOKEN
github_token = config.GITHUB_TOKEN

#### Download the example file

In [57]:
# Ensure the 'data' folder exists
if not os.path.exists('data'):
    os.makedirs('data')

url = 'https://data.gharchive.org/2023-04-01-15.json.gz'
file_path = os.path.join('data', '2023-04-01-15.json.gz')

# Check if the file already exists
if os.path.exists(file_path):
    print(f"The file already exists at {file_path}. No need to download.")
else:
    response = requests.get(url, stream=True)
    # Check if the request was successful (HTTP Status Code 200)
    if response.status_code == 200:
        # Write the file
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=128):
                file.write(chunk)
        print(f"File downloaded successfully and saved to {file_path}")
    else:
        print("Failed to fetch the file")


The file already exists at data\2023-04-01-15.json.gz. No need to download.


#### read it in

In [58]:
df = pd.read_json("data/2023-04-01-15.json.gz", lines=True)
df.head()

Unnamed: 0,id,type,actor,repo,payload,public,created_at,org
0,28137501092,PushEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 568277185, 'name': 'stdlib-js/strided-b...","{'repository_id': 568277185, 'push_id': 131547...",True,2023-04-01 15:00:00+00:00,"{'id': 17805691, 'login': 'stdlib-js', 'gravat..."
1,28137501094,CreateEvent,"{'id': 115239975, 'login': 'ishuduwal', 'displ...","{'id': 622248753, 'name': 'ishuduwal/personal-...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,
2,28137501097,CreateEvent,"{'id': 50960481, 'login': 'bxbao87', 'display_...","{'id': 622248756, 'name': 'bxbao87/bloglist', ...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,
3,28137501098,PushEvent,"{'id': 52915358, 'login': 'alwaz-shahid', 'dis...","{'id': 622201481, 'name': 'alwaz-shahid/extens...","{'repository_id': 622201481, 'push_id': 131547...",True,2023-04-01 15:00:00+00:00,
4,28137501099,CreateEvent,"{'id': 101326737, 'login': 'HOVADOVOLE', 'disp...","{'id': 622248605, 'name': 'HOVADOVOLE/Serial-P...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,


## 1. Identifying user location based on their GitHub profiles


####  GraphQL for batch requests to fetch data for multiple users in one request instead of making a request for each user and constantly hitting the rate limit for the REST API
1. GraphQL Test: Creates data subset, constructs/executes GraphQL query for user locations
2. Italian Identification: Uses Italian keywords to filter and display Italian users
- subset of 500 Users >> 3 Italians identified

In [42]:
subset_df2 = df.head(500).copy()

def sanitize_for_alias(username):
    return ''.join(ch if ch.isalnum() else '_' for ch in username)

logins = subset_df2['actor'].apply(lambda x: x['login']).tolist()

# Filter out logins that start with a number
logins = [login for login in logins if not login[0].isdigit()]

# Construct GraphQL query
query_parts = [f'''
{sanitize_for_alias(login)}: user(login: "{login}") {{
    location
}}
''' for login in logins]
query = '{' + ''.join(query_parts) + '}'

# Execute GraphQL query
headers = {
    'Authorization': 'bearer '+github_token,
    'Content-Type': 'application/json'
}

response = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
response_json = response.json()

if 'data' not in response_json:
    print("Error in response:", response_json)
    # Halt execution in a Jupyter notebook or similar environment
    raise Exception("Data key missing from response")

data = response_json['data']

# Update subset_df2 with fetched location data
for login, user_data in data.items():
    if user_data is None:
        print(f"No data for user: {login}")
        continue
    location = user_data.get('location', None)
    subset_df2.loc[subset_df2['actor'].apply(lambda x: x['login']) == login, 'actor'] = subset_df2['actor'].apply(
        lambda x: {**x, 'location': location} if x['login'] == login else x
    )

No data for user: github_actions_bot_
No data for user: Ankityadavk
No data for user: dependabot_bot_
No data for user: shopify_bot_
No data for user: renovate_bot_
No data for user: pull_bot_
No data for user: vercel_bot_
No data for user: richardaeh
No data for user: imgbot_bot_
No data for user: treaty321
No data for user: bakiamna99999999999
No data for user: stale_bot_
No data for user: cyousemi
No data for user: netlify_bot_
No data for user: GeneralAwareness
No data for user: aws_connector_for_github_bot_
No data for user: M_Lalaina


In [46]:
#!pip install fuzzywuzzy
#!pip install python-Levenshtein
import fuzzywuzzy
from fuzzywuzzy import process

# List of major cities in Italy and other possible indications of an Italian location
italian_keywords = [
    "rome", "roma", "milan", "milano", "naples", "napoli", "turin", "torino", "palermo", 
    "genoa", "genova", "bologna", "florence", "firenze", "venice", "venezia", "verona", 
    "cagliari", "parma", "ferrara", "treviso", "padua", "padova", "trieste", "taranto", 
    "brescia", "prato", "modena", "reggio", "calabria", "emilia", "perugia", "livorno", 
    "ravenna", "foggia", "rimini", "salerno", "sassari", "latina", "giugliano", "tuscany", 
    "toscana", "sicily", "sicilia", "sardinia", "sardegna", "lombardy", "lombardia", "piedmont", 
    "piemonte", "liguria", "calabria", "umbria", "marche", "abruzzo", "italy", "italia"
]

def is_italian_location(location):
    if not location:
        return False
    location = location.lower()
    if any(keyword in location for keyword in italian_keywords):
        return True
    # Using fuzzy matching to account for typos
    closest_match, score = process.extractOne(location, italian_keywords)
    return score > 80

# Filter out rows with Italian locations
non_italian_df = subset_df2[~subset_df2['actor'].apply(lambda x: is_italian_location(x.get('location')))]

In [49]:
# Filter in rows with Italian locations
italian_df = subset_df2[subset_df2['actor'].apply(lambda x: is_italian_location(x.get('location')))]

# Flatten the 'actor' column from the italian_df
flattened_italian_actor_df = pd.json_normalize(italian_df['actor'])

# Display the flattened 'actor' column
flattened_italian_actor_df

Unnamed: 0,id,login,display_login,gravatar_id,url,avatar_url,location
0,16925025,maffo102,maffo102,,https://api.github.com/users/maffo102,https://avatars.githubusercontent.com/u/16925025?,Italy
1,30238962,merkleID,merkleID,,https://api.github.com/users/merkleID,https://avatars.githubusercontent.com/u/30238962?,milan
2,117077787,Nelexiad,Nelexiad,,https://api.github.com/users/Nelexiad,https://avatars.githubusercontent.com/u/117077...,Palermo


## Email Analysis
I added a limit of 100 "df['actor'][:100]"

With this limit I didnt find any email :(

In [11]:
import json
from graphqlclient import GraphQLClient
from config import config

def get_client():
    client = GraphQLClient('https://api.github.com/graphql')
    client.inject_token(f'Bearer {github_token}')
    return client

def execute_query(client, query, variables=None):
    try:
        result = client.execute(query, variables)
        return json.loads(result)
    except Exception as e:
        print(f"Error executing query: {str(e)}")
        return None

def get_user_email(client, username):
    query = """
    query($username: String!) {
      user(login: $username) {
        email
      }
    }
    """
    variables = {"username": username}
    
    response_json = execute_query(client, query, variables)
    
    if response_json and "errors" in response_json:
        error_message = response_json.get("errors")[0].get("message")
        print(f"Error querying user {username}: {error_message}")
        return None
    
    return response_json["data"]["user"]["email"] if response_json else None

# Example usage:
if __name__ == "__main__":
    client = get_client()
    emails = []

    for row in df['actor'][:100]:
        login = row.get('login')
        if login:
            email = get_user_email(client, login)
            if email:
                emails.append(email)
            else:
                print(f"No email found for user {login}")

    print(emails)


Error querying user github-actions[bot]: Could not resolve to a User with the login of 'github-actions[bot]'.
No email found for user github-actions[bot]
No email found for user ishuduwal
No email found for user bxbao87
No email found for user alwaz-shahid
No email found for user HOVADOVOLE
No email found for user FlorBera
No email found for user LombiqBot
No email found for user thuanowa
Error querying user Ankityadavk: Could not resolve to a User with the login of 'Ankityadavk'.
No email found for user Ankityadavk
No email found for user gokaysatir
Error querying user dependabot[bot]: Could not resolve to a User with the login of 'dependabot[bot]'.
No email found for user dependabot[bot]
No email found for user pultho
No email found for user rswnsyh
Error querying user shopify[bot]: Could not resolve to a User with the login of 'shopify[bot]'.
No email found for user shopify[bot]
No email found for user Rindraniaina-28
Error querying user renovate[bot]: Could not resolve to a User wi

In [14]:
emails

[]

In [16]:
it_emails = []

for email in emails:
    if email.endswith(".ca"):
        it_emails.append(email)

print("Emails ending with '.it':")
for it_email in it_emails:
    print(it_email)

count_it_emails = len(it_emails)
print(f"Number of emails ending with '.it': {count_it_emails}")

Emails ending with '.it':
Number of emails ending with '.it': 0


## 6. Analyzing org descriptions
Use GraphQL to get more information at once

In [23]:
unique_orgs = pd.Series(df['org'].dropna().apply(lambda x: x['login'])).unique()

['stdlib-js' 'Lombiq' 'CollaboraOnline' ... 'nocalhost' 'linnovate'
 'coreweave']


In [25]:
# List of unique organization logins obtained from your DataFrame
unique_org_list = unique_orgs[100:900]

# Setup GraphQL Client
client = GraphQLClient('https://api.github.com/graphql')
client.inject_token(f'Bearer {github_token}')

# Define GraphQL Query
query = """
query($login: String!) {
  organization(login: $login) {
    login
    description
  }
}
"""

# List to store data for the new DataFrame
data = []

# Loop through each unique organization login
for org_login in unique_org_list:
    try:
        # Define variables for the query
        variables = {"login": org_login}
        
        # Execute the GraphQL query
        result = client.execute(query, variables)
        response_json = json.loads(result)
        
        # Check for errors in the response
        if "errors" in response_json:
            error_message = response_json['errors'][0]['message']
            print(f"Error querying organization {org_login}: {error_message}")
            continue
        
        # Extract the relevant data
        org_data = response_json['data']['organization']
        data.append([org_data['login'], org_data['description']])
        
    except Exception as err:
        # Handle potential errors
        print(f"An error occurred for {org_login}: {err}")

# Create DataFrame
df_orgs = pd.DataFrame(data, columns=['name', 'description'])

# Display the DataFrame
print(df_orgs.head())


Error querying organization FZM-Technologies: Could not resolve to an Organization with the login of 'FZM-Technologies'.
Error querying organization chainparrot: Could not resolve to an Organization with the login of 'chainparrot'.
Error querying organization playing-ground: Could not resolve to an Organization with the login of 'playing-ground'.
Error querying organization keptn-demo-live: Could not resolve to an Organization with the login of 'keptn-demo-live'.
Error querying organization Strada1: Could not resolve to an Organization with the login of 'Strada1'.
Error querying organization iruoy-nl: Could not resolve to an Organization with the login of 'iruoy-nl'.
Error querying organization eras-fyi: Could not resolve to an Organization with the login of 'eras-fyi'.
Error querying organization DataFest-2023-Algo-Rhythms: Could not resolve to an Organization with the login of 'DataFest-2023-Algo-Rhythms'.
Error querying organization codesquad2023-fe-algorithm: Could not resolve to a

### Lingua
https://github.com/pemistahl/lingua-py

In [34]:
#!pip install lingua-language-detector
from lingua import Language, LanguageDetectorBuilder
import pandas as pd

df_lingua = df_orgs

# Available languages
languages = [Language.ENGLISH, Language.FRENCH, Language.GERMAN, Language.SPANISH, Language.ITALIAN]

# Build the language detector
detector = LanguageDetectorBuilder.from_languages(*languages).build()

# Function to detect language
def detect_language(text):
    try:
        return detector.detect_language_of(text).name
    except:
        return "unknown"

# Apply the function to the 'description' column and store the result in a new column 'language'
df_lingua['language'] = df_lingua['description'].apply(detect_language)

# Display the DataFrame
df_lingua[df_lingua["language"]=="ITALIAN"].head()


Unnamed: 0,name,description,language
305,polito-WA1-AW1-2023,Courses at Politecnico di Torino - Academic Ye...,ITALIAN
605,MOVIECORD,🎬 Movie | Film Festival | OTT,ITALIAN


### Langdetect

In [36]:
#!pip install langdetect
from langdetect import detect

df_langdetect = df_orgs

# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

# Apply the function to the 'description' column and store the result in a new column 'language'
df_langdetect['language'] = df_langdetect['description'].apply(detect_language)

# Display the DataFrame
df_langdetect[df_langdetect["language"] == "ITALIAN"]

Unnamed: 0,name,description,language


## 7. Italian Open Source Licences
Problem is, that the authors are italian but not living in Italy!!

In [40]:
import requests
import re
import os

def get_and_process_github_readme(url, filename, token):
    """
    Download the README.md from GitHub, save it, extract and return the names of repositories.
    
    Parameters:
        url (str): The URL to the raw README.md on GitHub.
        filename (str): The name to save the README.md as.
        token (str): Your GitHub API token.
        
    Returns:
        list: Extracted repository names.
    """
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3.raw"
    }
    
    # Step 1: Download the README.md from GitHub
    response = requests.get(url, headers=headers)
    
    # Ensure the request was successful
    if response.status_code == 200:
        readme_content = response.text
        
        # Step 2: Process the saved file to extract repository names
        name_pattern = re.compile(r'\|\s*(?:\[(?P<link_name>.+?)\]\(.+?\)|(?P<plain_name>[^\|\[\]]+))\s*\|')
        names = []
        for line in readme_content.split('\n'):
            match = name_pattern.search(line)
            if match:
                name = (match.group('link_name') or match.group('plain_name')).strip()
                if name and name.lower() not in ["name", "----", "stack", "description"]:
                    names.append(name)
        return names
    else:
        raise ConnectionError(f"Failed to fetch README.md, status code: {response.status_code}")

# URL to the raw README.md on GitHub
readme_url = "https://github.com/italia-opensource/awesome-italia-opensource/raw/main/awesome/opensource/README.md"

# Filename to save the README.md as
save_filename = "github_readme.md"

# Execute the function and display the first 10 repository names
Italian_OS_projects = get_and_process_github_readme(readme_url, save_filename, github_token)[1:]
Italian_OS_projects[:10]

['2Ami',
 'Apivault',
 'Arduino',
 'Arduino Desk Weatherstation',
 'Argon',
 'Autocannon',
 'Awesome Italia Open Source',
 'Bootstrap Italia',
 'Breathly',
 'Cache Candidate']

## Extras

### downloading data for multiple days/hours

In [59]:
import os
import requests
import gzip
import pandas as pd
import logging
from typing import List, Union

# Configure logging
logging.basicConfig(level=logging.INFO)

BASE_URL = "https://data.gharchive.org"

def ensure_directory_exists(dir_name: str) -> None:
    """Ensure the specified directory exists."""
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

def download_file(url: str, file_path: str) -> None:
    """Download file and save it to the specified path."""
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=128):
                    f.write(chunk)
        else:
            logging.warning(f"File not found: {url}")
    except requests.RequestException as e:
        logging.error(f"Failed to fetch {url}: {str(e)}")

def load_or_fetch_data(file_path: str, url: str) -> Union[pd.DataFrame, None]:
    """Load data from file or fetch from URL if not exists."""
    if os.path.exists(file_path):
        logging.info(f"File {file_path} already exists. Loading data.")
        with gzip.open(file_path, 'rt', encoding='utf-8') as gz:
            return pd.read_json(gz, lines=True)
    else:
        logging.info(f"File {file_path} not exists. Downloading from {url}")
        download_file(url, file_path)
        if os.path.exists(file_path):
            with gzip.open(file_path, 'rt', encoding='utf-8') as gz:
                return pd.read_json(gz, lines=True)
    return None

def download_gh_archive(start_day: int, end_day: int, 
                        start_hour: int=0, end_hour: int=23,
                        base_url: str=BASE_URL, data_dir: str='data') -> pd.DataFrame:
    """Download GitHub archive data for specified days and hours."""
    dfs = []
    ensure_directory_exists(data_dir)
    
    for day in range(start_day, end_day + 1):
        for hour in range(start_hour, end_hour + 1):
            filename = f"2023-04-{day:02d}-{hour}.json.gz"
            url = f"{base_url}/{filename}"
            file_path = os.path.join(data_dir, filename)
            
            df = load_or_fetch_data(file_path, url)
            if df is not None:
                dfs.append(df)

    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()


In [None]:
# Record the start time
start_time = time.time()
#Define the start and end date-time
start_date_time = "2023-04-01 10"  # Format: "YYYY-MM-DD HH"
end_date_time = "2023-04-01 12"    # Format: "YYYY-MM-DD HH"
# Extract day and hour from the date-time strings
start_day = int(start_date_time.split("-")[2].split()[0])
start_hour = int(start_date_time.split()[1])
end_day = int(end_date_time.split("-")[2].split()[0])
end_hour = int(end_date_time.split()[1])

# Download data for the specified time frame
df_big = download_gh_archive(start_day, end_day, start_hour, end_hour)

# Record the end time and calculate the elapsed time
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Data from {start_date_time} to {end_date_time} loaded into DataFrame!")
print(f"Time taken: {elapsed_time:.2f} seconds")

### Getting rid of Github Bot commits

In [None]:
filtered_df = df[~df['actor'].apply(lambda x: x.get('login').endswith('[bot]'))]
filtered_df.reset_index(drop=True, inplace=True)
filtered_df.head()