# Finding the Treatment Group

#### Libraries

In [1]:
import pandas as pd
import requests
import os
import sys
from config import config

In [2]:
access_token = config.ACCESS_TOKEN
github_token = config.GITHUB_TOKEN

#### Download the example file

In [4]:
# Ensure the 'data' folder exists
if not os.path.exists('data'):
    os.makedirs('data')

url = 'https://data.gharchive.org/2023-04-01-15.json.gz'
file_path = os.path.join('data', '2023-04-01-15.json.gz')

# Check if the file already exists
if os.path.exists(file_path):
    print(f"The file already exists at {file_path}. No need to download.")
else:
    response = requests.get(url, stream=True)
    # Check if the request was successful (HTTP Status Code 200)
    if response.status_code == 200:
        # Write the file
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=128):
                file.write(chunk)
        print(f"File downloaded successfully and saved to {file_path}")
    else:
        print("Failed to fetch the file")


The file already exists at data\2023-04-01-15.json.gz. No need to download.


In [5]:
#### read it in

In [9]:
df = pd.read_json("data/2023-04-01-15.json.gz", lines=True)
df.head()

Unnamed: 0,id,type,actor,repo,payload,public,created_at,org
0,28137501092,PushEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 568277185, 'name': 'stdlib-js/strided-b...","{'repository_id': 568277185, 'push_id': 131547...",True,2023-04-01 15:00:00+00:00,"{'id': 17805691, 'login': 'stdlib-js', 'gravat..."
1,28137501094,CreateEvent,"{'id': 115239975, 'login': 'ishuduwal', 'displ...","{'id': 622248753, 'name': 'ishuduwal/personal-...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,
2,28137501097,CreateEvent,"{'id': 50960481, 'login': 'bxbao87', 'display_...","{'id': 622248756, 'name': 'bxbao87/bloglist', ...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,
3,28137501098,PushEvent,"{'id': 52915358, 'login': 'alwaz-shahid', 'dis...","{'id': 622201481, 'name': 'alwaz-shahid/extens...","{'repository_id': 622201481, 'push_id': 131547...",True,2023-04-01 15:00:00+00:00,
4,28137501099,CreateEvent,"{'id': 101326737, 'login': 'HOVADOVOLE', 'disp...","{'id': 622248605, 'name': 'HOVADOVOLE/Serial-P...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,


### Getting rid of Github Bot commits

In [None]:
filtered_df = df[~df['actor'].apply(lambda x: x.get('login').endswith('[bot]'))]
filtered_df.reset_index(drop=True, inplace=True)
filtered_df.head()

## Email Analysis
I added a limit of 100 "df['actor'][:100]"

With this limit I didnt find any email :(

In [11]:
import json
from graphqlclient import GraphQLClient
from config import config

def get_client():
    client = GraphQLClient('https://api.github.com/graphql')
    client.inject_token(f'Bearer {github_token}')
    return client

def execute_query(client, query, variables=None):
    try:
        result = client.execute(query, variables)
        return json.loads(result)
    except Exception as e:
        print(f"Error executing query: {str(e)}")
        return None

def get_user_email(client, username):
    query = """
    query($username: String!) {
      user(login: $username) {
        email
      }
    }
    """
    variables = {"username": username}
    
    response_json = execute_query(client, query, variables)
    
    if response_json and "errors" in response_json:
        error_message = response_json.get("errors")[0].get("message")
        print(f"Error querying user {username}: {error_message}")
        return None
    
    return response_json["data"]["user"]["email"] if response_json else None

# Example usage:
if __name__ == "__main__":
    client = get_client()
    emails = []

    for row in df['actor'][:100]:
        login = row.get('login')
        if login:
            email = get_user_email(client, login)
            if email:
                emails.append(email)
            else:
                print(f"No email found for user {login}")

    print(emails)


Error querying user github-actions[bot]: Could not resolve to a User with the login of 'github-actions[bot]'.
No email found for user github-actions[bot]
No email found for user ishuduwal
No email found for user bxbao87
No email found for user alwaz-shahid
No email found for user HOVADOVOLE
No email found for user FlorBera
No email found for user LombiqBot
No email found for user thuanowa
Error querying user Ankityadavk: Could not resolve to a User with the login of 'Ankityadavk'.
No email found for user Ankityadavk
No email found for user gokaysatir
Error querying user dependabot[bot]: Could not resolve to a User with the login of 'dependabot[bot]'.
No email found for user dependabot[bot]
No email found for user pultho
No email found for user rswnsyh
Error querying user shopify[bot]: Could not resolve to a User with the login of 'shopify[bot]'.
No email found for user shopify[bot]
No email found for user Rindraniaina-28
Error querying user renovate[bot]: Could not resolve to a User wi

In [14]:
emails

[]

In [16]:
it_emails = []

for email in emails:
    if email.endswith(".ca"):
        it_emails.append(email)

print("Emails ending with '.it':")
for it_email in it_emails:
    print(it_email)

count_it_emails = len(it_emails)
print(f"Number of emails ending with '.it': {count_it_emails}")

Emails ending with '.it':
Number of emails ending with '.it': 0


## 6. Analyzing org descriptions
Use GraphQL to get more information at once

In [23]:
unique_orgs = pd.Series(df['org'].dropna().apply(lambda x: x['login'])).unique()


['stdlib-js' 'Lombiq' 'CollaboraOnline' ... 'nocalhost' 'linnovate'
 'coreweave']


In [25]:
# List of unique organization logins obtained from your DataFrame
unique_org_list = unique_orgs[100:900]

# Setup GraphQL Client
client = GraphQLClient('https://api.github.com/graphql')
client.inject_token(f'Bearer {github_token}')

# Define GraphQL Query
query = """
query($login: String!) {
  organization(login: $login) {
    login
    description
  }
}
"""

# List to store data for the new DataFrame
data = []

# Loop through each unique organization login
for org_login in unique_org_list:
    try:
        # Define variables for the query
        variables = {"login": org_login}
        
        # Execute the GraphQL query
        result = client.execute(query, variables)
        response_json = json.loads(result)
        
        # Check for errors in the response
        if "errors" in response_json:
            error_message = response_json['errors'][0]['message']
            print(f"Error querying organization {org_login}: {error_message}")
            continue
        
        # Extract the relevant data
        org_data = response_json['data']['organization']
        data.append([org_data['login'], org_data['description']])
        
    except Exception as err:
        # Handle potential errors
        print(f"An error occurred for {org_login}: {err}")

# Create DataFrame
df_orgs = pd.DataFrame(data, columns=['name', 'description'])

# Display the DataFrame
print(df_orgs.head())


Error querying organization FZM-Technologies: Could not resolve to an Organization with the login of 'FZM-Technologies'.
Error querying organization chainparrot: Could not resolve to an Organization with the login of 'chainparrot'.
Error querying organization playing-ground: Could not resolve to an Organization with the login of 'playing-ground'.
Error querying organization keptn-demo-live: Could not resolve to an Organization with the login of 'keptn-demo-live'.
Error querying organization Strada1: Could not resolve to an Organization with the login of 'Strada1'.
Error querying organization iruoy-nl: Could not resolve to an Organization with the login of 'iruoy-nl'.
Error querying organization eras-fyi: Could not resolve to an Organization with the login of 'eras-fyi'.
Error querying organization DataFest-2023-Algo-Rhythms: Could not resolve to an Organization with the login of 'DataFest-2023-Algo-Rhythms'.
Error querying organization codesquad2023-fe-algorithm: Could not resolve to a

### Lingua
https://github.com/pemistahl/lingua-py

In [34]:
#!pip install lingua-language-detector
from lingua import Language, LanguageDetectorBuilder
import pandas as pd

df_lingua = df_orgs

# Available languages
languages = [Language.ENGLISH, Language.FRENCH, Language.GERMAN, Language.SPANISH, Language.ITALIAN]

# Build the language detector
detector = LanguageDetectorBuilder.from_languages(*languages).build()

# Function to detect language
def detect_language(text):
    try:
        return detector.detect_language_of(text).name
    except:
        return "unknown"

# Apply the function to the 'description' column and store the result in a new column 'language'
df_lingua['language'] = df_lingua['description'].apply(detect_language)

# Display the DataFrame
df_lingua[df_lingua["language"]=="ITALIAN"].head()


Unnamed: 0,name,description,language
305,polito-WA1-AW1-2023,Courses at Politecnico di Torino - Academic Ye...,ITALIAN
605,MOVIECORD,🎬 Movie | Film Festival | OTT,ITALIAN


### Langdetect

In [36]:
#!pip install langdetect
from langdetect import detect

df_langdetect = df_orgs

# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

# Apply the function to the 'description' column and store the result in a new column 'language'
df_langdetect['language'] = df_langdetect['description'].apply(detect_language)

# Display the DataFrame
df_langdetect[df_langdetect["language"] == "ITALIAN"]

Unnamed: 0,name,description,language


## 7. Italian Open Source Licences
Problem is, that the authors are italian but not living in Italy!!

In [40]:
import requests
import re
import os

def get_and_process_github_readme(url, filename, token):
    """
    Download the README.md from GitHub, save it, extract and return the names of repositories.
    
    Parameters:
        url (str): The URL to the raw README.md on GitHub.
        filename (str): The name to save the README.md as.
        token (str): Your GitHub API token.
        
    Returns:
        list: Extracted repository names.
    """
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3.raw"
    }
    
    # Step 1: Download the README.md from GitHub
    response = requests.get(url, headers=headers)
    
    # Ensure the request was successful
    if response.status_code == 200:
        readme_content = response.text
        
        # Step 2: Process the saved file to extract repository names
        name_pattern = re.compile(r'\|\s*(?:\[(?P<link_name>.+?)\]\(.+?\)|(?P<plain_name>[^\|\[\]]+))\s*\|')
        names = []
        for line in readme_content.split('\n'):
            match = name_pattern.search(line)
            if match:
                name = (match.group('link_name') or match.group('plain_name')).strip()
                if name and name.lower() not in ["name", "----", "stack", "description"]:
                    names.append(name)
        return names
    else:
        raise ConnectionError(f"Failed to fetch README.md, status code: {response.status_code}")

# URL to the raw README.md on GitHub
readme_url = "https://github.com/italia-opensource/awesome-italia-opensource/raw/main/awesome/opensource/README.md"

# Filename to save the README.md as
save_filename = "github_readme.md"

# Execute the function and display the first 10 repository names
Italian_OS_projects = get_and_process_github_readme(readme_url, save_filename, github_token)[1:]
Italian_OS_projects[:10]

['2Ami',
 'Apivault',
 'Arduino',
 'Arduino Desk Weatherstation',
 'Argon',
 'Autocannon',
 'Awesome Italia Open Source',
 'Bootstrap Italia',
 'Breathly',
 'Cache Candidate']