# Finding the Treatment Group

In [9]:
test 

NameError: name 'test' is not defined

Prerequisites
1. config.py in the config folder in the following format
ACCESS_TOKEN = "ghp_xxxx"
GITHUB_TOKEN = "github_pat_xxxxx"
2. data folder where all data will be stored

#### Libraries

In [None]:
!pip install langdetect
!pip install fuzzywuzzy
!pip install python-Levenshtein
!pip install graphqlclient
!pip install lingua-language-detector
!pip install spacy pandas spacy-langdetect
!python -m spacy download en

In [None]:
# NOTE(review): the import cell uses `from config import config`, which appears
# to target the local config/config.py described in the prerequisites rather
# than this PyPI package — confirm this install is actually needed.
!pip install config

In [1]:
import pandas as pd
import requests
import os
import sys
from config import config
import time

In [2]:
# Credentials come from the local config module imported above instead of being
# written into the notebook.
# NOTE(review): access_token is not referenced again in the visible cells.
access_token = config.ACCESS_TOKEN
github_token = config.GITHUB_TOKEN

#### Download the example file

In [3]:
# Download one hour of GH Archive events unless it is already cached in data/.

# exist_ok makes the call safe whether or not the folder is already there.
os.makedirs('data', exist_ok=True)

url = 'https://data.gharchive.org/2023-04-01-15.json.gz'
file_path = os.path.join('data', '2023-04-01-15.json.gz')

# Check if the file already exists
if os.path.exists(file_path):
    print(f"The file already exists at {file_path}. No need to download.")
else:
    # Stream into a temporary name first: if the transfer is interrupted, no
    # truncated archive is left under file_path for the check above to trust.
    partial_path = file_path + '.part'
    # The timeout stops a stalled connection from hanging the cell forever;
    # the context manager releases the connection when the block ends.
    with requests.get(url, stream=True, timeout=60) as response:
        # Check if the request was successful (HTTP Status Code 200)
        if response.status_code == 200:
            with open(partial_path, 'wb') as file:
                # 1 MiB chunks instead of 128 bytes: far fewer write calls.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
            # Publish the finished file under its final name.
            os.replace(partial_path, file_path)
            print(f"File downloaded successfully and saved to {file_path}")
        else:
            print("Failed to fetch the file")
            print(f"HTTP status code: {response.status_code}")


File downloaded successfully and saved to data/2023-04-01-15.json.gz


#### read it in

In [4]:
# Read the gzipped JSON-lines archive in pieces of `chunk_size` rows each.
chunk_size = 10000  # Adjust based on your system's memory
reader = pd.read_json("data/2023-04-01-15.json.gz", lines=True, chunksize=chunk_size)
chunks = list(reader)

# Stitch the pieces back together into one DataFrame with a fresh 0..n-1 index.
df = pd.concat(chunks, ignore_index=True)

df.head()

Unnamed: 0,id,type,actor,repo,payload,public,created_at,org
0,28137501092,PushEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 568277185, 'name': 'stdlib-js/strided-b...","{'repository_id': 568277185, 'push_id': 131547...",True,2023-04-01 15:00:00+00:00,"{'id': 17805691, 'login': 'stdlib-js', 'gravat..."
1,28137501094,CreateEvent,"{'id': 115239975, 'login': 'ishuduwal', 'displ...","{'id': 622248753, 'name': 'ishuduwal/personal-...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,
2,28137501097,CreateEvent,"{'id': 50960481, 'login': 'bxbao87', 'display_...","{'id': 622248756, 'name': 'bxbao87/bloglist', ...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,
3,28137501098,PushEvent,"{'id': 52915358, 'login': 'alwaz-shahid', 'dis...","{'id': 622201481, 'name': 'alwaz-shahid/extens...","{'repository_id': 622201481, 'push_id': 131547...",True,2023-04-01 15:00:00+00:00,
4,28137501099,CreateEvent,"{'id': 101326737, 'login': 'HOVADOVOLE', 'disp...","{'id': 622248605, 'name': 'HOVADOVOLE/Serial-P...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,


## 1. Identifying user location based on their GitHub profiles information


####  GraphQL for batch requests to fetch data for multiple users in one request instead of making a request for each user and constantly hitting the rate limit for the REST API
1. GraphQL Test: Creates data subset, constructs/executes GraphQL query for user locations
2. Italian Identification: Uses Italian keywords to filter and display Italian users
- subset of 500 Users >> 3 Italians identified

In [5]:
# Define batch size: the number of logins that the batching loop at the end of
# this cell puts into a single GraphQL request.
BATCH_SIZE = 800  # Choose a reasonable size that avoids timeouts but minimizes the number of requests

def sanitize_for_alias(name):
    """Turn a login into a string that is usable as a GraphQL field alias.

    GraphQL names must match ``[_A-Za-z][_0-9A-Za-z]*``. Every character outside
    that ASCII set is replaced by ``_``, and an ``a`` is prepended when the
    result is empty or does not start with a letter or underscore.

    Parameters:
        name (str): The login to convert.

    Returns:
        str: A non-empty, alias-safe version of ``name``.
    """
    valid_starters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
    valid_chars = valid_starters | set("0123456789")
    # str.isalnum() also accepts non-ASCII letters and digits, which are not
    # legal in a GraphQL name, so test against the explicit ASCII set instead.
    sanitized_name = ''.join(ch if ch in valid_chars else '_' for ch in name)
    # The emptiness check prevents an IndexError on the [0] lookup for "".
    if not sanitized_name or sanitized_name[0] not in valid_starters:
        sanitized_name = 'a' + sanitized_name
    return sanitized_name

def construct_query(logins_batch):
    """Build one GraphQL document asking for the ``location`` of every login.

    Each login becomes a ``user(login: ...)`` selection whose alias is produced
    by ``sanitize_for_alias``; all selections are wrapped in a single pair of
    braces.

    Parameters:
        logins_batch (iterable of str): Logins to include in the query.

    Returns:
        str: The query text.
    """
    pieces = ['{']
    for login in logins_batch:
        alias = sanitize_for_alias(login)
        # Written with explicit escapes so the emitted whitespace does not
        # depend on how this function is indented.
        pieces.append(
            f'\n        {alias}: user(login: "{login}") {{'
            '\n            location'
            '\n        }'
            '\n        '
        )
    pieces.append('}')
    return ''.join(pieces)

def fetch_data(query, github_token, timeout=60):
    """POST a GraphQL query to the GitHub API and return its ``data`` payload.

    Parameters:
        query (str): The GraphQL document to execute.
        github_token (str): Token sent in the ``Authorization: bearer`` header.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        The value stored under the ``data`` key of the JSON response.

    Raises:
        ValueError: If the response body is not valid JSON.
        Exception: If the decoded body is empty or has no ``data`` key.
        requests.RequestException: On connection problems or timeouts.
    """
    headers = {
        'Authorization': 'bearer ' + github_token,
        'Content-Type': 'application/json'
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.post(
        'https://api.github.com/graphql',
        json={'query': query},
        headers=headers,
        timeout=timeout,
    )
    try:
        response_json = response.json()
    except ValueError as exc:
        # Report the status and the start of the body rather than only a bare
        # JSON decoding error.
        raise ValueError(
            f"Non-JSON response from API (HTTP {response.status_code}): {response.text[:200]}"
        ) from exc

    if response_json is None:
        raise Exception("No response from API")
    if 'data' not in response_json:
        # One formatted message; passing two arguments would render as a tuple.
        raise Exception(
            f"Data key missing from response (HTTP {response.status_code}): {response_json}"
        )

    return response_json['data']

def update_location(data):
    """Copy fetched locations into the ``actor`` dicts of the global ``df``.

    Parameters:
        data (dict | None): Mapping whose keys are compared directly with each
            actor's ``login`` and whose values are dicts (or ``None``) that may
            carry a ``location`` entry.

    Returns:
        None. ``df`` is modified in place; a message is printed and nothing is
        changed when ``data`` is ``None``.
    """
    if data is None:
        print("No data to update.")
        return

    for login in data:
        user_data = data[login]
        if user_data is None:
            # Nothing was returned for this key; leave its rows untouched.
            continue
        location = user_data.get('location', None)
        # Rows whose actor carries exactly this login.
        matching_rows = df['actor'].apply(lambda actor: actor['login']) == login
        df.loc[matching_rows, 'actor'] = df.loc[matching_rows, 'actor'].apply(
            lambda actor: dict(actor, location=location)
        )

# Get unique logins (logins that start with a digit are skipped).
logins = df['actor'].apply(lambda x: x['login']).unique().tolist()
logins = [login for login in logins if login and not login[0].isdigit()]

# The API returns each user under the *alias* used in the query, which differs
# from the login whenever sanitize_for_alias changes a character (e.g. "-").
# Remember which login each alias stands for so results can be mapped back.
# setdefault keeps the first login per alias: two selections sharing one alias
# would make the whole query invalid, so later colliding logins are dropped.
alias_to_login = {}
for login in logins:
    alias_to_login.setdefault(sanitize_for_alias(login), login)
logins = list(alias_to_login.values())

# A dictionary to store fetched locations, keyed by the real login
locations = {}

# Divide logins into batches and fetch data for each batch
for i in range(0, len(logins), BATCH_SIZE):
    logins_batch = logins[i:i + BATCH_SIZE]
    query = construct_query(logins_batch)
    data = fetch_data(query, github_token)

    # Collect results in the dictionary instead of touching the DataFrame per
    # batch; `data or {}` tolerates a response whose data payload is null.
    for alias, user_data in (data or {}).items():
        if user_data is not None:
            login = alias_to_login.get(alias, alias)
            locations[login] = user_data.get('location', None)

    # Pause between requests to stay clear of rate limits
    time.sleep(1)

# Attach the location to every actor dict for which one was fetched
df['actor'] = df['actor'].apply(
    lambda x: {**x, 'location': locations[x['login']]} if x['login'] in locations else x
)
df.head()

Unnamed: 0,id,type,actor,repo,payload,public,created_at,org
0,28137501092,PushEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 568277185, 'name': 'stdlib-js/strided-b...","{'repository_id': 568277185, 'push_id': 131547...",True,2023-04-01 15:00:00+00:00,"{'id': 17805691, 'login': 'stdlib-js', 'gravat..."
1,28137501094,CreateEvent,"{'id': 115239975, 'login': 'ishuduwal', 'displ...","{'id': 622248753, 'name': 'ishuduwal/personal-...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,
2,28137501097,CreateEvent,"{'id': 50960481, 'login': 'bxbao87', 'display_...","{'id': 622248756, 'name': 'bxbao87/bloglist', ...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,
3,28137501098,PushEvent,"{'id': 52915358, 'login': 'alwaz-shahid', 'dis...","{'id': 622201481, 'name': 'alwaz-shahid/extens...","{'repository_id': 622201481, 'push_id': 131547...",True,2023-04-01 15:00:00+00:00,
4,28137501099,CreateEvent,"{'id': 101326737, 'login': 'HOVADOVOLE', 'disp...","{'id': 622248605, 'name': 'HOVADOVOLE/Serial-P...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2023-04-01 15:00:00+00:00,


In [6]:
# Preview the first rows of the 'actor' column.
df["actor"].head()

0    {'id': 41898282, 'login': 'github-actions[bot]...
1    {'id': 115239975, 'login': 'ishuduwal', 'displ...
2    {'id': 50960481, 'login': 'bxbao87', 'display_...
3    {'id': 52915358, 'login': 'alwaz-shahid', 'dis...
4    {'id': 101326737, 'login': 'HOVADOVOLE', 'disp...
Name: actor, dtype: object

In [7]:
import pandas as pd
from fuzzywuzzy import process

# Lower-case names of major Italian cities and regions, plus the country name,
# used as indications of an Italian location
italian_keywords = [
    "rome", "roma", "milan", "milano", "naples", "napoli", "turin", "torino", "palermo",
    "genoa", "genova", "bologna", "florence", "firenze", "venice", "venezia", "verona",
    "cagliari", "parma", "ferrara", "treviso", "padua", "padova", "trieste", "taranto",
    "brescia", "prato", "modena", "reggio", "calabria", "emilia", "perugia", "livorno",
    "ravenna", "foggia", "rimini", "salerno", "sassari", "latina", "giugliano", "tuscany",
    "toscana", "sicily", "sicilia", "sardinia", "sardegna", "lombardy", "lombardia", "piedmont",
    "piemonte", "liguria", "umbria", "marche", "abruzzo", "italy", "italia"
    ]

def is_italian_location(location):
    """Decide whether a free-text profile location points to Italy.

    The text is lower-cased and split into words on every non-alphanumeric
    character. A location counts as Italian when one of its words equals a
    keyword, or is a close misspelling of one (fuzzy ratio above 80).

    Matching whole words instead of substrings avoids hits such as "Romania"
    (which contains "roma") or "Jerome" (which contains "rome").

    Parameters:
        location: The location value; anything that is not a string (``None``,
            NaN, numbers, ...) is treated as "no location".

    Returns:
        bool: ``True`` if the location looks Italian, ``False`` otherwise.
    """
    if not isinstance(location, str):
        return False

    words = ''.join(ch if ch.isalnum() else ' ' for ch in location.lower()).split()
    if not words:
        # Empty or punctuation-only text: nothing to match.
        return False

    keyword_set = set(italian_keywords)
    if any(word in keyword_set for word in words):
        return True

    # Imported here so the function is self-contained; only reached when no
    # exact word match was found.
    from fuzzywuzzy import fuzz

    # Compare word against keyword with the plain ratio. A partial-match scorer
    # on the whole string would score "romania" vs "roma" as a near-perfect hit.
    return any(
        fuzz.ratio(word, keyword) > 80
        for word in words
        for keyword in italian_keywords
    )

# Classify every event once, based on the location stored on its actor, and
# reuse the resulting boolean mask for both subsets below.
is_italian_mask = df['actor'].apply(lambda actor: is_italian_location(actor.get('location')))

# Events whose actor looks Italian, and the complement
italian_df = df.loc[is_italian_mask]
non_italian_df = df.loc[~is_italian_mask]

# Expand the actor dicts of the Italian events into one column per field
italian_actor_records = list(italian_df['actor'])
flattened_italian_actor_df = pd.json_normalize(italian_actor_records)

# Save the flattened actors, then preview them
flattened_italian_actor_df.to_csv('data/flattened_italian_actor_df.csv', index=False)
flattened_italian_actor_df.head()

Unnamed: 0,id,login,display_login,gravatar_id,url,avatar_url,location
0,16925025,maffo102,maffo102,,https://api.github.com/users/maffo102,https://avatars.githubusercontent.com/u/16925025?,Italy
1,30238962,merkleID,merkleID,,https://api.github.com/users/merkleID,https://avatars.githubusercontent.com/u/30238962?,milan
2,117077787,Nelexiad,Nelexiad,,https://api.github.com/users/Nelexiad,https://avatars.githubusercontent.com/u/117077...,Palermo
3,99094086,CrisLap,CrisLap,,https://api.github.com/users/CrisLap,https://avatars.githubusercontent.com/u/99094086?,"Verona, Italy"
4,84925446,davidetacchini,davidetacchini,,https://api.github.com/users/davidetacchini,https://avatars.githubusercontent.com/u/84925446?,Italy


-------

## 2. E-Mail ends with .it

In [18]:
# Assuming you have a dataframe df with a "payload" column
# Create a new dataframe with extracted commit information
def extract_commit_info(payload):
    """Summarise the first commit listed in an event payload.

    Parameters:
        payload (dict): Event payload; only payloads with a non-empty
            ``commits`` list produce a result.

    Returns:
        dict | None: Repository id, push id and the first commit's author
        e-mail, author name, message and SHA, or ``None`` when the payload has
        no commits.
    """
    commits = payload['commits'] if 'commits' in payload else None
    if not commits:
        return None

    head_commit = commits[0]
    author = head_commit['author']
    return {
        'repository_id': payload.get('repository_id'),
        'push_id': payload.get('push_id'),
        'author_email': author['email'],
        'author_name': author['name'],
        'commit_message': head_commit['message'],
        'commit_sha': head_commit['sha'],
    }

# Apply the function to the "payload" column to extract commit information;
# events without commits yield None.
df['commit_info'] = df['payload'].apply(extract_commit_info)

# Keep only the events that produced commit information
non_none_commit_info = df['commit_info'][df['commit_info'].notna()]

# Build the frame in one step from the list of dicts: `.apply(pd.Series)`
# creates a Series object per row and is very slow on large inputs. Naming the
# columns keeps the frame well-formed even when no event has any commits.
commit_columns = [
    'repository_id', 'push_id', 'author_email',
    'author_name', 'commit_message', 'commit_sha',
]
df_commit_info = pd.DataFrame(
    non_none_commit_info.tolist(),
    index=non_none_commit_info.index,
    columns=commit_columns,
)

# Remove rows with missing author or repository details
italian_emails = df_commit_info.dropna(subset=['author_email', 'author_name', 'repository_id'])

# Ensure email column is string type
italian_emails = italian_emails.assign(author_email=italian_emails['author_email'].astype(str))

# Filter for addresses under the .it top-level domain. Domain names are
# case-insensitive, so normalise case (and stray whitespace) before comparing.
is_it_address = italian_emails['author_email'].str.strip().str.lower().str.endswith('.it')
email_address_it = italian_emails[is_it_address]

email_address_it.to_csv('data/email_address_it.csv', index=False)


In [20]:
# Show the row count, columns and dtypes of the .it e-mail subset.
email_address_it.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154 entries, 1465 to 159465
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   repository_id   154 non-null    int64 
 1   push_id         154 non-null    int64 
 2   author_email    154 non-null    object
 3   author_name     154 non-null    object
 4   commit_message  154 non-null    object
 5   commit_sha      154 non-null    object
dtypes: int64(2), object(4)
memory usage: 8.4+ KB


## 3. Analyzing org descriptions
Use GraphQL to get more information at once

!pip install aiohttp
!pip install asyncio
!pip install nest_asyncio

In [54]:
## try 3
import aiohttp
import asyncio
import pandas as pd

# Logins of the organisations that appear in the events; rows without an org
# are ignored and each login is kept once.
org_logins = df['org'].dropna().apply(lambda org: org['login'])
unique_orgs = org_logins.unique()

# Batch size
BATCH_SIZE = 500

def construct_query(org_logins_batch):
    """Build one GraphQL document requesting login and description per org.

    Every organisation gets its own ``organization(login: ...)`` selection,
    aliased ``org0``, ``org1``, ... by its position in the batch.

    Parameters:
        org_logins_batch (iterable of str): Organisation logins to query.

    Returns:
        str: The query text, wrapped in a single pair of braces.
    """
    fragments = []
    for index, org_login in enumerate(org_logins_batch):
        # Written with explicit escapes so the emitted whitespace does not
        # depend on how this function is indented.
        fragments.append(
            f'\n        org{index}: organization(login: "{org_login}") {{'
            '\n            login'
            '\n            description'
            '\n        }'
            '\n        '
        )
    return '{' + ' '.join(fragments) + '}'

async def fetch_data(query, session):
    """POST a GraphQL query through ``session`` and return its ``data`` payload.

    The bearer token is read from the module-level ``github_token``.

    Parameters:
        query (str): The GraphQL document to execute.
        session: An aiohttp client session used to send the request.

    Returns:
        dict: The ``data`` object of the response, or an empty dict when the
        response has no ``data`` key or its value is null.
    """
    headers = {
        'Authorization': f'bearer {github_token}',
        'Content-Type': 'application/json'
    }
    async with session.post('https://api.github.com/graphql', json={'query': query}, headers=headers) as response:
        response_json = await response.json()
        if 'errors' in response_json:
            # The branch needs a real statement (a lone comment is a syntax
            # error). Print a one-line summary rather than the full error
            # list, then continue with whatever data was returned.
            print(f"Warning: GraphQL response contained {len(response_json['errors'])} error(s)")
        # `data` may be present but null; always hand the caller a dict so it
        # can iterate over the result.
        return response_json.get('data') or {}

async def fetch_org_data(org_logins_batch, session):
    """Query one batch of organisation logins and return the fetched data.

    Parameters:
        org_logins_batch: Organisation logins to put into a single query.
        session: Client session handed through to ``fetch_data``.

    Returns:
        Whatever ``fetch_data`` returns for the query built from the batch.
    """
    return await fetch_data(construct_query(org_logins_batch), session)

async def main():
    """Fetch login and description for every organisation in ``unique_orgs``.

    The logins are processed in slices of ``BATCH_SIZE``, one request after the
    other, over a single aiohttp session. Entries for which nothing was
    returned are skipped.

    Returns:
        pandas.DataFrame: One row per organisation with the columns ``login``
        and ``description``.
    """
    rows = []

    async with aiohttp.ClientSession() as session:
        for offset in range(0, len(unique_orgs), BATCH_SIZE):
            batch = unique_orgs[offset:offset + BATCH_SIZE]
            batch_data = await fetch_org_data(batch, session)
            rows.extend(
                [org['login'], org['description']]
                for org in batch_data.values()
                if org
            )

    return pd.DataFrame(rows, columns=['login', 'description'])

# Run the coroutine and assign the resulting DataFrame to a variable.
# asyncio.run() refuses to start when an event loop is already running in the
# current thread, which is the case inside a Jupyter kernel. A worker thread
# has no running loop, so running it there works in notebooks and scripts.
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=1) as _executor:
    # .result() blocks until main() has finished and re-raises any exception.
    df_orgs = _executor.submit(lambda: asyncio.run(main())).result()




In [71]:
# Save the organisation table to the data folder (without the index column),
# then preview its first rows.
df_orgs.to_csv('data/df_orgs.csv', index=False)

df_orgs.head()

Unnamed: 0,login,description,language
0,stdlib-js,Standard library for JavaScript.,en
1,Lombiq,A software and services company focusing on we...,en
2,CollaboraOnline,"Home of Collabora Online, the cloud-based offi...",en
3,nextcloud,📱☁️💻 A safe home for all your data – community...,en
4,tf2pickup-org,Team Fortress 2 pick-up games for everyone,en


### Lingua
https://github.com/pemistahl/lingua-py

In [56]:
from lingua import Language, LanguageDetectorBuilder
import pandas as pd

# Work on a copy so df_orgs itself stays unchanged
df_lingua = df_orgs.copy()

# Languages the detector chooses between
languages = [Language.ENGLISH, Language.FRENCH, Language.GERMAN, Language.SPANISH, Language.ITALIAN]

# Build the language detector
detector = LanguageDetectorBuilder.from_languages(*languages).build()

def detect_language(text):
    """Return the name of the language lingua detects in ``text``.

    Parameters:
        text: The description to classify. Values that are not strings (for
            example missing descriptions) and blank strings are not passed to
            the detector.

    Returns:
        str: The detected language's ``name`` (e.g. ``"ITALIAN"``), or
        ``"unknown"`` when there is no usable text or no language is detected.
    """
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    detected = detector.detect_language_of(text)
    # Handle the "nothing detected" result explicitly instead of relying on a
    # bare except to absorb the attribute error on None.
    return detected.name if detected is not None else "unknown"

# Apply the function to the 'description' column and store the result in a new column 'language'
df_lingua['language'] = df_lingua['description'].apply(detect_language)

# Display the rows classified as Italian
df_lingua[df_lingua["language"]=="ITALIAN"].head()


Unnamed: 0,login,description,language
390,xDroidOSS-Pixel,Credit : xdroidOSS (mnmlist),ITALIAN
403,polito-WA1-AW1-2023,Courses at Politecnico di Torino - Academic Ye...,ITALIAN
703,MOVIECORD,🎬 Movie | Film Festival | OTT,ITALIAN
1043,realme-mt6781-dev,AOSP Sources For Realme 8i/Narzo 50,ITALIAN
1054,ipfs,A peer-to-peer hypermedia protocol,ITALIAN


### Langdetect

In [57]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is randomised: the same short text can be classified differently
# from run to run. Fixing the seed makes the results reproducible.
DetectorFactory.seed = 0

# Work on a copy so df_orgs itself stays unchanged
df_langdetect = df_orgs.copy()

def detect_language(text):
    """Return the language code langdetect assigns to ``text``.

    Parameters:
        text: The description to classify; values that are not strings (for
            example missing descriptions) are not passed to the detector.

    Returns:
        str: The detected language code (e.g. ``"it"``), or ``"unknown"`` when
        there is no text or langdetect cannot classify it.
    """
    if not isinstance(text, str):
        return "unknown"
    try:
        return detect(text)
    except LangDetectException:
        # Raised when the text has no usable features, e.g. only digits or
        # symbols. Other errors are left to surface.
        return "unknown"

# Apply the function to the 'description' column and store the result in a new column 'language'
df_langdetect['language'] = df_langdetect['description'].apply(detect_language)

# Display the rows classified as Italian
df_langdetect[df_langdetect["language"] == "it"]

Unnamed: 0,login,description,language
241,TYPO3-Documentation,Official TYPO3 Documentation,it
378,project-violet,Violation Violet,it
616,PositiveSumNet,non-commercial use,it
703,MOVIECORD,🎬 Movie | Film Festival | OTT,it
715,nomic-ai,democratizing access to powerful artificial in...,it
1378,RR0,UFO data since 1998,it
1455,CreaMate-Consulting,smart digital solutions,it
1662,JuliaPlots,Data visualization in Julia,it
1833,theia-ailabs,AI Voice Assistant,it
1958,panoramicdata,Panoramic Data Limited,it


#### Spacy

In [68]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
import pandas as pd

# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

@Language.factory("language_detector")
def create_language_detector(nlp, name):
    """Factory that spaCy calls to create the language-detection component."""
    return LanguageDetector()

# Add the language detector to the pipeline
nlp.add_pipe('language_detector', last=True)

# A copy of your DataFrame to work with
df_spacy = df_orgs.copy()

def detect_language(text):
    """Return the language code the spaCy pipeline assigns to ``text``.

    Parameters:
        text: The description to classify; values that are not strings (for
            example missing descriptions) are not passed to the pipeline.

    Returns:
        str: The ``'language'`` entry of ``doc._.language``, or ``"unknown"``
        when there is no text or the pipeline raises an error.
    """
    if not isinstance(text, str):
        return "unknown"
    try:
        doc = nlp(text)
        return doc._.language['language']
    except Exception:
        # Stay best-effort for pipeline errors, but unlike a bare `except:` let
        # KeyboardInterrupt through so a long run can still be stopped.
        return "unknown"

# Apply the function to the 'description' column and store the result in a new column 'language'
df_spacy['language'] = df_spacy['description'].apply(detect_language)

# Display the DataFrame with Italian descriptions
print(df_spacy[df_spacy["language"] == "it"])


                      login  \
241     TYPO3-Documentation   
296                  juspay   
378          project-violet   
616          PositiveSumNet   
715                nomic-ai   
1378                    RR0   
1455    CreaMate-Consulting   
1662             JuliaPlots   
1833           theia-ailabs   
1958          panoramicdata   
2144    dripnillbyteskidrip   
2966                  bokeh   
3025         EazyAutodelete   
3355  softeng2223-inf-uniba   
3850        sangria-graphql   
4024              useVenice   
4324                CMU-HKN   
4559                rhasspy   
4748                 FNNDSC   
5614               TdP-2023   
5646         mozilla-mobile   
6279                   piql   
6409        Chaox-Community   
6468        DevStyleDigital   
6543               PizzaDAO   
6705           vueComponent   
6840                  ettle   
6953                 toml-f   

                                            description language  
241                        Offici

In [70]:
# Preview the first rows that the spaCy pipeline labelled as Italian ("it").
df_spacy[df_spacy["language"] == "it"].head()

Unnamed: 0,login,description,language
241,TYPO3-Documentation,Official TYPO3 Documentation,it
296,juspay,Design to simplify. Revolutionizing digital p...,it
378,project-violet,Violation Violet,it
616,PositiveSumNet,non-commercial use,it
715,nomic-ai,democratizing access to powerful artificial in...,it


## 7. Italian Open Source Licences
Problem: the authors are Italian, but they may not be living in Italy!

In [19]:
import requests
import re
import os

def _extract_table_names(readme_content):
    """Return the first table cell (link text or plain text) of each line."""
    name_pattern = re.compile(r'\|\s*(?:\[(?P<link_name>.+?)\]\(.+?\)|(?P<plain_name>[^\|\[\]]+))\s*\|')
    # Header labels and separator rows that are not project names
    ignored = ["name", "----", "stack", "description"]
    names = []
    for line in readme_content.split('\n'):
        match = name_pattern.search(line)
        if match:
            name = (match.group('link_name') or match.group('plain_name')).strip()
            if name and name.lower() not in ignored:
                names.append(name)
    return names


def get_and_process_github_readme(url, filename, token, timeout=30):
    """
    Download the README.md from GitHub, save it, extract and return the names of repositories.

    Parameters:
        url (str): The URL to the raw README.md on GitHub.
        filename (str): The name to save the README.md as. Nothing is written
            when this is empty or None.
        token (str): Your GitHub API token.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list: Extracted repository names.

    Raises:
        ConnectionError: If the server answers with a status other than 200.
    """
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3.raw"
    }

    # Step 1: Download the README.md from GitHub
    response = requests.get(url, headers=headers, timeout=timeout)

    # Ensure the request was successful
    if response.status_code != 200:
        raise ConnectionError(f"Failed to fetch README.md, status code: {response.status_code}")
    readme_content = response.text

    # Step 2: Save the downloaded text, as promised by the docstring and the
    # `filename` parameter.
    if filename:
        with open(filename, 'w', encoding='utf-8') as readme_file:
            readme_file.write(readme_content)

    # Step 3: Extract the repository names from the markdown table rows
    return _extract_table_names(readme_content)

# URL to the raw README.md on GitHub
readme_url = "https://github.com/italia-opensource/awesome-italia-opensource/raw/main/awesome/opensource/README.md"

# Filename to save the README.md as
save_filename = "github_readme.md"

# Execute the function and display the first 10 repository names.
# The [1:] slice drops the first extracted name.
# NOTE(review): presumably that first entry is a table row that is not a
# project — confirm against the README.
Italian_OS_projects = get_and_process_github_readme(readme_url, save_filename, github_token)[1:]
Italian_OS_projects[:10]

['2Ami',
 'Apivault',
 'Arduino',
 'Arduino Desk Weatherstation',
 'Argon',
 'Autocannon',
 'Awesome Italia Open Source',
 'Bootstrap Italia',
 'Breathly',
 'Cache Candidate']

## Extras

### downloading data for multiple days/hours

In [None]:
import os
import requests
import gzip
import pandas as pd
import logging
from typing import List, Union

# Configure logging
logging.basicConfig(level=logging.INFO)

BASE_URL = "https://data.gharchive.org"

def ensure_directory_exists(dir_name: str) -> None:
    """Create ``dir_name`` (including missing parents) unless it already exists.

    Parameters:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir_name, exist_ok=True)

def download_file(url: str, file_path: str, timeout: float = 60,
                  chunk_size: int = 1024 * 1024) -> None:
    """Download ``url`` and save it to ``file_path``.

    The body is streamed into ``file_path + ".part"`` and moved to
    ``file_path`` only after the transfer has completed, so a failed download
    never leaves a truncated file under the final name.

    Parameters:
        url: Address of the file to fetch.
        file_path: Destination path.
        timeout: Seconds to wait for the server before giving up.
        chunk_size: Number of bytes read per iteration while streaming.

    Returns:
        None. Non-200 responses and request errors are logged, not raised.
    """
    partial_path = file_path + ".part"
    try:
        # The context manager releases the connection when the block ends.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                logging.warning(f"File not found: {url} (HTTP status {response.status_code})")
                return
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        # Publish the complete file under its final name.
        os.replace(partial_path, file_path)
    except requests.RequestException as e:
        logging.error(f"Failed to fetch {url}: {str(e)}")
        # Remove whatever was written before the connection failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

def load_or_fetch_data(file_path: str, url: str) -> Union[pd.DataFrame, None]:
    """Return the events stored at ``file_path``, downloading the file first if needed.

    Parameters:
        file_path: Local path of the gzipped JSON-lines file.
        url: Address to download the file from when it is not present locally.

    Returns:
        The file's contents as a DataFrame, or ``None`` when the file is still
        missing after the download attempt.
    """
    if os.path.exists(file_path):
        logging.info(f"File {file_path} already exists. Loading data.")
    else:
        logging.info(f"File {file_path} not exists. Downloading from {url}")
        download_file(url, file_path)
        if not os.path.exists(file_path):
            # The download did not produce a file; there is nothing to load.
            return None

    # Single read path shared by the cached and the freshly downloaded case.
    with gzip.open(file_path, 'rt', encoding='utf-8') as gz:
        return pd.read_json(gz, lines=True)

def download_gh_archive(start_day: int, end_day: int,
                        start_hour: int=0, end_hour: int=23,
                        base_url: str=BASE_URL, data_dir: str='data',
                        year: int=2023, month: int=4) -> pd.DataFrame:
    """Download GitHub archive data for specified days and hours.

    Every hour from ``start_hour`` to ``end_hour`` (inclusive) is loaded for
    every day from ``start_day`` to ``end_day`` (inclusive). The hour window is
    applied to each day separately; it is not one continuous time span.

    Parameters:
        start_day: First day of the month to load.
        end_day: Last day of the month to load.
        start_hour: First hour of each day to load.
        end_hour: Last hour of each day to load.
        base_url: Address the archive files are served from.
        data_dir: Folder used to cache the downloaded files.
        year: Year of the archive files (defaults to 2023).
        month: Month of the archive files (defaults to April).

    Returns:
        All loaded hours concatenated into one DataFrame, or an empty
        DataFrame when nothing could be loaded.
    """
    dfs = []
    ensure_directory_exists(data_dir)

    for day in range(start_day, end_day + 1):
        for hour in range(start_hour, end_hour + 1):
            # Day and month are zero-padded in the file name, the hour is not.
            filename = f"{year:04d}-{month:02d}-{day:02d}-{hour}.json.gz"
            url = f"{base_url}/{filename}"
            file_path = os.path.join(data_dir, filename)

            hourly_df = load_or_fetch_data(file_path, url)
            if hourly_df is not None:
                dfs.append(hourly_df)

    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()


In [None]:
def _day_and_hour(stamp):
    """Return (day, hour) as ints from a "YYYY-MM-DD HH" string."""
    fields = stamp.split()
    return int(fields[0].split("-")[2]), int(fields[1])

# Start the stopwatch for the whole load
start_time = time.time()

# Time window to load. Only the day and the hour of these strings are used;
# the year and month parts are not passed on.
start_date_time = "2023-04-01 10"  # Format: "YYYY-MM-DD HH"
end_date_time = "2023-04-01 12"    # Format: "YYYY-MM-DD HH"

start_day, start_hour = _day_and_hour(start_date_time)
end_day, end_hour = _day_and_hour(end_date_time)

# Download data for the specified time frame
df_big = download_gh_archive(start_day, end_day, start_hour, end_hour)

# Stop the stopwatch and work out how long the load took
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Data from {start_date_time} to {end_date_time} loaded into DataFrame!")
print(f"Time taken: {elapsed_time:.2f} seconds")

### Getting rid of GitHub bot commits

In [None]:
# Flag the events whose actor login ends with the "[bot]" suffix, keep the rest
# and renumber the remaining rows from 0.
is_bot = df['actor'].apply(lambda actor: actor.get('login').endswith('[bot]'))
filtered_df = df[~is_bot].reset_index(drop=True)
filtered_df.head()