In [None]:
%pip install python-dotenv
%pip install requests
%pip install gql
%pip install requests-toolbelt
%pip install pandas
%pip install fasttext
%pip install numpy
%pip install scikit-learn
%pip install matplotlib
%pip install seaborn
%pip install nltk
%pip install fpdf

In [2]:
import os
from dotenv import load_dotenv
from gql import Client, gql
from gql.transport.requests import RequestsHTTPTransport
import re
import pandas as pd
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt 
import numpy as np
import random
import time
from gensim.models import KeyedVectors
from fpdf import FPDF

In [3]:
#carrega a chave da API
load_dotenv("../.env")
token = os.getenv('GH_TOKEN')
if token is None:
    raise ValueError("GitHub token is not set. Check your .env file.")

In [4]:
# load the Full taxonomy

# TO DO: load the full taxonomy from the csv file
# TO DO: create a dictionary with the taxonomy in the pdf

# example taxonomy
taxonomy = {
    'security': ['security', 'vulnerability', 'cve', 'exploit', 'threat', 'attack', 'defense', 'secure', 'risk', 'breach', 'privacy', 'protection', 'cybersecurity', 'secure', 'securely', 'secureness', 'securest', 'securement', 'securements', 'securely' ],
    'privacy': ['privacy', 'private', 'personal', 'data', 'pii', 'gdpr', 'ccpa'],
    'compliance': ['compliance', 'regulation', 'regulatory', 'law', 'legal', 'audit', 'certification', 'certify', 'certified', 'certifies', 'certifying', 'certifications', 'certificating', 'certificated'],
    'ethical': ['ethical', 'ethics', 'moral', 'morality', 'unethical', 'immoral', 'amoral', 'unethically', 'immorally', 'amorally', 'ethically', 'morally', 'ethics', 'morals', 'ethicality', 'ethicalness', 'ethic', 'ethics', 'ethicist', 'ethicists', 'ethicize', 'ethicized', 'ethicizes', 'ethicizing', 'ethicise', 'ethicised', 'ethicises', 'ethicising', 'ethicizable', 'ethicization', 'ethicizations', 'ethicize', 'ethicized', 'ethicizes', 'ethicizing', 'ethicise', 'ethicised', 'ethicises', 'ethicising', 'ethicizable', 'ethicization', 'ethicizations'],
}

In [5]:
# Initialize the transport and client
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

transport = RequestsHTTPTransport(
    url='https://api.github.com/graphql',
    headers={'Authorization': f'token {token}'}
)
client = Client(transport=transport, fetch_schema_from_transport=False)

# Query generator
def query_generator(name, owner, first=100, after=None):
    """
    Generate a GraphQL query for fetching issues from a GitHub repository.

    Parameters:
    name (str): The name of the repository.
    owner (str): The owner of the repository.
    first (int): Number of issues to fetch per page. Default is 100.
    after (str): Cursor for pagination. Default is None.

    Returns:
    gql: A GraphQL query object.
    """
    query = gql('''
    query GetRepositoryInfo($name: String!, $owner: String!, $first: Int!, $after: String) {
        repository(owner: $owner, name: $name) {
            name
            description
            issues(first: $first, after: $after) {
                edges {
                    node {
                        body
                        createdAt
                    }
                }
                pageInfo {
                    hasNextPage
                    endCursor
                }
            }
        }
    }
    ''')
    return query


In [6]:
# Fetch the issues from a repo and generate a data matrix
def fetch_issues(query_issues):
    data_matrix = []  # [issue body, createdAt]

    # Execute the queries with pagination using variables
    try:
        issue_cursor = None 

        # Pagination loop for issues
        while True:
            # Execute the query for issues
            issues = client.execute(query_issues, variable_values={'issueCursor': issue_cursor})

            # Process the issues
            for issue in issues['repository']['issues']['edges']:
                issue_node = issue['node']
                body_text = issue_node['body']
                created_at = issue_node['createdAt']

                if body_text:  # Only add issues that have a body
                    data_matrix.append([body_text, created_at])

            # Check if there are more pages to fetch
            issue_page_info = issues['repository']['issues']['pageInfo']
            if not issue_page_info['hasNextPage']:
                print("No more issues pages to fetch.")
                break

            # Get the cursor to fetch the next page of issues
            issue_cursor = issue_page_info['endCursor']
            
    except Exception as e:
        print(f"Error executing query: {e}")
        return None

    # Output the collected issue data
    print(f"Total issues collected: {len(data_matrix)}")
    return data_matrix


In [8]:
from gensim.models.fasttext import load_facebook_vectors
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')

# Load pre-trained FastText model
fasttext_model = load_facebook_vectors("cc.en.300.bin")  # Replace with the correct path to your FastText model

# Function to get the average word vector for a phrase
def get_average_word_vector(phrase, model):
    words = word_tokenize(phrase.lower())
    word_vectors = []

    # Use model.get_vector to get a vector for any word (in or out of vocabulary)
    for word in words:
        try:
            word_vectors.append(model.get_vector(word))
        except KeyError:
            # FastText rarely raises KeyError because it can infer vectors for OOV words
            continue

    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

# Function to check if a text matches any category in the taxonomy
def isInContext(text, taxonomy, threshold=0.4):
    # Get the average word vector for the text
    text_vector = get_average_word_vector(text, fasttext_model) 

    for category, keywords in taxonomy.items():
        # Get the average word vector for all keywords in the category
        category_vectors = [get_average_word_vector(keyword, fasttext_model) for keyword in keywords]
        category_mean_vector = np.mean(category_vectors, axis=0)

        # Compute cosine similarity between the text vector and the category vector
        similarity = cosine_similarity([text_vector], [category_mean_vector])[0][0]

        # If similarity exceeds threshold, consider the text in context
        if similarity > threshold:
            return True, category  # Return True and the matched category (can be used for further processing)

    return False, None  # Return False if no category matches


ModuleNotFoundError: No module named 'fasttext'

In [None]:
# Function to extract messages from the issues result
def extractMessagesFromIssues(issue_result):
    messages_matrix = []

    # Get the 'edges' list from the issues result
    edges = issue_result['repository']['issues']['edges']

    # Loop through the edges to extract the body and createdAt
    for edge in edges:
        node = edge['node']
        body = node.get('body', '').strip()
        created_at = node.get('createdAt', '') 

        # Only append if the body has content (non-empty string)
        if body:
            messages_matrix.append([body, created_at])
    print(f"Total messages extracted: {len(messages_matrix)}")
    return messages_matrix

In [None]:
# Generate a number of messages graph from the messagesInContext list
def getMessagesPerYearGraph(messagesInContext):
        
    # Criar um DataFrame a partir das mensagens no contexto
    df = pd.DataFrame(messagesInContext, columns=['Message', 'CreatedAt'])

    # Converter a coluna 'CreatedAt' para o tipo datetime
    df['CreatedAt'] = pd.to_datetime(df['CreatedAt'])

    # Extrair o ano da coluna 'CreatedAt'
    df['Year'] = df['CreatedAt'].dt.year  # Usando .dt.year para obter o ano diretamente

    # Agrupar o DataFrame por 'Year' e contar o número de mensagens
    yearly_messages = df.groupby('Year').size()

    # Plotar o número de mensagens por ano
    ax = yearly_messages.plot(kind='bar', title='Número de Mensagens no Contexto por Ano')
    ax.set_xlabel('Ano')
    ax.set_ylabel('Número de Mensagens')

    # Rotacionar os rótulos do eixo X na vertical
    plt.xticks(rotation=90)

    return ax

In [None]:
# Generate a percent of messages graph from the messagesInContext
def getMessagesPerYearGraphPercent(messagesInContext, data_matrix):

    # Criar um DataFrame a partir das mensagens no contexto
    df_context = pd.DataFrame(messagesInContext, columns=['Message', 'CreatedAt'])

    # Criar um DataFrame a partir do total de mensagens
    df_total = pd.DataFrame(data_matrix, columns=['Message', 'CreatedAt'])

    # Converter a coluna 'CreatedAt' para o tipo datetime para ambos os DataFrames
    df_context['CreatedAt'] = pd.to_datetime(df_context['CreatedAt'])
    df_total['CreatedAt'] = pd.to_datetime(df_total['CreatedAt'])

    # Extrair o ano da coluna 'CreatedAt'
    df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
    df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')

    # Agrupar os DataFrames por 'Year' e contar o número de mensagens
    yearly_context_messages = df_context.groupby('Year').size()
    yearly_total_messages = df_total.groupby('Year').size()

    # Calcular a porcentagem de mensagens no contexto em relação ao total por ano
    yearly_percentage = (yearly_context_messages / yearly_total_messages) * 100

    # Plotar a porcentagem de mensagens por ano
    ax = yearly_percentage.plot(kind='bar', title='Porcentagem de Mensagens no Contexto por Ano')
    ax.set_xlabel('Ano')
    ax.set_ylabel('Porcentagem de Mensagens (%)')

    # Rotacionar os rótulos do eixo X na vertical
    plt.xticks(rotation=90)

    return ax


In [None]:
# Generate a sample of messages graph from the messagesInContext
def sampleGenerator(messagesInContext):
    # Definir o tamanho da amostra
    sample_size = 2
    
    # Get 2 random strings 
    sample = random.sample(messagesInContext, sample_size)
    
    semple_string= f'Sample of {sample_size} messages in context:\n' + sample[0][0] + '\n' + sample[1][0]
    
    return semple_string

In [None]:
from fpdf import FPDF

class PDF(FPDF):
    def __init__(self):
        super().__init__()
        self.set_auto_page_break(auto=True, margin=15)
        self.add_font("DejaVu", fname="DejaVuSans.ttf", uni=True)
        self.set_font("DejaVu", size=12)

    @staticmethod
    def sanitize_text(text):
        """Sanitize text to remove unsupported characters."""
        return ''.join(c if ord(c) < 128 else '?' for c in text)

    def add_repository_page(self, name, owner, graph_mpy_path, graph_mpyp_path, sample_text):
        # Add a new page for the repository
        self.add_page()
        
        # Sanitize input text
        name = self.sanitize_text(name)
        owner = self.sanitize_text(owner)
        sample_text = self.sanitize_text(sample_text)
        
        # Title
        self.cell(0, 10, f"Repository: {name} (Owner: {owner})", ln=True, align='C')
        self.ln(10)
        
        # Add graphs
        try:
            self.image(graph_mpy_path, x=10, y=self.get_y(), w=90)
            self.set_y(self.get_y() + 90)
        except RuntimeError as e:
            self.cell(0, 10, "Error loading graph_mpy image.", ln=True)
        
        try:
            self.image(graph_mpyp_path, x=10, y=self.get_y(), w=90)
            self.set_y(self.get_y() + 90)
        except RuntimeError as e:
            self.cell(0, 10, "Error loading graph_mpyp image.", ln=True)
        
        # Add the sample text
        self.ln(10)
        self.multi_cell(0, 10, f"Sample:\n{sample_text}")


In [None]:
import pandas as pd
import time
import random  # Optional: for simulating occasional failures during testing
import matplotlib.pyplot as plt

# Retry logic function
def execute_with_retries(client, query, variables, max_retries=5, delay=2):
    """
    Executes a GraphQL query with retry logic.
    
    Parameters:
        client (object): The GraphQL client.
        query (str): The GraphQL query string.
        variables (dict): The query variables.
        max_retries (int): Maximum number of retries.
        delay (int): Delay in seconds between retries.
        
    Returns:
        dict: The result of the GraphQL query.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            # Execute the query
            return client.execute(query, variable_values=variables)
        except ConnectionError as e:
            attempt += 1
            print(f"Connection error occurred: {e}. Attempt {attempt}/{max_retries}. Retrying in {delay} seconds...")
            time.sleep(delay)
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            raise  # Re-raise non-connection-related exceptions
    print(f"Failed to execute query after {max_retries} retries.")
    raise ConnectionError("Max retries exceeded.")

# Load the filtered repositories data
df = pd.read_csv('filtered_repositories_over_200_issues.csv')
print(df.head())

# Initialize the PDF
pdf = PDF()

# Iterate through each repository in the dataset
for index, row in df.iterrows():
    # Extract repository details
    name = row['repo']
    owner = row['owner']
    
    print(f"Processing repository: {name} (Owner: {owner})")
    issue_cursor = None
    data_matrix = []  # [issue body, createdAt]

    # Fetch issues with retry logic
    while True:
        query = query_generator(name, owner, first=100, after=issue_cursor)
        variables = {'name': name, 'owner': owner, 'first': 100, 'after': issue_cursor}
        
        try:
            issues = execute_with_retries(client, query, variables)
        except ConnectionError:
            print(f"Failed to fetch issues for repository {name}. Moving to next repository.")
            break  # Exit the loop for this repository
        
        # Process the issues
        for issue in issues['repository']['issues']['edges']:
            issue_node = issue['node']
            body_text = issue_node['body']
            created_at = issue_node['createdAt']
            if body_text:
                data_matrix.append([body_text, created_at])
        
        # Check if there are more pages to fetch
        issue_page_info = issues['repository']['issues']['pageInfo']
        if not issue_page_info['hasNextPage']:
            print("No more issues pages to fetch.")
            break
        
        # Get the cursor for the next page
        issue_cursor = issue_page_info['endCursor']
    
    print(f"Data size: {len(data_matrix)}")
    
    # Filter messages based on context using the taxonomy
    messages_in_context = []
    for message in data_matrix:
        text = message[0]
        in_context, category = isInContext(text, taxonomy)
        if in_context:
            messages_in_context.append([text, message[1]])
    
    print(f"Messages in context: {len(messages_in_context)}")
    
    # Generate and save visualizations
    if messages_in_context:
        # Messages per year graph
        graph_mpy_path = f"graphs/{name}_messages_per_year.png"
        graph_mpy_ax = getMessagesPerYearGraph(messages_in_context)
        plt.savefig(graph_mpy_path)
        plt.close(graph_mpy_ax.figure)
        
        # Messages per year percentage graph
        graph_mpyp_path = f"graphs/{name}_messages_per_year_percent.png"
        graph_mpyp_ax = getMessagesPerYearGraphPercent(messages_in_context, data_matrix)
        plt.savefig(graph_mpyp_path)
        plt.close(graph_mpyp_ax.figure)
        
        # Generate the sample text
        sample_text = sampleGenerator(messages_in_context)
    else:
        # Handle cases with no relevant messages
        erro_msg = "No messages in context" if data_matrix else "No issues found"
        
        # Generate empty graphs with an appropriate title
        graph_mpy_path = f"graphs/{name}_messages_per_year.png"
        plt.figure()
        plt.title(erro_msg)
        plt.savefig(graph_mpy_path)
        plt.close()
        
        graph_mpyp_path = f"graphs/{name}_messages_per_year_percent.png"
        plt.figure()
        plt.title(erro_msg)
        plt.savefig(graph_mpyp_path)
        plt.close()
        
        sample_text = erro_msg
    
    # Add repository data to the PDF
    pdf.add_repository_page(name, owner, graph_mpy_path, graph_mpyp_path, sample_text)
    
    # Pause briefly to prevent API rate limits or overloading
    time.sleep(1)

# Save the PDF report
output_pdf_path = "full_report.pdf"
pdf.output(output_pdf_path)
print(f"PDF report saved to {output_pdf_path}")


        name symbol       owner          repo  total_issues
0  Avalanche   AVAX    ava-labs   avalanchego           244
1       Beam   BEAM      BeamMW          beam           237
2    Bitcoin    BTC     bitcoin       bitcoin           653
3   Dogecoin   DOGE    dogecoin      dogecoin           227
4   Polkadot    DOT  paritytech  polkadot-sdk          1784
Processing repository: avalanchego (Owner: ava-labs)
No more issues pages to fetch.
Data size: 771
Messages in context: 512


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: beam (Owner: BeamMW)
No more issues pages to fetch.
Data size: 1610
Messages in context: 965


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: bitcoin (Owner: bitcoin)
No more issues pages to fetch.
Data size: 8115
Messages in context: 4579


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: dogecoin (Owner: dogecoin)
No more issues pages to fetch.
Data size: 1240
Messages in context: 703


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: polkadot-sdk (Owner: paritytech)
No more issues pages to fetch.
Data size: 2581
Messages in context: 1804


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: go-ethereum (Owner: ethereum)
No more issues pages to fetch.
Data size: 8035
Messages in context: 3781


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: ic (Owner: dfinity)
No more issues pages to fetch.
Data size: 0
Messages in context: 0
Processing repository: chainlink (Owner: smartcontractkit)
No more issues pages to fetch.
Data size: 428
Messages in context: 321


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: neo (Owner: neo-project)
No more issues pages to fetch.
Data size: 1409
Messages in context: 874


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: stellar-core (Owner: stellar)
No more issues pages to fetch.
Data size: 1635
Messages in context: 986


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: monero (Owner: monero-project)
No more issues pages to fetch.
Data size: 3066
Messages in context: 1706


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: rippled (Owner: ripple)
No more issues pages to fetch.
Data size: 1384
Messages in context: 869


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: metamask-extension (Owner: MetaMask)
No more issues pages to fetch.
Data size: 11278
Messages in context: 6304


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: mina (Owner: MinaProtocol)
No more issues pages to fetch.
Data size: 4462
Messages in context: 2454


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: osmosis (Owner: osmosis-labs)
No more issues pages to fetch.
Data size: 2160
Messages in context: 695


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: oasis-core (Owner: oasisprotocol)
No more issues pages to fetch.
Data size: 1814
Messages in context: 1248


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: zcash (Owner: zcash)
No more issues pages to fetch.
Data size: 3576
Messages in context: 2010


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: storj (Owner: storj)
No more issues pages to fetch.
Data size: 2593
Messages in context: 1856


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: chia-blockchain (Owner: Chia-Network)
No more issues pages to fetch.
Data size: 4911
Messages in context: 2049


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: eos (Owner: EOSIO)
No more issues pages to fetch.
Data size: 4848
Messages in context: 2574


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: cowswap (Owner: gnosis)
No more issues pages to fetch.
Data size: 1224
Messages in context: 730


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: mobilecoin (Owner: mobilecoinfoundation)
No more issues pages to fetch.
Data size: 402
Messages in context: 235


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: steem (Owner: steemit)
No more issues pages to fetch.
Data size: 2129
Messages in context: 1007


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: steem (Owner: steemit)
No more issues pages to fetch.
Data size: 2129
Messages in context: 1007


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: open-chat (Owner: open-chat-labs)
No more issues pages to fetch.
Data size: 533
Messages in context: 165


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: origin-dollar (Owner: OriginProtocol)
No more issues pages to fetch.
Data size: 707
Messages in context: 453


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: ic (Owner: dfinity)
No more issues pages to fetch.
Data size: 0
Messages in context: 0
Processing repository: red (Owner: red)
No more issues pages to fetch.
Data size: 4002
Messages in context: 439


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: go-ethereum (Owner: ethereum)
No more issues pages to fetch.
Data size: 8035
Messages in context: 3781


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: agoric-sdk (Owner: Agoric)
No more issues pages to fetch.
Data size: 4371
Messages in context: 3000


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


Processing repository: holochain-rust (Owner: holochain)
No more issues pages to fetch.
Data size: 488
Messages in context: 211


  df_context['Year'] = df_context['CreatedAt'].dt.to_period('Y')
  df_total['Year'] = df_total['CreatedAt'].dt.to_period('Y')


PDF report saved to full_report.pdf
