This is the 2nd Lab

In [None]:
import csv
import datetime
import os
import subprocess
import time
import pandas as pd
import requests
import matplotlib.pyplot as plt
import numpy as np

token = "token"

def get_popular_java_repos(total_repos):
    url = "https://api.github.com/graphql"
    headers = {"Authorization": f"Bearer {token}"}
    all_repos = []
    end_cursor = None
    max_retries = 5
    retry_delay = 5

    while len(all_repos) < total_repos:
        remaining_repos = total_repos - len(all_repos)
        first = min(remaining_repos, 10)

        query = f"""
        {{
            search(query: "language:Java stars:>1", type: REPOSITORY, first: {first}, after: {f'"{end_cursor}"' if end_cursor else 'null'}) {{
                edges {{
                    node {{
                        ... on Repository {{
                          name
                          url
                          stargazers {{
                            totalCount
                          }}
                          owner {{
                            login
                          }}
                          createdAt
                          pushedAt
                          releases {{
                            totalCount
                          }}
                        }}
                    }}
                }}
                pageInfo {{
                    endCursor
                    hasNextPage
                }}
            }}
        }}
        """
        try:
            for attempt in range(max_retries):
                response = requests.post(url, json={'query': query}, headers=headers)
                if response.status_code == 200:
                    data = response.json()
                    search_results = data.get("data", {}).get("search", {})
                    if search_results:
                        edges = search_results.get("edges", [])
                        all_repos.extend(edges)
                        end_cursor = search_results.get("pageInfo", {}).get("endCursor")
                        if not search_results.get("pageInfo", {}).get("hasNextPage", False):
                            return all_repos
                        print(f"Progress: Found {len(all_repos)} repositories.")
                    break
                elif response.status_code in [502, 503, 504]:
                    print(f"Error {response.status_code}: Attempt {attempt + 1} of {max_retries}. Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                    retry_delay *= 2
                elif response.status_code == 429:
                    print("Rate limit exceeded. Waiting for the reset time.")
                    reset_time = int(response.headers.get('X-RateLimit-Reset', time.time()))
                    wait_time = max(reset_time - time.time(), 0)
                    time.sleep(wait_time)
                else:
                    raise Exception(f"Failed to fetch repositories: {response.status_code}")
            else:
                raise Exception("Max retries reached, aborting.")
        except Exception as e:
            print(f"Error: {e}")
    return all_repos


def calculate_quality_metrics(class_file):
    if not os.path.exists(class_file):
        print(f"Arquivo {class_file} não existe.")
        return -1, -1, -1, -1
    
    if os.stat(class_file).st_size == 0:
        print(f"Arquivo {class_file} está vazio.")
        return -1, -1, -1, -1
    
    total_cbo = total_dit = total_lcom = loc_total = repo_count = 0
    with open(class_file, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            total_cbo += int(row.get('cbo', 0))
            total_dit += int(row.get('dit', 0))
            total_lcom += int(row.get('lcom', 0))
            loc_total += int(row.get('loc', 0))
            repo_count += 1
    if repo_count == 0:
        print(f"Nenhum dado encontrado no arquivo {class_file}.")
        return -1, -1, -1, -1
    return total_cbo, total_dit, total_lcom, loc_total



def export_to_csv(repos, filename="repos.csv"):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Name", "URL", "Stars", "Quantity of Releases", "Year Age", "Created At", "Pushed At"])
        for repo_edge in repos:
            repo = repo_edge["node"]
            created_at = datetime.datetime.strptime(repo['createdAt'], '%Y-%m-%dT%H:%M:%SZ')
            age_years = (datetime.datetime.now() - created_at).days / 365.25
            writer.writerow([
                repo["name"], repo["url"], 
                repo["stargazers"]["totalCount"],
                repo['releases']['totalCount'],
                age_years,
                repo["createdAt"], 
                repo["pushedAt"]
            ])
    print(f"Dados exportados para {filename} com sucesso.")


def calculate_process_metrics(repos):
    process_metrics = []
    for repo in repos:
        repo_node = repo['node']
        created_at = datetime.datetime.strptime(repo_node['createdAt'], '%Y-%m-%dT%H:%M:%SZ')
        age_years = (datetime.datetime.now() - created_at).days / 365.25
        releases_count = repo_node['releases']['totalCount']
        stargazers_count = repo_node['stargazers']['totalCount']
        
        process_metrics.append({
            'name': repo_node['name'],
            "stars": stargazers_count,
            'age_years': age_years,
            'releases_count': releases_count,
            'stargazers_count': stargazers_count
        })
    return process_metrics


def merge_metrics(process_metrics, quality_metrics):
    merged_data = []
    for process_metric in process_metrics:
        name = process_metric['name']
        quality_metric = quality_metrics.get(name, {})
        merged_data.append({
            'name': name,
            'popularity': process_metric['stars'],
            'maturity': process_metric['age_years'],
            "activity": process_metric['releases_count'],
            'loc_total': quality_metric['loc_total'],
            'avg_cbo': quality_metric['avg_cbo'],
            'avg_dit': quality_metric['avg_dit'],
            'avg_lcom': quality_metric['avg_lcom']
        })
    return merged_data


def clone_repo(repo_url, repo_name, repo_owner, failed_repos=[]):
    try:
        repo_path = f"./repos/{repo_owner}_{repo_name}"
        ck_output = f"./ck_output/{repo_owner}_{repo_name}"
        os.makedirs(os.path.dirname(repo_path), exist_ok=True)
        
        if os.path.exists(ck_output):
            print(f"Skipping {repo_owner}/{repo_name} as it was already cloned.")
            return False
        subprocess.run([
            "git", "clone", "--single-branch", "--no-tags", "--depth", "1", repo_url, repo_path
        ], check=True)
        print(f"Cloned {repo_owner}/{repo_name} successfully ({repo_url}).")
        return True 
    except subprocess.CalledProcessError as e:
        print(f"Error cloning {repo_owner}/{repo_name} -> {repo_url}: {e}.")
        failed_repos.append((repo_name, repo_url, repo_owner))
        return False


def create_result_dir(repo_path):
  try:
    output_dir = f"./ck_output/{repo_path}/"
    os.makedirs(output_dir, exist_ok=True)
  except Exception as e:
    print(f"Error running the creation directories: {e}")


def remove_repo(repo_path):
  try:
    subprocess.run([
      "rm", "-rf", f"./repos/{repo_path}"
    ], check=True)
    print(f"Removed {repo_path} successfully.")
  except subprocess.CalledProcessError as e:
    print(f"Error removing {repo_path}: {e}")
    

def remove_all_folders():
  try:
    subprocess.run([
      "rm", "-rf", "./repos"
    ], check=True)
    subprocess.run([
      "rm", "-rf", "./ck_output"
    ], check=True)
    print(f"Removed all folders successfully.")
  except subprocess.CalledProcessError as e:
    print(f"Error removing all folders: {e}")


def run_ck_on_repo(repo_path):
    try:
        repo_path_url = f"./repos/{repo_path}/"
        output_dir = f"./ck_output/{repo_path}/"
        if os.path.exists(output_dir) and os.path.exists(f"{output_dir}/class.csv"):
            print(f"Skipping CK on {repo_path} as it was already run.")
            return
        subprocess.run([
            "java", "-jar", "./ck.jar",
            repo_path_url, "true", "0", "true", output_dir
        ], check=True)
        print(f"Ran CK on {repo_path} successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Error running CK on {repo_path}: {e}")


def retrieve_final_metrics(quantity=1000):
    repos = get_popular_java_repos(quantity)
    export_to_csv(repos)

    quality_metrics = {}
    for repo in repos:
        repo_url = repo["node"]["url"]
        repo_name = repo["node"]["name"]
        repo_owner = repo["node"]["owner"]["login"]
        
        repo_owner_name = f"{repo_owner}_{repo_name}"
        create_result_dir(repo_owner_name)

        failed_repos = []
        clone_repo(repo_url, repo_name, repo_owner, failed_repos=failed_repos)
        if failed_repos:
            print("\nRetrying failed repositories...\n")
            retry_failed_repos = []
            for repo_name, repo_url, repo_owner in failed_repos:
                clone_repo(repo_url, repo_name, repo_owner, failed_repos=retry_failed_repos)

            if retry_failed_repos:
                print("\nThe following repositories could not be cloned even after retries:")
                for repo_name, repo_url, repo_owner in retry_failed_repos:
                    print(f"Failed: {repo_name} ({repo_url})")
            else:
                print("\nAll previously failed repositories were successfully cloned.")
        run_ck_on_repo(repo_owner_name)
        remove_repo(repo_owner_name)
    
    for repo in repos:
        repo_owner = repo['node']['owner']['login']
        repo_name = repo['node']['name']
        class_file = f"./ck_output/{repo_owner}_{repo_name}/class.csv"
        quality_metrics_result = calculate_quality_metrics(class_file)
        if quality_metrics_result:
            avg_cbo, avg_dit, avg_lcom, loc_total = quality_metrics_result
            quality_metrics[repo_name] = {
                'avg_cbo': avg_cbo,
                'avg_dit': avg_dit,
                'avg_lcom': avg_lcom,
                'loc_total': loc_total
            }
            print(quality_metrics[repo_name])
        else:
            print(f"Ignoring {repo_name} due to missing or empty quality metrics.")

    process_metrics = calculate_process_metrics(repos)
    final_metrics = merge_metrics(process_metrics, quality_metrics)
    return final_metrics, repos


def summarize_data(df, columns):
    summary = {}
    for col in columns:
        summary[col] = {
            'median': np.median(df[col]),
            'mean': np.mean(df[col]),
            'std': np.std(df[col])
        }
    return pd.DataFrame(summary)


final_metrics, repos = retrieve_final_metrics()

Progress: Found 10 repositories.
Progress: Found 20 repositories.
Progress: Found 30 repositories.
Progress: Found 40 repositories.
Progress: Found 50 repositories.
Progress: Found 60 repositories.
Progress: Found 70 repositories.
Progress: Found 80 repositories.
Progress: Found 90 repositories.
Progress: Found 100 repositories.
Progress: Found 110 repositories.
Progress: Found 120 repositories.
Progress: Found 130 repositories.
Progress: Found 140 repositories.
Progress: Found 150 repositories.
Progress: Found 160 repositories.
Progress: Found 170 repositories.
Progress: Found 180 repositories.
Progress: Found 190 repositories.
Progress: Found 200 repositories.
Progress: Found 210 repositories.
Progress: Found 220 repositories.
Progress: Found 230 repositories.
Progress: Found 240 repositories.
Progress: Found 250 repositories.
Progress: Found 260 repositories.
Progress: Found 270 repositories.
Progress: Found 280 repositories.
Progress: Found 290 repositories.
Progress: Found 300 rep

FileNotFoundError: [WinError 2] O sistema não pode encontrar o arquivo especificado

In [12]:
final_metrics, repos = retrieve_final_metrics()
final_metrics_df = pd.DataFrame(final_metrics)
metrics_columns = ['popularity', 'maturity', 'activity', 'loc_total', 'avg_cbo', 'avg_dit', 'avg_lcom']
summary_df = summarize_data(final_metrics_df, metrics_columns)
print(summary_df)

Progress: Found 10 repositories.
Progress: Found 20 repositories.
Progress: Found 30 repositories.
Progress: Found 40 repositories.
Progress: Found 50 repositories.
Progress: Found 60 repositories.
Progress: Found 70 repositories.
Progress: Found 80 repositories.
Progress: Found 90 repositories.
Progress: Found 100 repositories.
Progress: Found 110 repositories.
Progress: Found 120 repositories.
Progress: Found 130 repositories.
Progress: Found 140 repositories.
Progress: Found 150 repositories.
Progress: Found 160 repositories.
Progress: Found 170 repositories.
Progress: Found 180 repositories.
Progress: Found 190 repositories.
Progress: Found 200 repositories.
Progress: Found 210 repositories.
Progress: Found 220 repositories.
Progress: Found 230 repositories.
Progress: Found 240 repositories.
Progress: Found 250 repositories.
Progress: Found 260 repositories.
Progress: Found 270 repositories.
Progress: Found 280 repositories.
Progress: Found 290 repositories.
Progress: Found 300 rep

FileNotFoundError: [WinError 2] O sistema não pode encontrar o arquivo especificado

In [2]:
df = pd.DataFrame(final_metrics)
df_filtered = df[(df['avg_cbo'] != -1) & (df['avg_dit'] != -1) & (df['avg_lcom'] != -1)]
correlation = df_filtered[['popularity', 'avg_cbo', 'avg_dit', 'avg_lcom']].corr()

print("Matriz de Correlação:")
print(correlation)

plt.figure(figsize=(12, 8))

# Gráfico de dispersão para CBO
plt.subplot(2, 2, 1)
plt.scatter(df_filtered['popularity'], df_filtered['avg_cbo'], alpha=0.5)
plt.title('Popularity vs. Avg CBO')
plt.xlabel('Popularity')
plt.ylabel('Avg CBO')

# Gráfico de dispersão para DIT
plt.subplot(2, 2, 2)
plt.scatter(df_filtered['popularity'], df_filtered['avg_dit'], alpha=0.5)
plt.title('Popularity vs. Avg DIT')
plt.xlabel('Popularity')
plt.ylabel('Avg DIT')

# Gráfico de dispersão para LCOM
plt.subplot(2, 2, 3)
plt.scatter(df_filtered['popularity'], df_filtered['avg_lcom'], alpha=0.5)
plt.title('Popularity vs. Avg LCOM')
plt.xlabel('Popularity')
plt.ylabel('Avg LCOM')

plt.tight_layout()
plt.show()

NameError: name 'final_metrics' is not defined

In [None]:
df = pd.DataFrame(final_metrics)
df_filtered = df[(df['avg_cbo'] != -1) & (df['avg_dit'] != -1) & (df['avg_lcom'] != -1)]
correlation = df_filtered[['maturity', 'avg_cbo', 'avg_dit', 'avg_lcom']].corr()

print("Matriz de Correlação:")
print(correlation)

plt.figure(figsize=(12, 8))

# Gráfico de dispersão para CBO
plt.subplot(2, 2, 1)
plt.scatter(df_filtered['maturity'], df_filtered['avg_cbo'], alpha=0.5)
plt.title('Maturity vs. Avg CBO')
plt.xlabel('Maturity')
plt.ylabel('Avg CBO')

# Gráfico de dispersão para DIT
plt.subplot(2, 2, 2)
plt.scatter(df_filtered['maturity'], df_filtered['avg_dit'], alpha=0.5)
plt.title('Maturity vs. Avg DIT')
plt.xlabel('Maturity')
plt.ylabel('Avg DIT')

# Gráfico de dispersão para LCOM
plt.subplot(2, 2, 3)
plt.scatter(df_filtered['maturity'], df_filtered['avg_lcom'], alpha=0.5)
plt.title('Maturity vs. Avg LCOM')
plt.xlabel('Maturity')
plt.ylabel('Avg LCOM')

plt.tight_layout()
plt.show()

In [None]:
df = pd.DataFrame(final_metrics)
df_filtered = df[(df['avg_cbo'] != -1) & (df['avg_dit'] != -1) & (df['avg_lcom'] != -1)]
correlation = df_filtered[['activity', 'avg_cbo', 'avg_dit', 'avg_lcom']].corr()

print("Matriz de Correlação:")
print(correlation)

plt.figure(figsize=(12, 8))

# Gráfico de dispersão para CBO
plt.subplot(2, 2, 1)
plt.scatter(df_filtered['activity'], df_filtered['avg_cbo'], alpha=0.5)
plt.title('Activity vs. Avg CBO')
plt.xlabel('Activity')
plt.ylabel('Avg CBO')

# Gráfico de dispersão para DIT
plt.subplot(2, 2, 2)
plt.scatter(df_filtered['activity'], df_filtered['avg_dit'], alpha=0.5)
plt.title('Activity vs. Avg DIT')
plt.xlabel('Activity')
plt.ylabel('Avg DIT')

# Gráfico de dispersão para LCOM
plt.subplot(2, 2, 3)
plt.scatter(df_filtered['activity'], df_filtered['avg_lcom'], alpha=0.5)
plt.title('Activity vs. Avg LCOM')
plt.xlabel('Activity')
plt.ylabel('Avg LCOM')

plt.tight_layout()
plt.show()

In [None]:
df = pd.DataFrame(final_metrics)
df_filtered = df[(df['avg_cbo'] != -1) & (df['avg_dit'] != -1) & (df['avg_lcom'] != -1)]
correlation = df_filtered[['loc_total', 'avg_cbo', 'avg_dit', 'avg_lcom']].corr()

print("Matriz de Correlação:")
print(correlation)

plt.figure(figsize=(12, 8))

# Gráfico de dispersão para CBO
plt.subplot(2, 2, 1)
plt.scatter(df_filtered['loc_total'], df_filtered['avg_cbo'], alpha=0.5)
plt.title('LOC Total vs. Avg CBO')
plt.xlabel('LOC Total')
plt.ylabel('Avg CBO')

# Gráfico de dispersão para DIT
plt.subplot(2, 2, 2)
plt.scatter(df_filtered['loc_total'], df_filtered['avg_dit'], alpha=0.5)
plt.title('LOC Total vs. Avg DIT')
plt.xlabel('LOC Total')
plt.ylabel('Avg DIT')

# Gráfico de dispersão para LCOM
plt.subplot(2, 2, 3)
plt.scatter(df_filtered['loc_total'], df_filtered['avg_lcom'], alpha=0.5)
plt.title('LOC Total vs. Avg LCOM')
plt.xlabel('LOC Total')
plt.ylabel('Avg LCOM')

plt.tight_layout()
plt.show()