In [None]:
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from neo4j import basic_auth
import json
import pandas as pd
from IPython.display import display
import time
import os
from openai import OpenAI
from IPython.display import display

# Neo4j configuration.
# Every setting can be overridden through an environment variable so that the
# endpoint and the credentials do not have to be edited in the notebook itself.
# The fallbacks are the original hard-coded values, so an environment without
# these variables behaves exactly as before.
# NOTE(review): a password is still embedded as the fallback below; set
# NEO4J_PASSWORD and remove the default before sharing this notebook.
uri = os.environ.get("NEO4J_URI", "neo4j://138.2.240.156:7688")
auth = basic_auth(
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "Password1"),
)
db = os.environ.get("NEO4J_DATABASE", "neo4j")

# Neo4j connection: a single module-level driver referenced by the helpers below.
driver = GraphDatabase.driver(uri, auth=auth)

# Connectivity check: lists the databases returned by the configured driver.
def test_connection():
    """
    Open a session, run SHOW DATABASES and print every database name.

    Any exception raised along the way is caught and reported on stdout
    instead of being propagated to the caller.
    """
    list_databases = "SHOW DATABASES YIELD name"
    try:
        with driver.session() as neo_session:
            databases = neo_session.run(list_databases)
            print("Connection to Neo4j successful!\nAvailable databases:")
            for entry in databases:
                print(entry['name'])
    except Exception as err:
        print(f"Error connecting to Neo4j: {err}")

def run_query(query, parameters=None):
    """
    Execute a Cypher query and return its records as a list of dicts.

    Args:
        query: Cypher text to execute.
        parameters: Optional dict of query parameters, referenced as
            ``$name`` inside the Cypher text. Defaults to None (no parameters).

    Returns:
        list[dict]: one dict per record, as produced by ``record.data()``.
    """
    with driver.session(database=db) as session:
        result = session.run(query, parameters)
        # Materialise the records while the session is still open; the
        # previous version returned an undefined name and never ran the query.
        return [record.data() for record in result]

def run_query_graph(query):
    """
    Execute a Cypher query and return its graph wrapped in a GraphWidget.

    The widget is built while the session is still open and is returned to
    the caller rather than displayed here.
    """
    with driver.session(database=db) as neo_session:
        query_graph = neo_session.run(query).graph()
        widget = GraphWidget(graph=query_graph)
        return widget

def run_query_data(query):
    """
    Execute a query against the database and return the records as a DataFrame.

    The elapsed time is printed in milliseconds. It covers opening the
    session, running the query and reading every record into memory.

    Args:
        query: Cypher text to execute.

    Returns:
        pandas.DataFrame: one row per record (built from ``record.data()``).
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    with driver.session(database=db) as session:
        result = session.run(query)
        data = [record.data() for record in result]
        duration_ms = (time.perf_counter() - start_time) * 1000
        print(f"Response time: {duration_ms:.2f} ms")
        return pd.DataFrame(data)

def run_query_data_graph(query):
    """
    Execute a query, display its graph widget and return the records.

    The records are read first, then the graph is requested from the same
    result object and shown with ``display``.

    Returns:
        pandas.DataFrame: one row per record (built from ``record.data()``).
    """
    with driver.session(database=db) as neo_session:
        query_result = neo_session.run(query)
        rows = []
        for record in query_result:
            rows.append(record.data())
        # The graph is requested only after every record has been read.
        widget = GraphWidget(graph=query_result.graph())
        display(widget)
        return pd.DataFrame(rows)

def save_to_csv_incrementally(df, filename):
    """
    Append a DataFrame to a CSV file, creating the file when it is missing.

    The header row is written only when the target holds no data yet, i.e.
    the file does not exist or exists with a size of zero bytes.

    Args:
        df: DataFrame to append.
        filename: Path of the CSV file to append to.
    """
    try:
        has_content = os.path.getsize(filename) > 0
    except (OSError, TypeError, ValueError):
        # Missing file, or a target that is not a filesystem path:
        # treat it as empty so the header gets written.
        has_content = False
    df.to_csv(filename, mode='a', header=not has_content, index=False)

# Test the initial connection as soon as this cell runs: prints the database
# names on success, or the error message if the connection attempt fails.
test_connection()

In [None]:
import pandas as pd
import os

def run_query(query, parameters=None):
    """
    Execute a Cypher query and return its records as a list of dicts.

    Args:
        query: Cypher text to execute.
        parameters: Optional dict of query parameters, referenced as
            ``$name`` inside the Cypher text. Defaults to None (no parameters).

    Returns:
        list[dict]: one dict per record, as produced by ``record.data()``.
    """
    with driver.session(database=db) as session:
        result = session.run(query, parameters)
        # Materialise the records while the session is still open.
        return [record.data() for record in result]


# CSV holding the artifact IDs to look up, and the folder receiving the batches.
input_csv = 'no_categorized.csv'
output_folder = 'output_dependencies/'

os.makedirs(output_folder, exist_ok=True)

ids = pd.read_csv(input_csv)['ID'].tolist()

batch_size = 100000   # buffered rows that trigger a flush to disk
current_batch = []    # rows collected since the last flush
batch_count = 1       # 1-based index used in the batch file name

# The artifact id is bound as a query parameter instead of being pasted into
# the Cypher text, so ids containing quotes or other special characters can
# neither break the query nor alter it.
dependents_query = (
    "MATCH (a:Artifact {id: $artifact_id})<-[:dependency]-(dependentArtifact) "
    "RETURN DISTINCT dependentArtifact.id AS DependencyArtifactID"
)

for idx, artifact_id in enumerate(ids, start=1):
    # str() keeps the comparison a string match, as the quoted literal did.
    dependencies = run_query(dependents_query, {'artifact_id': str(artifact_id)})

    if dependencies:
        print(f"ID: {artifact_id} - find dependencies: {len(dependencies)}")
        for dep in dependencies:
            current_batch.append({'ArtifactID': artifact_id, 'DependencyID': dep['DependencyArtifactID']})
    else:
        print(f"ID: {artifact_id} - any find dependencies.")

    # Flush when the buffer is full or the last id has been processed.
    # An empty buffer is never written: it would produce a CSV without any
    # columns, which pandas cannot read back.
    is_last_id = idx == len(ids)
    if current_batch and (len(current_batch) >= batch_size or is_last_id):
        output_csv = os.path.join(output_folder, f'batch_{batch_count}_dependencies.csv')
        pd.DataFrame(current_batch, columns=['ArtifactID', 'DependencyID']).to_csv(output_csv, index=False)
        print(f"Lote {batch_count} salvo em: {output_csv}")

        current_batch = []
        batch_count += 1


In [None]:
import pandas as pd
import os

# Folder holding the per-batch CSV files and the path of the merged output.
input_folder = 'output_dependencies/'
output_file = 'consolidated_dependencies.csv'

dataframes = []

# Sorted so the files are always read in the same (lexicographic) order.
csv_files = [f for f in sorted(os.listdir(input_folder)) if f.endswith('.csv')]
total_files = len(csv_files)

# Progress milestones (in percent) that have not been announced yet.
# Tracking them explicitly avoids the modulo-by-zero the previous check hit
# whenever `total_files // 4` or `3 * total_files // 4` evaluated to 0.
pending_milestones = [25, 50, 75]

for idx, filename in enumerate(csv_files, start=1):
    file_path = os.path.join(input_folder, filename)
    print(f"Lendo arquivo: {file_path}")
    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        # A file with neither header nor rows has nothing to contribute.
        print(f"Skipping empty file: {file_path}")
    else:
        dataframes.append(df)

    # Announce every milestone reached by this file, each one exactly once.
    progress = (idx / total_files) * 100
    while pending_milestones and progress >= pending_milestones[0]:
        print(f"{pending_milestones.pop(0)}% ok...")

if not dataframes:
    # pd.concat would fail with a less helpful message on an empty list.
    raise ValueError(f"No CSV data found in {input_folder!r}; nothing to consolidate.")

combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.to_csv(output_file, index=False)
print(f"File generated successfully: {output_file}")


In [None]:
import pandas as pd

# Merged dependency pairs in, de-duplicated "first two fields" pairs out.
input_csv = 'consolidated_dependencies.csv'
output_csv = 'cleaned_consolidated_dependencies.csv'

chunk_size = 500000  # rows read per chunk
chunks = []
id_columns = ['ArtifactID', 'DependencyID']

# Both id columns are read as strings so the .str accessor below works on
# every chunk, including chunks where a column holds only missing values.
for chunk in pd.read_csv(input_csv, chunksize=chunk_size, dtype={col: str for col in id_columns}):
    for col in id_columns:
        # Keep only the first two ':'-separated fields of the id.
        # Missing values stay missing instead of raising as `x.split` did.
        chunk[col] = chunk[col].str.split(':').str[:2].str.join(':')

    # Per-chunk de-duplication keeps the amount of data held in memory down.
    chunks.append(chunk.drop_duplicates())

# The same pair can appear in several chunks, so duplicates must be dropped
# again once everything is combined.
cleaned_df = pd.concat(chunks, ignore_index=True).drop_duplicates().reset_index(drop=True)

cleaned_df.to_csv(output_csv, index=False)
print(f"File generated clean save as: {output_csv}")


In [None]:
import pandas as pd


# Inputs: a CSV with ID and TAG columns, and a CSV with a DependencyID column.
# NOTE(review): csv2_file is not the name written by the cleaning cell
# ('cleaned_consolidated_dependencies.csv'); confirm it is the intended input.
csv1_file = 'categorized.csv'
csv2_file = 'consolidate_deps_uniquis.csv'
output_file = 'TAGS_noCategorizedsNewVersion.csv'

csv1 = pd.read_csv(csv1_file)
csv2 = pd.read_csv(csv2_file)

# ID -> TAG lookup; when an ID occurs more than once the last TAG wins.
id_to_tag = {artifact_id: tag for artifact_id, tag in zip(csv1['ID'], csv1['TAG'])}

# Keep only the rows whose DependencyID has a known tag, then attach that tag.
has_known_tag = csv2['DependencyID'].isin(id_to_tag.keys())
filtered_csv2 = csv2[has_known_tag].copy()
filtered_csv2.loc[:, 'TAG'] = filtered_csv2['DependencyID'].map(id_to_tag)

filtered_csv2.to_csv(output_file, index=False)
print(f"File generated successfully: {output_file}")