In [None]:
import glob
import os

import pandas as pd
import psycopg2
from psycopg2.extras import execute_values

# Database connection details
DB_NAME = "github_repos"
DB_USER = "postgres"
# The password must not live in the notebook: read it from the environment.
# When DB_PASSWORD is unset the value is None, psycopg2 omits the argument and
# libpq falls back to its own sources (PGPASSWORD, ~/.pgpass).
DB_PASSWORD = os.environ.get("DB_PASSWORD")
DB_HOST = "localhost"
DB_PORT = "5432"

# Directory containing CSV files
RESULTS_FOLDER = "../results"

# CSV header -> repository_data column name
COLUMN_RENAMES = {
    'Hash': 'hash',
    'Project ID': 'project_id',
    'Version': 'version',
    'License': 'license',
    'Method Name': 'method_name',
    'File Location': 'file_location',
    'Function Code': 'function_code',
    'Repository URL': 'repository_url',
    'Query Project': 'query_project',
    'Violation': 'violation'
}

# Column order of each record; must match the column list in insert_query.
INSERT_COLUMNS = [
    '_id', 'hash', 'project_id', 'version', 'license', 'method_name',
    'file_location', 'function_code', 'repository_url', 'query_project', 'violation'
]

# Number of rows sent per INSERT statement.
INSERT_BATCH_SIZE = 1000

# NOTE(review): rows are de-duplicated on (hash, project_id) below, while the
# conflict target here also includes version. Confirm against the table's
# unique constraints which key is intended.
insert_query = """
 INSERT INTO repository_data (
    _id, hash, project_id, version, license, method_name,
    file_location, function_code, repository_url, query_project, violation
) VALUES %s
ON CONFLICT (hash, project_id, version) DO NOTHING;
"""

# Bound before the try so that the finally clause can always inspect them,
# even when psycopg2.connect() itself raises.
conn = None
cur = None

try:
    # Connect to PostgreSQL
    conn = psycopg2.connect(
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
        host=DB_HOST,
        port=DB_PORT
    )
    cur = conn.cursor()

    # Get all CSV files in the results folder. Sorted so that the row kept by
    # drop_duplicates does not depend on filesystem listing order.
    csv_files = sorted(glob.glob(os.path.join(RESULTS_FOLDER, "*.csv")))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {RESULTS_FOLDER}")

    # Read every CSV, then concatenate once (concatenating inside the loop
    # re-copies the accumulated frame for each file).
    frames = []
    for file in csv_files:
        print(f"Reading {file}...")
        # low_memory=False lets pandas infer each column's dtype from the whole
        # file instead of chunk by chunk.
        # NOTE(review): the saved cell output shows a warning on the read_csv
        # line, presumably DtypeWarning (mixed types) - confirm.
        df = pd.read_csv(file, low_memory=False)

        # Rename CSV columns to match the database
        frames.append(df.rename(columns=COLUMN_RENAMES))

    all_data = pd.concat(frames, ignore_index=True)

    # Remove duplicates based on (hash, project_id)
    all_data = all_data.drop_duplicates(subset=['hash', 'project_id'])

    # Generate unique ID by combining hash and project_id
    all_data['_id'] = all_data['hash'].astype(str) + "_" + all_data['project_id'].astype(str)

    # Convert DataFrame to a list of plain tuples for batch insert.
    # itertuples avoids building one Series per row as iterrows does.
    # NOTE(review): missing CSV values are passed through as float NaN, not
    # None/NULL - confirm that is what the table should store.
    records_to_insert = list(
        all_data[INSERT_COLUMNS].itertuples(index=False, name=None)
    )

    # Insert in batches, one statement per batch, so that cur.rowcount after
    # each call is the number of rows that statement actually inserted
    # (rows skipped by ON CONFLICT DO NOTHING are not counted).
    inserted_count = 0
    for start in range(0, len(records_to_insert), INSERT_BATCH_SIZE):
        batch = records_to_insert[start:start + INSERT_BATCH_SIZE]
        execute_values(cur, insert_query, batch, page_size=len(batch))
        inserted_count += cur.rowcount

    # Commit changes
    conn.commit()
    print(f"Inserted {inserted_count} new records successfully.")

except Exception as e:
    print("Error:", e)

finally:
    # Close whatever was opened; closing without commit discards the
    # uncommitted transaction.
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()

Reading ../results/alibaba_jetcache_matches_688066908.csv...
Reading ../results/alibaba_graphlearn-for-pytorch_matches_4133681667.csv...
Reading ../results/apache_commons-proxy_matches_1847669882.csv...
Reading ../results/apache_mnemonic_matches_521025468.csv...
Reading ../results/microsoft_qTESLA-Library_matches_190354588.csv...
Reading ../results/microsoft_DirectML_matches_780991120.csv...
Reading ../results/microsoft_cppgraphqlgen_matches_945452165.csv...
Reading ../results/microsoft_Vipr_matches_1515233357.csv...
Reading ../results/apache_incubator-taverna-common-activities_matches_713742140.csv...
Reading ../results/microsoft_tsiclient_matches_1527534603.csv...
Reading ../results/microsoft_mssql-jdbc_matches_1581267590.csv...
Reading ../results/microsoft_vscode-node-sqlite3_matches_96360377.csv...
Reading ../results/google_deepconsensus_matches_1744743048.csv...
Reading ../results/microsoft_tes-azure_matches_21812812.csv...
Reading ../results/apache_olingo-odata4-js_matches_113078

  df = pd.read_csv(file)


Reading ../results/microsoft_WinDbg-Samples_matches_2277071650.csv...
Reading ../results/alibaba_Elastic-Federated-Learning-Solution_matches_3025173829.csv...
Reading ../results/google_aistplusplus_api_matches_3529731141.csv...
Reading ../results/apache_incubator-myriad_matches_1367267230.csv...
Reading ../results/google_security-research_matches_86167671.csv...
Reading ../results/microsoft_Orb_matches_722390055.csv...
Reading ../results/google_model_search_matches_1418195231.csv...
Reading ../results/google_AndroidForegroundCompat_matches_1237655965.csv...
Reading ../results/google_pyu2f_matches_971930988.csv...
Reading ../results/microsoft_BotFramework-WebChat_matches_908749798.csv...
Reading ../results/alibaba_feathub_matches_2593078593.csv...
Reading ../results/alibaba_compileflow_matches_522022197.csv...
Reading ../results/apache_santuario-cpp_matches_3333900441.csv...
Reading ../results/google_pigweed_matches_225673791.csv...
Reading ../results/facebook_react_matches_3488097456.c

  df = pd.read_csv(file)


Reading ../results/google_end-to-end_matches_187789648.csv...
Reading ../results/google_yggdrasil-decision-forests_matches_97444100.csv...
Reading ../results/apache_cloudstack-docs_matches_942352305.csv...
Reading ../results/apache_empire-db_matches_2301912343.csv...
Reading ../results/alibaba_termd_matches_633176973.csv...
Reading ../results/apache_commons-vfs_matches_1367887685.csv...
Reading ../results/alibaba_table-computing_matches_899517379.csv...
Reading ../results/apache_incubator-taverna-workbench-common-activities_matches_2290116557.csv...
Reading ../results/apache_sanselan_matches_25964034.csv...
Reading ../results/alibaba_alibaba-flink-connectors_matches_89710015.csv...
Reading ../results/microsoft_Service-Fabric-POA_matches_296898052.csv...
Reading ../results/google_mystyle_matches_3072142401.csv...
Reading ../results/microsoft_azure-pipelines-tasks_matches_333741612.csv...
Reading ../results/microsoft_FASTER_matches_828731959.csv...
Reading ../results/google_android-auto-

  df = pd.read_csv(file)


Reading ../results/apache_ftpserver_matches_399210200.csv...
Reading ../results/alibaba_proxima_matches_795685134.csv...
Reading ../results/google_polymorphicDSL_matches_1034979129.csv...
Reading ../results/apache_commons-jci_matches_1010025004.csv...
Reading ../results/alibaba_compileflow-idea-designer_matches_513826780.csv...
Reading ../results/apache_commons-jexl_matches_1064865148.csv...
Reading ../results/microsoft_petridishnn_matches_601363527.csv...
Reading ../results/google_skia-buildbot_matches_1001889334.csv...
Reading ../results/microsoft_lisa_matches_440379610.csv...
Reading ../results/apache_uima-sandbox_matches_2717498932.csv...
Reading ../results/apache_devicemap-browsermap_matches_2105622160.csv...
Reading ../results/microsoft_immersive-reader-sdk_matches_1160178615.csv...
Reading ../results/microsoft_python-sample-vscode-django-tutorial_matches_130249801.csv...
Reading ../results/microsoft_DirectXTex_matches_1451294112.csv...
Reading ../results/microsoft_GraphEngine_ma

  df = pd.read_csv(file)


Inserted 1017474 new records successfully.
