In [1]:
import pandas as pd
import os
from pathlib import Path
import json # Used for potential error diagnostics if needed
import zipfile # Import the zipfile module

# --- Configuration ---
# Define the path to the folder containing the archive files
# (These might be named .jsonl but are actually zip archives)
api_data_folder = Path("github_data_compressed")

# Define the path to the single scraped data JSONL file
scraped_data_file = Path("scraped_data.jsonl")

# Define the name for the final combined output file
output_file = Path("combined_github_data.jsonl")

# --- Step 1: Read and combine all JSONL files from the archives ---

# Look for files named *.jsonl first, as shown in the first image
print(f"Searching for archive files named '*.jsonl' in '{api_data_folder}'...")
all_api_archive_files = list(api_data_folder.glob('*.jsonl'))

# If no '*.jsonl' files are found, try looking for '*.zip' as a fallback
if not all_api_archive_files:
    print(f"  - No '*.jsonl' files found. Searching for '*.zip' instead...")
    all_api_archive_files = list(api_data_folder.glob('*.zip'))

api_dataframes = [] # List to hold DataFrames from each archive

print(f"Found {len(all_api_archive_files)} potential archive files. Processing...")

if not all_api_archive_files:
    print(f"Error: No suitable archive files (.jsonl or .zip) found in '{api_data_folder}'. Exiting.")
    exit()

for archive_file_path in all_api_archive_files: # Loop through found archive files
    print(f"  - Processing '{archive_file_path.name}'...")
    try:
        # Open the archive file (treats it as a zip regardless of name)
        with zipfile.ZipFile(archive_file_path, 'r') as zip_ref:
            # Find the .jsonl file(s) inside the zip archive
            jsonl_files_in_zip = [f for f in zip_ref.namelist() if f.endswith('.jsonl')]

            if not jsonl_files_in_zip:
                print(f"    - Warning: No .jsonl file found inside '{archive_file_path.name}'. Skipping.")
                continue # Skip to the next archive file

            # Determine which jsonl file to use if multiple are found
            if len(jsonl_files_in_zip) > 1:
                # Attempt to find a jsonl file matching the archive name (minus extension)
                base_name = archive_file_path.stem # Gets filename without final extension
                matching_files = [f for f in jsonl_files_in_zip if Path(f).stem == base_name]
                if matching_files:
                    jsonl_filename_inside_zip = matching_files[0]
                    print(f"    - Warning: Multiple .jsonl files found. Using matching name: '{jsonl_filename_inside_zip}'.")
                else:
                    jsonl_filename_inside_zip = jsonl_files_in_zip[0] # Default to first if no match
                    print(f"    - Warning: Multiple .jsonl files found. No clear match. Using the first one: '{jsonl_filename_inside_zip}'.")
            else:
                 jsonl_filename_inside_zip = jsonl_files_in_zip[0] # Only one file found

            # Open the specific .jsonl file from within the zip archive
            with zip_ref.open(jsonl_filename_inside_zip) as jsonl_file:
                # Read the JSONL file directly from the zip archive into a pandas DataFrame
                df = pd.read_json(jsonl_file, lines=True)
                if not df.empty:
                    api_dataframes.append(df)
                    print(f"    - Successfully read '{jsonl_filename_inside_zip}' ({len(df)} records)")
                else:
                    print(f"    - Warning: '{jsonl_filename_inside_zip}' inside '{archive_file_path.name}' is empty or could not be parsed.")

    except zipfile.BadZipFile:
        print(f"  - Error: '{archive_file_path.name}' is not a valid zip file or is corrupted. Skipping.")
    except Exception as e:
        print(f"  - Error processing archive file {archive_file_path}: {e}")
        # Optional: Add more robust error handling here if needed

# Check if any data was successfully read
if not api_dataframes:
    print(f"\nError: Could not read any valid data from the archive files in '{api_data_folder}'. Exiting.")
    exit()

# Concatenate all the individual DataFrames into one
print("\nConcatenating data from all archives...")
api_df = pd.concat(api_dataframes, ignore_index=True)
print(f"Successfully combined data from archives. Shape: {api_df.shape}")

# --- Step 2: Read the scraped data JSONL file ---
print(f"\nReading scraped data file '{scraped_data_file}'...")
try:
    scraped_df = pd.read_json(scraped_data_file, lines=True)
    print(f"Successfully read scraped data. Shape: {scraped_df.shape}")
except FileNotFoundError:
    print(f"Error: Scraped data file '{scraped_data_file}' not found. Exiting.")
    exit()
except Exception as e:
    print(f"Error reading scraped data file {scraped_data_file}: {e}")
    exit()

# --- Step 3: Prepare for Merging ---
api_key_col = 'github_url'
scraped_key_col = 'url'

# Check if the key columns exist before attempting merge
if api_key_col not in api_df.columns:
    print(f"Error: Key column '{api_key_col}' not found in the combined API data. Available columns: {api_df.columns.tolist()}")
    exit()
if scraped_key_col not in scraped_df.columns:
     print(f"Error: Key column '{scraped_key_col}' not found in the scraped data. Available columns: {scraped_df.columns.tolist()}")
     exit()

# Rename the key column in the scraped data to match the API data for merging
print(f"\nRenaming '{scraped_key_col}' to '{api_key_col}' in scraped data for merging.")
scraped_df.rename(columns={scraped_key_col: api_key_col}, inplace=True)

# --- Step 4: Merge the DataFrames ---
print(f"Merging the two datasets using '{api_key_col}' as the key...")

# Perform a 'left' merge: Keep all rows from api_df (the combined API data)
# and add matching columns from scraped_df based on the 'github_url'.
# If a URL from api_df doesn't have a match in scraped_df, the new columns
# for that row will be filled with NaN (Not a Number).
merged_df = pd.merge(api_df, scraped_df, on=api_key_col, how='left')

print(f"Merge complete. Shape of final DataFrame: {merged_df.shape}")

# --- Step 5: Save the Combined Data ---
print(f"\nSaving the merged data to '{output_file}'...")
try:
    # Save the merged DataFrame to a new JSONL file
    # orient='records' makes each row a JSON object
    # lines=True ensures it's saved in the JSON Lines format
    # force_ascii=False helps prevent unicode characters from being escaped (optional)
    merged_df.to_json(output_file, orient='records', lines=True, force_ascii=False)
    print(f"Successfully saved combined data to '{output_file}'.")
except Exception as e:
    print(f"Error saving the output file: {e}")

print("\nScript finished.")

Searching for archive files named '*.jsonl' in 'github_data_compressed'...
  - No '*.jsonl' files found. Searching for '*.zip' instead...
Found 0 potential archive files. Processing...
Error: No suitable archive files (.jsonl or .zip) found in 'github_data_compressed'. Exiting.

Error: Could not read any valid data from the archive files in 'github_data_compressed'. Exiting.

Concatenating data from all archives...


ValueError: No objects to concatenate

In [1]:
import pandas as pd
from pathlib import Path

# --- Configuration ---
api_data_folder = Path("github_data_compressed")  # Folder with subfolders containing .jsonl files
scraped_data_file = Path("scraped_data.jsonl")    # File with scraped data
output_file = Path("combined_github_data.jsonl")  # Final output file

# --- Step 1: Read and combine all JSONL files from the folders ---
print(f"Searching for folders inside '{api_data_folder}'...")

# Each subfolder is expected to contain a .jsonl file with the same name as the folder
all_api_folders = [f for f in api_data_folder.iterdir() if f.is_dir()]
api_dataframes = []

print(f"Found {len(all_api_folders)} folders. Processing...")

if not all_api_folders:
    print(f"Error: No folders found in '{api_data_folder}'. Exiting.")
    exit()

for folder_path in all_api_folders:
    print(f"  - Processing folder: '{folder_path.name}'")

    # Construct the expected .jsonl file path inside this folder
    inner_jsonl_file = folder_path / f"{folder_path.name}"

    if inner_jsonl_file.exists():
        try:
            df = pd.read_json(inner_jsonl_file, lines=True)
            if not df.empty:
                api_dataframes.append(df)
                print(f"    - Successfully read {len(df)} records from '{inner_jsonl_file.name}'")
            else:
                print(f"    - Warning: '{inner_jsonl_file.name}' is empty.")
        except Exception as e:
            print(f"    - Error reading '{inner_jsonl_file.name}': {e}")
    else:
        print(f"    - Warning: File '{inner_jsonl_file.name}' not found in '{folder_path.name}'. Skipping.")

if not api_dataframes:
    print(f"\nError: No valid data found in any folders inside '{api_data_folder}'. Exiting.")
    exit()

# Concatenate all the individual DataFrames into one
print("\nConcatenating data from all folders...")
api_df = pd.concat(api_dataframes, ignore_index=True)
print(f"Successfully combined data. Shape: {api_df.shape}")

# --- Step 2: Read the scraped data JSONL file ---
print(f"\nReading scraped data file '{scraped_data_file}'...")
try:
    scraped_df = pd.read_json(scraped_data_file, lines=True)
    print(f"Successfully read scraped data. Shape: {scraped_df.shape}")
except FileNotFoundError:
    print(f"Error: Scraped data file '{scraped_data_file}' not found. Exiting.")
    exit()
except Exception as e:
    print(f"Error reading scraped data file {scraped_data_file}: {e}")
    exit()

# --- Step 3: Prepare for Merging ---
api_key_col = 'github_url'
scraped_key_col = 'url'

if api_key_col not in api_df.columns:
    print(f"Error: Key column '{api_key_col}' not found in the API data. Available columns: {api_df.columns.tolist()}")
    exit()
if scraped_key_col not in scraped_df.columns:
    print(f"Error: Key column '{scraped_key_col}' not found in the scraped data. Available columns: {scraped_df.columns.tolist()}")
    exit()

# Rename for consistency
print(f"\nRenaming '{scraped_key_col}' to '{api_key_col}' in scraped data for merging.")
scraped_df.rename(columns={scraped_key_col: api_key_col}, inplace=True)

# --- Step 4: Merge ---
print(f"Merging both datasets on '{api_key_col}'...")
merged_df = pd.merge(api_df, scraped_df, on=api_key_col, how='left')
print(f"Merge complete. Shape of merged DataFrame: {merged_df.shape}")

# --- Step 5: Save the result ---
print(f"\nSaving merged data to '{output_file}'...")
try:
    merged_df.to_json(output_file, orient='records', lines=True, force_ascii=False)
    print(f"Successfully saved to '{output_file}'")
except Exception as e:
    print(f"Error saving the output file: {e}")

print("\nScript finished.")


Searching for folders inside 'github_data_compressed'...
Found 0 folders. Processing...
Error: No folders found in 'github_data_compressed'. Exiting.

Error: No valid data found in any folders inside 'github_data_compressed'. Exiting.

Concatenating data from all folders...


ValueError: No objects to concatenate

In [1]:
import pandas as pd
from pathlib import Path

# --- Configuration ---
api_data_folder = Path("github_data_compressed")
scraped_data_file = Path("scraped_data.jsonl")
output_file = Path("combined_github_data.jsonl")

# --- Step 1: Read and combine all JSONL files from each subfolder ---
print(f"Searching for subfolders in '{api_data_folder}'...")
all_api_folders = [f for f in api_data_folder.iterdir() if f.is_dir()]
api_dataframes = []

print(f"Found {len(all_api_folders)} subfolders. Processing...")

for folder_path in all_api_folders:
    print(f"  - Processing folder '{folder_path.name}'...")
    
    jsonl_file_path = folder_path / f"{folder_path.name}.jsonl"

    if jsonl_file_path.exists():
        try:
            df = pd.read_json(jsonl_file_path, lines=True)
            if not df.empty:
                api_dataframes.append(df)
                print(f"    - Successfully read {len(df)} records from '{jsonl_file_path.name}'")
            else:
                print(f"    - Warning: File '{jsonl_file_path.name}' is empty.")
        except Exception as e:
            print(f"    - Error reading '{jsonl_file_path.name}': {e}")
    else:
        print(f"    - Warning: File '{jsonl_file_path.name}' not found in folder '{folder_path.name}'. Skipping.")

if not api_dataframes:
    print(f"\nError: No dataframes were loaded. Exiting.")
    exit()

print("\nConcatenating data from all subfolders...")
api_df = pd.concat(api_dataframes, ignore_index=True)
print(f"Successfully combined data. Shape: {api_df.shape}")

# --- Step 2: Read the scraped data JSONL file ---
print(f"\nReading scraped data file '{scraped_data_file}'...")
try:
    scraped_df = pd.read_json(scraped_data_file, lines=True)
    print(f"Successfully read scraped data. Shape: {scraped_df.shape}")
except FileNotFoundError:
    print(f"Error: Scraped data file '{scraped_data_file}' not found. Exiting.")
    exit()
except Exception as e:
    print(f"Error reading scraped data file: {e}")
    exit()

# --- Step 3: Prepare for Merging ---
api_key_col = 'github_url'
scraped_key_col = 'url'

if api_key_col not in api_df.columns:
    print(f"Error: Key column '{api_key_col}' not found in the API data. Available columns: {api_df.columns.tolist()}")
    exit()
if scraped_key_col not in scraped_df.columns:
    print(f"Error: Key column '{scraped_key_col}' not found in the scraped data. Available columns: {scraped_df.columns.tolist()}")
    exit()

print(f"\nRenaming '{scraped_key_col}' to '{api_key_col}' for merging...")
scraped_df.rename(columns={scraped_key_col: api_key_col}, inplace=True)

# --- Step 4: Merge ---
print(f"Merging datasets using '{api_key_col}'...")
merged_df = pd.merge(api_df, scraped_df, on=api_key_col, how='left')
print(f"Merge complete. Final shape: {merged_df.shape}")

# --- Step 5: Save result ---
print(f"\nSaving merged data to '{output_file}'...")
try:
    merged_df.to_json(output_file, orient='records', lines=True, force_ascii=False)
    print(f"Data saved to '{output_file}' successfully.")
except Exception as e:
    print(f"Error saving file: {e}")

print("\nScript finished.")


Searching for subfolders in 'github_data_compressed'...
Found 0 subfolders. Processing...

Error: No dataframes were loaded. Exiting.

Concatenating data from all subfolders...


ValueError: No objects to concatenate

In [1]:
print("hi")

hi


In [1]:
import pandas as pd
from pathlib import Path
import zipfile

# --- Configuration ---
# Define the path to the folder containing the archive files.
# Some files might be plain JSONL and others might be zipped.
api_data_folder = Path("github_data_compressed")

# Define the path to the single scraped data JSONL file.
scraped_data_file = Path("scraped_data.jsonl")

# Define the name for the final combined output file.
output_file = Path("combined_github_data.jsonl")

# --- Step 1: Read and Combine All API Data Files ---
# First, look for plain JSONL files.
print(f"Searching for archive files named '*.jsonl' in '{api_data_folder}'...")
all_api_archive_files = list(api_data_folder.glob('*.jsonl'))

api_dataframes = []  # List to hold DataFrames from each source

if all_api_archive_files:
    print(f"Found {len(all_api_archive_files)} JSONL file(s). Processing them directly...")
    for jsonl_file_path in all_api_archive_files:
        print(f"  - Reading {jsonl_file_path.name}...")
        try:
            df = pd.read_json(jsonl_file_path, lines=True)
            if not df.empty:
                api_dataframes.append(df)
                print(f"    - Successfully read {len(df)} records from {jsonl_file_path.name}.")
            else:
                print(f"    - Warning: {jsonl_file_path.name} is empty or could not be parsed.")
        except Exception as e:
            print(f"    - Error reading file {jsonl_file_path.name}: {e}")
else:
    # If no plain JSONL files are found, try looking for zip archives.
    print("  - No '*.jsonl' files found. Searching for '*.zip' instead...")
    zip_files = list(api_data_folder.glob('*.zip'))
    print(f"Found {len(zip_files)} zip archive file(s). Processing them...")
    for archive_file_path in zip_files:
        print(f"  - Processing '{archive_file_path.name}'...")
        try:
            with zipfile.ZipFile(archive_file_path, 'r') as zip_ref:
                # Find JSONL files within the zip archive.
                jsonl_files_in_zip = [f for f in zip_ref.namelist() if f.endswith('.jsonl')]
                if not jsonl_files_in_zip:
                    print(f"    - Warning: No JSONL file found inside '{archive_file_path.name}'. Skipping.")
                    continue

                # If multiple JSONL files are found, try to match using the archive name.
                if len(jsonl_files_in_zip) > 1:
                    base_name = archive_file_path.stem  # Get filename without extension.
                    matching_files = [f for f in jsonl_files_in_zip if Path(f).stem == base_name]
                    if matching_files:
                        jsonl_filename_inside_zip = matching_files[0]
                        print(f"    - Warning: Multiple JSONL files found. Using matching name: '{jsonl_filename_inside_zip}'.")
                    else:
                        jsonl_filename_inside_zip = jsonl_files_in_zip[0]
                        print(f"    - Warning: Multiple JSONL files found. No clear match. Using the first one: '{jsonl_filename_inside_zip}'.")
                else:
                    jsonl_filename_inside_zip = jsonl_files_in_zip[0]

                # Open and read the JSONL file from within the zip archive.
                with zip_ref.open(jsonl_filename_inside_zip) as jsonl_file:
                    df = pd.read_json(jsonl_file, lines=True)
                    if not df.empty:
                        api_dataframes.append(df)
                        print(f"    - Successfully read '{jsonl_filename_inside_zip}' ({len(df)} records).")
                    else:
                        print(f"    - Warning: '{jsonl_filename_inside_zip}' inside '{archive_file_path.name}' is empty or could not be parsed.")
        except zipfile.BadZipFile:
            print(f"  - Error: '{archive_file_path.name}' is not a valid zip file or is corrupted. Skipping.")
        except Exception as e:
            print(f"  - Error processing archive file {archive_file_path}: {e}")

if not api_dataframes:
    print(f"\nError: Could not read any valid data from files in '{api_data_folder}'. Exiting.")
    exit()

print("\nConcatenating data from all sources...")
api_df = pd.concat(api_dataframes, ignore_index=True)
print(f"Successfully combined data. Shape: {api_df.shape}")

# --- Step 2: Read the Scraped Data JSONL File ---
print(f"\nReading scraped data file '{scraped_data_file}'...")
try:
    scraped_df = pd.read_json(scraped_data_file, lines=True)
    print(f"Successfully read scraped data. Shape: {scraped_df.shape}")
except FileNotFoundError:
    print(f"Error: Scraped data file '{scraped_data_file}' not found. Exiting.")
    exit()
except Exception as e:
    print(f"Error reading scraped data file {scraped_data_file}: {e}")
    exit()

# --- Step 3: Prepare for Merging ---
api_key_col = 'github_url'
scraped_key_col = 'url'

# Confirm that both key columns exist.
if api_key_col not in api_df.columns:
    print(f"Error: Key column '{api_key_col}' not found in API data. Available columns: {api_df.columns.tolist()}")
    exit()
if scraped_key_col not in scraped_df.columns:
    print(f"Error: Key column '{scraped_key_col}' not found in scraped data. Available columns: {scraped_df.columns.tolist()}")
    exit()

# Rename the key column in the scraped data to match the API data.
print(f"\nRenaming '{scraped_key_col}' to '{api_key_col}' in scraped data for merging.")
scraped_df.rename(columns={scraped_key_col: api_key_col}, inplace=True)

# --- Step 4: Merge the DataFrames ---
print(f"Merging the datasets using '{api_key_col}' as the key...")
merged_df = pd.merge(api_df, scraped_df, on=api_key_col, how='left')
print(f"Merge complete. Shape of final DataFrame: {merged_df.shape}")

# --- Step 5: Save the Combined Data ---
print(f"\nSaving the merged data to '{output_file}'...")
try:
    merged_df.to_json(output_file, orient='records', lines=True, force_ascii=False)
    print(f"Successfully saved combined data to '{output_file}'.")
except Exception as e:
    print(f"Error saving the output file: {e}")

print("\nScript finished.")

Searching for archive files named '*.jsonl' in 'github_data_compressed'...
  - No '*.jsonl' files found. Searching for '*.zip' instead...
Found 0 zip archive file(s). Processing them...

Error: Could not read any valid data from files in 'github_data_compressed'. Exiting.

Concatenating data from all sources...


ValueError: No objects to concatenate

In [1]:
import pandas as pd
import os
from pathlib import Path
import zipfile

# --- Configuration ---
# The folder where your API data files are stored.
api_data_folder = Path("github_data_compressed")
# The file that has your scraped data.
scraped_data_file = Path("scraped_data.jsonl")
# The output file where merged data is saved.
output_file = Path("combined_github_data.jsonl")

# --- Debug Info: Print current directory and folder contents ---
print("Current working directory:", os.getcwd())
if not api_data_folder.exists():
    print(f"Error: The folder '{api_data_folder}' does not exist. Check your file path.")
    exit()
else:
    print(f"Files in '{api_data_folder}': {[f.name for f in api_data_folder.iterdir()]}")

# --- Step 1: Read and Combine All API Data Files ---
# First, try to find plain JSONL files.
print(f"\nSearching for '*.jsonl' files in '{api_data_folder}'...")
all_api_archive_files = list(api_data_folder.glob('*.jsonl'))
api_dataframes = []  # This list will hold your DataFrames.

if all_api_archive_files:
    print(f"Found {len(all_api_archive_files)} JSONL file(s).")
    for jsonl_file_path in all_api_archive_files:
        print(f"  - Reading file: {jsonl_file_path.name}")
        try:
            df = pd.read_json(jsonl_file_path, lines=True)
            print(f"    - {jsonl_file_path.name} has {len(df)} record(s).")
            if not df.empty:
                api_dataframes.append(df)
            else:
                print(f"    - Warning: {jsonl_file_path.name} is empty.")
        except Exception as e:
            print(f"    - Error reading {jsonl_file_path.name}: {e}")
else:
    # If no JSONL files are found, then try zip files.
    print("No plain JSONL files found. Searching for '*.zip' files instead...")
    zip_files = list(api_data_folder.glob('*.zip'))
    print(f"Found {len(zip_files)} zip file(s).")
    for archive_file_path in zip_files:
        print(f"  - Processing ZIP file: {archive_file_path.name}")
        try:
            with zipfile.ZipFile(archive_file_path, 'r') as zip_ref:
                # Look for JSONL files inside the ZIP.
                jsonl_files_in_zip = [f for f in zip_ref.namelist() if f.endswith('.jsonl')]
                if not jsonl_files_in_zip:
                    print(f"    - Warning: No JSONL file found in {archive_file_path.name}.")
                    continue
                # If there are multiple JSONL files, try to pick one that matches the archive name.
                if len(jsonl_files_in_zip) > 1:
                    base_name = archive_file_path.stem
                    matching_files = [f for f in jsonl_files_in_zip if Path(f).stem == base_name]
                    if matching_files:
                        jsonl_filename_inside_zip = matching_files[0]
                        print(f"    - Multiple JSONL files found. Using matching file: {jsonl_filename_inside_zip}")
                    else:
                        jsonl_filename_inside_zip = jsonl_files_in_zip[0]
                        print(f"    - Multiple JSONL files found. Using first file: {jsonl_filename_inside_zip}")
                else:
                    jsonl_filename_inside_zip = jsonl_files_in_zip[0]
                    print(f"    - Found JSONL file: {jsonl_filename_inside_zip}")
                with zip_ref.open(jsonl_filename_inside_zip) as jsonl_file:
                    df = pd.read_json(jsonl_file, lines=True)
                    print(f"    - {jsonl_filename_inside_zip} has {len(df)} record(s).")
                    if not df.empty:
                        api_dataframes.append(df)
                    else:
                        print(f"    - Warning: {jsonl_filename_inside_zip} is empty or could not be parsed.")
        except zipfile.BadZipFile:
            print(f"  - Error: {archive_file_path.name} is not a valid ZIP file or is corrupted.")
        except Exception as e:
            print(f"  - Error processing {archive_file_path.name}: {e}")

# If no valid data was read, show folder contents to help debug.
if not api_dataframes:
    print(f"\nError: No valid API data was read from '{api_data_folder}'.")
    print("Check that the folder contains JSONL or ZIP files and they are not empty.")
    exit()

# --- Combine All API DataFrames ---
print("\nConcatenating data from all files...")
api_df = pd.concat(api_dataframes, ignore_index=True)
print(f"Combined API data shape: {api_df.shape}")

# --- Step 2: Read the Scraped Data JSONL File ---
print(f"\nReading scraped data from '{scraped_data_file}'...")
try:
    scraped_df = pd.read_json(scraped_data_file, lines=True)
    print(f"Scraped data has {len(scraped_df)} record(s). Shape: {scraped_df.shape}")
except FileNotFoundError:
    print(f"Error: Scraped data file '{scraped_data_file}' not found.")
    exit()
except Exception as e:
    print(f"Error reading scraped data file: {e}")
    exit()

# --- Step 3: Prepare for Merging ---
api_key_col = 'github_url'
scraped_key_col = 'url'

if api_key_col not in api_df.columns:
    print(f"Error: '{api_key_col}' not found in API data. Available columns: {api_df.columns.tolist()}")
    exit()
if scraped_key_col not in scraped_df.columns:
    print(f"Error: '{scraped_key_col}' not found in scraped data. Available columns: {scraped_df.columns.tolist()}")
    exit()

print(f"\nRenaming '{scraped_key_col}' to '{api_key_col}' in scraped data for merge.")
scraped_df.rename(columns={scraped_key_col: api_key_col}, inplace=True)

# --- Step 4: Merge DataFrames ---
print(f"Merging data on key column '{api_key_col}'...")
merged_df = pd.merge(api_df, scraped_df, on=api_key_col, how='left')
print(f"Merged data shape: {merged_df.shape}")

# --- Step 5: Save the Merged Data ---
print(f"\nSaving merged data to '{output_file}'...")
try:
    merged_df.to_json(output_file, orient='records', lines=True, force_ascii=False)
    print(f"Merged data saved successfully to '{output_file}'.")
except Exception as e:
    print(f"Error saving merged data: {e}")

print("\nScript finished.")


Current working directory: C:\Users\PROBOOK\Classic Isaac\DataGospel\GitHub
Files in 'github_data_compressed': ['repos_2021-01-01_to_2021-03-31.jsonl.gz', 'repos_2021-04-01_to_2021-06-30.jsonl.gz', 'repos_2021-07-01_to_2021-09-30.jsonl.gz', 'repos_2021-10-01_to_2021-12-31.jsonl.gz', 'repos_2022-01-01_to_2022-03-31.jsonl.gz', 'repos_2022-04-01_to_2022-06-30.jsonl.gz', 'repos_2022-07-01_to_2022-09-30.jsonl.gz', 'repos_2022-10-01_to_2022-12-31.jsonl.gz', 'repos_2023-01-01_to_2023-03-31.jsonl.gz', 'repos_2023-04-01_to_2023-06-30.jsonl.gz', 'repos_2023-07-01_to_2023-09-30.jsonl.gz', 'repos_2023-10-01_to_2023-12-31.jsonl.gz', 'repos_2024-01-01_to_2024-03-31.jsonl.gz', 'repos_2024-04-01_to_2024-06-30.jsonl.gz', 'repos_2024-07-01_to_2024-09-30.jsonl.gz']

Searching for '*.jsonl' files in 'github_data_compressed'...
No plain JSONL files found. Searching for '*.zip' files instead...
Found 0 zip file(s).

Error: No valid API data was read from 'github_data_compressed'.
Check that the folder conta

ValueError: No objects to concatenate

In [1]:
import pandas as pd
from pathlib import Path
import zipfile
import os

# --- Configuration ---
# Folder containing the API data files.
api_data_folder = Path("github_data_compressed")
# File containing the scraped data.
scraped_data_file = Path("scraped_data.jsonl")  # Change to scraped_data.jsonl.gz if needed
# Output file for the merged data.
output_file = Path("combined_github_data.jsonl")

# --- Debug Info: Print current directory and list folder contents ---
print("Current working directory:", os.getcwd())
if not api_data_folder.exists():
    print(f"Error: The folder '{api_data_folder}' does not exist. Check your file path.")
    exit()
else:
    print(f"Files in '{api_data_folder}': {[f.name for f in api_data_folder.iterdir()]}")

# --- Step 1: Read and Combine All API Data Files ---
# Look for both plain JSONL and gzipped JSONL files.
print(f"\nSearching for '*.jsonl' and '*.jsonl.gz' files in '{api_data_folder}'...")
jsonl_files = list(api_data_folder.glob('*.jsonl'))
jsonl_gz_files = list(api_data_folder.glob('*.jsonl.gz'))
all_api_archive_files = jsonl_files + jsonl_gz_files

api_dataframes = []  # List to store DataFrames

if all_api_archive_files:
    print(f"Found {len(all_api_archive_files)} file(s). Processing each file...")
    for file_path in all_api_archive_files:
        print(f"  - Reading file: {file_path.name}")
        try:
            # Compression will be inferred from the file extension.
            df = pd.read_json(file_path, lines=True, compression='infer')
            print(f"    - {file_path.name} has {len(df)} record(s).")
            if not df.empty:
                api_dataframes.append(df)
            else:
                print(f"    - Warning: {file_path.name} is empty.")
        except Exception as e:
            print(f"    - Error reading {file_path.name}: {e}")
else:
    # If no JSONL or gzipped JSONL files are found, attempt to search for ZIP files.
    print("No '*.jsonl' or '*.jsonl.gz' files found. Searching for '*.zip' files instead...")
    zip_files = list(api_data_folder.glob('*.zip'))
    print(f"Found {len(zip_files)} zip file(s).")
    for archive_file_path in zip_files:
        print(f"  - Processing ZIP file: {archive_file_path.name}")
        try:
            with zipfile.ZipFile(archive_file_path, 'r') as zip_ref:
                jsonl_files_in_zip = [f for f in zip_ref.namelist() if f.endswith('.jsonl')]
                if not jsonl_files_in_zip:
                    print(f"    - Warning: No JSONL file found in {archive_file_path.name}.")
                    continue
                if len(jsonl_files_in_zip) > 1:
                    base_name = archive_file_path.stem
                    matching_files = [f for f in jsonl_files_in_zip if Path(f).stem == base_name]
                    if matching_files:
                        jsonl_filename_inside_zip = matching_files[0]
                        print(f"    - Multiple JSONL files found. Using matching file: {jsonl_filename_inside_zip}")
                    else:
                        jsonl_filename_inside_zip = jsonl_files_in_zip[0]
                        print(f"    - Multiple JSONL files found. Using first file: {jsonl_filename_inside_zip}")
                else:
                    jsonl_filename_inside_zip = jsonl_files_in_zip[0]
                    print(f"    - Found JSONL file: {jsonl_filename_inside_zip}")
                with zip_ref.open(jsonl_filename_inside_zip) as jsonl_file:
                    df = pd.read_json(jsonl_file, lines=True)
                    print(f"    - {jsonl_filename_inside_zip} has {len(df)} record(s).")
                    if not df.empty:
                        api_dataframes.append(df)
                    else:
                        print(f"    - Warning: {jsonl_filename_inside_zip} is empty or could not be parsed.")
        except zipfile.BadZipFile:
            print(f"  - Error: {archive_file_path.name} is not a valid ZIP file or is corrupted.")
        except Exception as e:
            print(f"  - Error processing {archive_file_path.name}: {e}")

if not api_dataframes:
    print(f"\nError: No valid API data was read from '{api_data_folder}'.")
    print("Check that the folder contains .jsonl, .jsonl.gz or ZIP files and that they are not empty.")
    exit()

# --- Combine All API DataFrames ---
print("\nConcatenating data from all files...")
api_df = pd.concat(api_dataframes, ignore_index=True)
print(f"Combined API data shape: {api_df.shape}")

# --- Step 2: Read the Scraped Data JSONL File ---
print(f"\nReading scraped data from '{scraped_data_file}'...")
try:
    # Use compression='infer' in case the scraped file is gzipped.
    scraped_df = pd.read_json(scraped_data_file, lines=True, compression='infer')
    print(f"Scraped data has {len(scraped_df)} record(s). Shape: {scraped_df.shape}")
except FileNotFoundError:
    print(f"Error: Scraped data file '{scraped_data_file}' not found. Exiting.")
    exit()
except Exception as e:
    print(f"Error reading scraped data file: {e}")
    exit()

# --- Step 3: Prepare for Merging ---
api_key_col = 'github_url'
scraped_key_col = 'url'

if api_key_col not in api_df.columns:
    print(f"Error: Key column '{api_key_col}' not found in API data. Available columns: {api_df.columns.tolist()}")
    exit()
if scraped_key_col not in scraped_df.columns:
    print(f"Error: Key column '{scraped_key_col}' not found in scraped data. Available columns: {scraped_df.columns.tolist()}")
    exit()

print(f"\nRenaming '{scraped_key_col}' to '{api_key_col}' in scraped data for merging.")
scraped_df.rename(columns={scraped_key_col: api_key_col}, inplace=True)

# --- Step 4: Merge DataFrames ---
print(f"Merging data on key column '{api_key_col}'...")
merged_df = pd.merge(api_df, scraped_df, on=api_key_col, how='left')
print(f"Merged data shape: {merged_df.shape}")

# --- Step 5: Save the Merged Data ---
print(f"\nSaving merged data to '{output_file}'...")
try:
    merged_df.to_json(output_file, orient='records', lines=True, force_ascii=False)
    print(f"Merged data saved successfully to '{output_file}'.")
except Exception as e:
    print(f"Error saving merged data: {e}")

print("\nScript finished.")


Current working directory: C:\Users\PROBOOK\Classic Isaac\DataGospel\GitHub
Files in 'github_data_compressed': ['repos_2021-01-01_to_2021-03-31.jsonl.gz', 'repos_2021-04-01_to_2021-06-30.jsonl.gz', 'repos_2021-07-01_to_2021-09-30.jsonl.gz', 'repos_2021-10-01_to_2021-12-31.jsonl.gz', 'repos_2022-01-01_to_2022-03-31.jsonl.gz', 'repos_2022-04-01_to_2022-06-30.jsonl.gz', 'repos_2022-07-01_to_2022-09-30.jsonl.gz', 'repos_2022-10-01_to_2022-12-31.jsonl.gz', 'repos_2023-01-01_to_2023-03-31.jsonl.gz', 'repos_2023-04-01_to_2023-06-30.jsonl.gz', 'repos_2023-07-01_to_2023-09-30.jsonl.gz', 'repos_2023-10-01_to_2023-12-31.jsonl.gz', 'repos_2024-01-01_to_2024-03-31.jsonl.gz', 'repos_2024-04-01_to_2024-06-30.jsonl.gz', 'repos_2024-07-01_to_2024-09-30.jsonl.gz']

Searching for '*.jsonl' and '*.jsonl.gz' files in 'github_data_compressed'...
Found 15 file(s). Processing each file...
  - Reading file: repos_2021-01-01_to_2021-03-31.jsonl.gz
    - repos_2021-01-01_to_2021-03-31.jsonl.gz has 1000 record(s)