In [None]:
import pandas as pd
import json
from pathlib import Path

# Path to the merged dataset file (adjust the path if needed)
dataset_file = Path("combined_github_data.jsonl")

# Load the JSON Lines file into `df`: first with pandas' own reader, then,
# if that raises, by decoding each non-blank line with json.loads.
try:
    print(f"Loading dataset from '{dataset_file}' using pd.read_json...")
    # No `engine` argument: pd.read_json only accepts "ujson" (the default) or
    # "pyarrow", so the former engine='python' made this call raise every time
    # and the notebook always fell through to the slow fallback.
    # compression='infer' handles a gzipped file based on its extension.
    # convert_dates=False keeps date-like columns (e.g. names ending in "_at")
    # as raw values, which is what the json.loads fallback produces as well.
    df = pd.read_json(
        dataset_file,
        lines=True,
        compression='infer',
        convert_dates=False,
    )
except Exception as e:
    print("Error loading the dataset with pd.read_json:", e)
    print("Attempting fallback method (line-by-line manual loading)...")
    try:
        # Read the file line by line and load each JSON object individually,
        # skipping blank lines.
        with dataset_file.open("r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f if line.strip()]
        df = pd.DataFrame(data)

        print("Fallback: Dataset loaded successfully!")
    except Exception as e2:
        print("Fallback method also failed:", e2)
        # Re-raise: without this the cell "succeeds" with `df` undefined (or
        # stale from an earlier run) and later cells fail in confusing ways.
        raise
else:
    # Summary shown only when the primary pd.read_json path succeeded.
    print("Dataset loaded successfully!")
    print("Number of records:", len(df))
    print("DataFrame shape:", df.shape)
    print("Columns:", df.columns.tolist())
    print("\nFirst 5 rows of the dataset:")
    print(df.head())

In [None]:
# Quick size/schema summary of the loaded frame: row count, (rows, columns)
# shape, and the column labels as a plain list.
print("Number of records:", df.shape[0])
print("DataFrame shape:", df.shape)
print("Columns:", list(df.columns))

In [None]:
# Remove pandas' cap on displayed columns so wide frames render every column.
pd.options.display.max_columns = None
# Preview at most the first 1000 rows (rendered as the cell's last expression).
df.iloc[:1000]

In [None]:
# Convert Unix timestamp columns (in milliseconds) to readable date format (no time)
timestamp_cols = ['created_at', 'updated_at', 'pushed_at']
for ts_col in timestamp_cols:
    # Re-run guard: this cell overwrites `df` in place, so on a second run the
    # column already holds datetime.date objects. Feeding those back into
    # pd.to_datetime(..., unit='ms') raises, so skip columns already converted.
    # dropna() keeps missing entries from hiding the 'date' inference.
    if pd.api.types.infer_dtype(df[ts_col].dropna()) == 'date':
        continue
    df[ts_col] = pd.to_datetime(df[ts_col], unit='ms').dt.date

# Convert 'file_count' column to integer (missing counts become 0)
df['file_count'] = df['file_count'].fillna(0).astype(int)

# Give repositories without a description an explicit placeholder
df['description'] = df['description'].fillna('No description')

In [None]:
# Optional columns that are kept whenever the dataset provides them.
keep_even_if_optional = [
    'github_url',
    'repo',
    'file_list',
    'languages_breakdown',
    'readme_preview',
    'sidebar_about_text',
    'commit_count_display',
]

# Core repository metadata columns.
core_columns_to_keep = [
    'repo_name', 'owner', 'description',
    'stars', 'forks', 'watchers',
    'open_issues_count', 'contributors_count_page1', 'language',
    'created_at', 'updated_at', 'pushed_at',
    'community_health_percentage', 'has_readme',
    'readme_size_bytes', 'release_count_page1', 'open_pulls_count_page1',
    'workflow_count', 'file_count', 'query_date_range',
]

# Core columns first, then the optional ones; names absent from `df` are
# dropped from the selection instead of raising a KeyError.
all_columns_to_keep = [*core_columns_to_keep, *keep_even_if_optional]
final_columns_to_keep = [
    name for name in all_columns_to_keep if name in df.columns
]

# Trimmed dataset containing only the selected columns.
df_cleaned = df[final_columns_to_keep]

In [None]:
# Reorder columns in your cleaned dataset
ordered_columns = [
    'repo_name', 'owner', 'github_url', 'description',
    'stars', 'forks', 'watchers', 'open_issues_count', 'open_pulls_count_page1',
    'contributors_count_page1', 'release_count_page1',
    'created_at', 'updated_at', 'pushed_at',
    'has_readme', 'community_health_percentage', 'workflow_count', 'readme_size_bytes',
    'language', 'languages_breakdown',
    'file_count', 'file_list',
    'readme_preview', 'commit_count_display',
    'query_date_range'
]

# Keep only the preferred names that actually exist in the DataFrame ...
ordered_columns = [col for col in ordered_columns if col in df_cleaned.columns]
# ... then append every remaining column in its current order. Selecting only
# the listed names would silently drop columns that were deliberately kept but
# are not in the list above (e.g. 'repo', 'sidebar_about_text'); a reorder
# must not lose data.
ordered_columns += [col for col in df_cleaned.columns if col not in ordered_columns]
df_cleaned = df_cleaned[ordered_columns]

In [None]:
# Display df_cleaned as this cell's output (bare last expression).
df_cleaned

In [None]:
# Write the cleaned frame to CSV in the working directory; index=False leaves
# the row index out of the file.
df_cleaned.to_csv(path_or_buf="GitHub_repo_metadata.csv", index=False)