In [1]:
import pandas as pd
import itertools
import json

In [2]:
def read_n_lines(file_path, n_lines=1000, columns=None):
    """
    Read the first n lines from a JSONL file into a pandas DataFrame,
    optionally selecting only specific columns.

    The file is always decoded as UTF-8 so the result does not depend on the
    platform's locale encoding.

    Parameters:
    file_path (str): Path to the JSONL file
    n_lines (int): Number of lines to read (default: 1000)
    columns (list): List of column names to include (default: None, includes all columns).
        A requested column that is absent from a record is filled with None.

    Returns:
    pandas.DataFrame: DataFrame containing the first n lines of JSONL data

    Raises:
    json.JSONDecodeError: If any of the first n lines is not valid JSON
        (this reader is strict and does not skip bad lines).
    """
    selected_data = []

    # Explicit encoding: without it, open() falls back to the locale's
    # preferred encoding, which is not UTF-8 on every platform.
    with open(file_path, "r", encoding="utf-8") as file:
        # islice stops after n_lines without reading the rest of the file
        for line in itertools.islice(file, n_lines):
            full_record = json.loads(line)

            if columns:
                # Keep only the requested keys, in the requested order
                selected_data.append({col: full_record.get(col) for col in columns})
            else:
                selected_data.append(full_record)

    return pd.DataFrame(selected_data)

In [3]:
def read_jsonl_columns(file_path, columns=None):
    """
    Read all lines from a JSONL file into a pandas DataFrame,
    optionally selecting only specific columns.

    This reader is best-effort: lines that are not valid JSON, and lines whose
    JSON value is not an object (for example a bare number or a list), are
    skipped instead of aborting the whole read. The file is always decoded as
    UTF-8 so the result does not depend on the platform's locale encoding.

    Parameters:
    file_path (str): Path to the JSONL file
    columns (list): List of column names to include (default: None, includes all columns).
        A requested column that is absent from a record is filled with None.

    Returns:
    pandas.DataFrame: DataFrame containing all usable lines with selected columns
    """
    selected_data = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            try:
                full_record = json.loads(line)
            except json.JSONDecodeError:
                continue  # Skip malformed JSON lines (including blank lines)

            # Valid JSON that is not an object has no .get(); treat it as
            # malformed so one stray value cannot crash the whole read.
            if not isinstance(full_record, dict):
                continue

            if columns:
                # Keep only the requested keys, in the requested order
                selected_data.append({col: full_record.get(col) for col in columns})
            else:
                selected_data.append(full_record)

    return pd.DataFrame(selected_data)

In [4]:
import pandas as pd
import json
import os


def _append_chunk_to_csv(records, output_csv, write_header):
    """Append one chunk of parsed records to output_csv, optionally with a header row."""
    pd.DataFrame(records).to_csv(
        output_csv, mode="a", header=write_header, index=False
    )


def process_full_jsonl(
    file_path, output_csv, chunk_size=10000, columns=None, resume_after_id=None
):
    """
    Stream a JSONL file into a CSV in chunks, optionally resuming after a given ID.

    Records are buffered and appended to ``output_csv`` every ``chunk_size``
    records, so the whole input never has to fit in memory. Output is always
    appended: calling this twice with the same arguments writes the rows twice.
    Lines that are not valid JSON, or whose JSON value is not an object, are
    skipped.

    Parameters:
    file_path (str): Path to the JSONL file (decoded as UTF-8)
    output_csv (str): Path of the CSV file to append to. A header row is written
        only when this file does not exist yet or is empty.
    chunk_size (int): Number of records to buffer before each write (default: 10000)
    columns (list): List of column names to include (default: None, includes all columns).
        A requested column that is absent from a record is filled with None.
    resume_after_id: If given, records are ignored until one whose "id" equals this
        value has been seen; every record with that ID is itself skipped.

    Returns:
    int: Number of records written to ``output_csv`` by this call

    Note:
    With columns=None each chunk derives its columns from its own records, so
    chunks with differing keys may be appended with differing column layouts.
    Passing an explicit ``columns`` list keeps every chunk aligned.
    """
    total_lines_processed = 0
    chunk_number = 0
    found_resume_id = resume_after_id is None
    selected_data = []

    # A header is needed for a new file and also for an existing but empty one
    # (e.g. left behind by an interrupted run).
    write_headers = (
        not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
    )

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            try:
                full_record = json.loads(line)
            except json.JSONDecodeError:
                continue  # Skip malformed JSON lines (including blank lines)

            # Valid JSON that is not an object has no .get(); skip it rather
            # than abort a run that may already be millions of lines in.
            if not isinstance(full_record, dict):
                continue

            # "is not None" mirrors the found_resume_id initialisation above, so
            # a falsy ID such as 0 still works as a resume marker.
            if resume_after_id is not None and full_record.get("id") == resume_after_id:
                found_resume_id = True
                continue  # Skip the line with the resume ID

            if not found_resume_id:
                continue

            selected_data.append(
                {col: full_record.get(col) for col in columns}
                if columns
                else full_record
            )
            # This is the only place the counter is advanced, so it equals the
            # number of records written once the final flush has happened.
            total_lines_processed += 1

            if len(selected_data) >= chunk_size:
                _append_chunk_to_csv(selected_data, output_csv, write_headers)
                chunk_number += 1
                print(
                    f"Processed chunk {chunk_number}: {total_lines_processed} lines"
                )

                # Reset for next chunk
                selected_data = []
                write_headers = False  # Only write headers once

    # Flush the records left over after the last full chunk. They were already
    # counted one by one above, so the total must not be increased again here.
    if selected_data:
        _append_chunk_to_csv(selected_data, output_csv, write_headers)
        print(f"Final chunk processed. Total lines: {total_lines_processed}")

    if not found_resume_id:
        # Otherwise a mistyped resume ID is indistinguishable from an empty input.
        print(f"Resume ID {resume_after_id!r} was not found; nothing was written.")

    return total_lines_processed

`RS_2020-05` (the posts file) has about 28 million rows.

`RC_2020-05` (the comments file) has about 190 million rows.

In [5]:
# Input dumps: one file of posts, one of comments.
file_path_posts = "RS_2020-05"
file_path_comments = "RC_2020-05"

# Export the selected post fields to posts.csv in chunks of one million
# records. `posts` receives the value returned by process_full_jsonl.
posts = process_full_jsonl(
    file_path=file_path_posts,
    output_csv="posts.csv",
    chunk_size=1_000_000,
    columns=[
        "created_utc", "id", "name", "title", "selftext",
        "subreddit", "score", "upvote_ratio", "num_comments",
        "archived", "author", "distinguished", "media",
    ],
)

# The comments export is wrapped in a string literal, so it does not run. As
# the last expression of the cell, the string itself is what the cell displays.
"""comments = process_full_jsonl(
    file_path_comments,
    "comments.csv",
    chunk_size=1000000,
    columns=[
        "created_utc",
        "id",
        "body",
        "score",
        "controversiality",
        "author",
        "parent_id",
    ],
)"""

Processed chunk 1: 1000000 lines
Processed chunk 2: 2000000 lines
Processed chunk 3: 3000000 lines
Processed chunk 4: 4000000 lines
Processed chunk 5: 5000000 lines
Processed chunk 6: 6000000 lines
Processed chunk 7: 7000000 lines
Processed chunk 8: 8000000 lines
Processed chunk 9: 9000000 lines
Processed chunk 10: 10000000 lines
Processed chunk 11: 11000000 lines
Processed chunk 12: 12000000 lines
Processed chunk 13: 13000000 lines
Processed chunk 14: 14000000 lines
Processed chunk 15: 15000000 lines
Processed chunk 16: 16000000 lines
Processed chunk 17: 17000000 lines
Processed chunk 18: 18000000 lines
Processed chunk 19: 19000000 lines
Processed chunk 20: 20000000 lines
Processed chunk 21: 21000000 lines
Processed chunk 22: 22000000 lines
Processed chunk 23: 23000000 lines
Processed chunk 24: 24000000 lines
Processed chunk 25: 25000000 lines
Processed chunk 26: 26000000 lines
Processed chunk 27: 27000000 lines
Processed chunk 28: 28000000 lines
Final chunk processed. Total lines: 28

'comments = process_full_jsonl(\n    file_path_comments,\n    "comments.csv",\n    chunk_size=1000000,\n    columns=[\n        "created_utc",\n        "id",\n        "body",\n        "score",\n        "controversiality",\n        "author",\n        "parent_id",\n    ],\n)'

In [11]:
# Duplicate check on the first 1,000,000 rows of comments.csv:
# compare the row count before and after dropping fully identical rows.
new_df = pd.read_csv("comments.csv", nrows=1_000_000)
print(f"Total lines in new file: {new_df.shape[0]}")
print(f"Unique lines in new file: {len(new_df.drop_duplicates())}")

Total lines in new file: 1000000
Unique lines in new file: 1000000


In [None]:
# Load the whole posts export into memory and show how many rows it has.
posts = pd.read_csv("posts.csv")
posts.shape[0]

In [5]:
# Show the last five rows of the loaded posts frame.
posts.tail(n=5)

Unnamed: 0,created_utc,id,title,selftext,subreddit,score,upvote_ratio,num_comments,archived,author,distinguished,media
28297180,1590969599,gu9y0n,how are we supposed to know that compound 1 is...,Anyone feel like the OG makes huge logic leaps...,Mcat,1,1.0,5,True,Essie413,,
28297181,1590969599,gu9y0o,On A Plane/Champ will most likely be on WUNNA ...,,JuiceWRLD,193,0.99,8,True,Erwin1999,,{'reddit_video': {'dash_url': 'https://v.redd....
28297182,1590969599,gu9y0p,learn better grammar you incel,[deleted],memes,37,0.91,0,True,[deleted],,
28297183,1590969599,gu9y0q,"To anyone going protesting, or anyone in general",[deleted],FIU,17,0.72,17,True,[deleted],,
28297184,1590969599,gu9y0r,Can anything be done to protect the domain? Pe...,,Austin,0,0.33,5,True,Jublusion,,
