In [2]:
import pandas as pd


In [None]:
def load_subreddit_jsonl(subreddit, content_type, folder='jsons'):
    """Load one subreddit dump into a DataFrame.

    Parameters
    ----------
    subreddit : str
        Subreddit name as it appears in the file name (e.g. ``'AskScience'``).
    content_type : str
        Second part of the file name, ``'posts'`` or ``'comments'``.
    folder : str, default 'jsons'
        Directory that holds the ``r_<subreddit>_<content_type>.jsonl`` files.

    Returns
    -------
    pandas.DataFrame
        One row per line of the JSON Lines file.
    """
    # lines=True: the file holds one JSON object per line (JSON Lines),
    # not a single JSON document.
    return pd.read_json(f'{folder}/r_{subreddit}_{content_type}.jsonl', lines=True)


# One DataFrame per (subreddit, content type); the variable names mirror
# the file names without the leading "r_" and the extension.
AskScience_posts = load_subreddit_jsonl('AskScience', 'posts')
AskScience_comments = load_subreddit_jsonl('AskScience', 'comments')
Medicine_posts = load_subreddit_jsonl('Medicine', 'posts')
Medicine_comments = load_subreddit_jsonl('Medicine', 'comments')
Psychology_posts = load_subreddit_jsonl('Psychology', 'posts')
Psychology_comments = load_subreddit_jsonl('Psychology', 'comments')



In [7]:
import os
import pandas as pd
import re

# Path to the folder containing JSONL files
json_folder = 'jsons'

# DataFrames collected per content type; these lists are concatenated later
post_dfs = []
comment_dfs = []

# Every loaded DataFrame keyed by "<subreddit>_<posts|comments>" for reference
dfs_by_subreddit = {}

# Expected file name layout: r_<subreddit>_<posts|comments>.jsonl
# The pattern is applied with fullmatch() so that stray copies such as
# "r_x_posts.jsonl.bak.jsonl" are ignored instead of silently replacing the
# real entry in dfs_by_subreddit.
filename_pattern = re.compile(r'r_(\w+)_(posts|comments)\.jsonl')

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the merged output identical across machines and re-runs.
for filename in sorted(os.listdir(json_folder)):
    match = filename_pattern.fullmatch(filename)
    if not match:
        continue

    subreddit, content_type = match.groups()
    file_path = os.path.join(json_folder, filename)

    print(f"Processing {filename}...")

    used_fallback = False
    try:
        # Read JSONL directly with pandas
        df = pd.read_json(file_path, lines=True)
    except Exception as e:
        # Best-effort load: report the problem and retry once, this time
        # replacing undecodable bytes instead of failing on them.
        print(f"  Error processing {filename}: {e}")
        print("  Trying alternative method...")

        try:
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                df = pd.read_json(f, lines=True)
        except Exception as e2:
            # Both attempts failed: skip this file, keep processing the rest.
            print(f"  Failed with fallback method: {e2}")
            continue
        used_fallback = True

    # Store the frame once, whichever read path produced it.
    # content_type can only be 'posts' or 'comments' (enforced by the regex).
    target_list = post_dfs if content_type == 'posts' else comment_dfs
    target_list.append(df)
    dfs_by_subreddit[f"{subreddit}_{content_type}"] = df

    suffix = " using fallback method" if used_fallback else ""
    print(f"  Added {len(df)} rows from {filename}{suffix}")

# Concatenate a list of DataFrames and persist the result as CSV
def merge_dataframes(dfs, output_file):
    """Stack ``dfs`` into one DataFrame and write it to ``output_file``.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames to concatenate row-wise, in list order.
    output_file : str
        Destination path of the CSV file (written without the index column).

    Returns
    -------
    pandas.DataFrame or None
        The concatenated frame with a fresh 0..n-1 index, or ``None`` when
        ``dfs`` is empty (in which case no file is written).
    """
    if dfs:
        # ignore_index=True discards the per-file indices and renumbers rows
        combined = pd.concat(dfs, ignore_index=True)
        combined.to_csv(output_file, index=False)
        print(f"Successfully created {output_file} with {len(combined)} rows")
        return combined

    # Nothing to merge: report it and signal the caller with None
    print(f"No DataFrames to merge for {output_file}")
    return None

# Create variables for individual DataFrames that might be referenced later
# (AskScience_posts, Medicine_comments, etc.).
# The names are bound through globals() rather than exec(): keys are built
# from file names, and a key that is not a valid identifier (for example a
# subreddit name starting with a digit) would make exec() raise SyntaxError.
for key, df in dfs_by_subreddit.items():
    if key.isidentifier():
        globals()[key] = df
    else:
        # Still reachable through the dictionary, just not as a bare name
        print(f"Skipping variable for {key!r}: not a valid Python identifier "
              f"(use dfs_by_subreddit[{key!r}] instead)")

# Build the two merged tables (posts, comments); each call also writes a CSV
merged_posts = merge_dataframes(post_dfs, 'merged_posts.csv')
merged_comments = merge_dataframes(comment_dfs, 'merged_comments.csv')

# Report the shape of every merge that actually produced a DataFrame
# (merge_dataframes returns None when it had nothing to merge)
for label, merged in (
    ("Merged posts shape:", merged_posts),
    ("Merged comments shape:", merged_comments),
):
    if merged is not None:
        print(label, merged.shape)

# Show which per-subreddit DataFrames are available
print("\nCreated DataFrames:")
for name in dfs_by_subreddit:
    print(f"- {name}")

Processing r_AskAcademia_comments.jsonl...
  Added 185066 rows from r_AskAcademia_comments.jsonl
Processing r_AskAcademia_posts.jsonl...
  Added 18281 rows from r_AskAcademia_posts.jsonl
Processing r_AskScience_comments.jsonl...
  Added 130940 rows from r_AskScience_comments.jsonl
Processing r_AskScience_posts.jsonl...
  Added 95440 rows from r_AskScience_posts.jsonl
Processing r_Medicine_comments.jsonl...
  Added 297694 rows from r_Medicine_comments.jsonl
Processing r_Medicine_posts.jsonl...
  Added 19474 rows from r_Medicine_posts.jsonl
Processing r_Psychology_comments.jsonl...
  Added 160289 rows from r_Psychology_comments.jsonl
Processing r_Psychology_posts.jsonl...
  Added 12277 rows from r_Psychology_posts.jsonl
Processing r_research_posts.jsonl...
  Added 7681 rows from r_research_posts.jsonl
Processing r_science_comments.jsonl...
  Added 1238728 rows from r_science_comments.jsonl
Processing r_science_posts.jsonl...
  Added 24984 rows from r_science_posts.jsonl
Successfully crea