In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
import numpy as np

import pandas as pd
import json
import ijson
import os

In [None]:
# Peek at the first few raw lines of the dump to see how it is laid out.
preview_line_count = 5
with open("dblp.v12.json", "r", encoding="utf-8") as dblp_file:
    lines_shown = 0
    while lines_shown < preview_line_count:
        raw_line = dblp_file.readline()
        print(raw_line.strip())
        lines_shown += 1



[
{"id":1091,"authors":[{"name":"Makoto Satoh","org":"Shinshu University","id":2312688602},{"name":"Ryo Muramatsu","org":"Shinshu University","id":2482909946},{"name":"Mizue Kayama","org":"Shinshu University","id":2128134587},{"name":"Kazunori Itoh","org":"Shinshu University","id":2101782692},{"name":"Masami Hashimoto","org":"Shinshu University","id":2114054191},{"name":"Makoto Otani","org":"Shinshu University","id":1989208940},{"name":"Michio Shimizu","org":"Nagano Prefectural College","id":2134989941},{"name":"Masahiko Sugimoto","org":"Takushoku University, Hokkaido Junior College","id":2307479915}],"title":"Preliminary Design of a Network Protocol Learning Tool Based on the Comprehension of High School Students: Design by an Empirical Study Using a Simple Mind Map","year":2013,"n_citation":1,"page_start":"89","page_end":"93","doc_type":"Conference","publisher":"Springer, Berlin, Heidelberg","volume":"","issue":"","doi":"10.1007/978-3-642-39476-8_19","references":[2005687710,20180372

In [None]:
import pandas as pd
import json
import ijson
import os

def process_segment(start_line, end_line, input_file, output_file, first_chunk=True):
    """Filter one range of records from a JSON array and append them to a CSV.

    The input is streamed with ``ijson.items(file, "item")`` and buffered in
    batches of 1000 records, so the whole file is never loaded at once.
    Despite the parameter names, positions count *items of the top-level JSON
    array*, not physical text lines.

    Every batch is written with the same fixed set of columns, so rows
    appended without a header always line up with the header row.

    Args:
        start_line: 1-based position of the first array item to process.
        end_line: 1-based position of the last array item to process (inclusive).
        input_file: Path of a JSON file whose top level is an array of objects.
        output_file: CSV path; rows are appended to it (``mode="a"``).
        first_chunk: When true, the first batch that actually writes rows also
            writes the CSV header.

    Returns:
        The number of array items read inside ``[start_line, end_line]``,
        counted before the year filter is applied.
    """
    chunk_size = 1000
    max_year = 2005
    list_columns = ("authors", "references", "fos")
    # Columns copied straight from the records; "venue" is derived separately.
    source_columns = [
        "id", "title", "year", "n_citation", "page_start", "page_end",
        "doc_type", "publisher", "volume", "issue", "doi", "authors",
        "references", "fos",
    ]

    def join_list(values):
        """Render a list as a comma-separated string; anything else becomes ""."""
        if isinstance(values, list):
            return ", ".join(map(str, values))
        return ""

    def venue_name(value):
        """Extract the venue's "raw" name from a nested venue object."""
        if isinstance(value, dict):
            raw = value.get("raw")
            return "" if raw is None else raw
        if isinstance(value, str):
            return value
        return ""

    def write_chunk(records, write_header):
        """Filter and flatten one batch, append it to the CSV; True if rows were written."""
        chunk_df = pd.DataFrame(records)

        if "year" in chunk_df.columns:
            chunk_df = chunk_df[chunk_df["year"] <= max_year]
        if chunk_df.empty:
            return False

        # The venue arrives as a nested object under "venue"; a DataFrame built
        # from the raw records never has a "venue.raw" column. A pre-flattened
        # "venue.raw" column is still honoured as a fallback.
        venue = pd.Series("", index=chunk_df.index, dtype=object)
        if "venue" in chunk_df.columns:
            venue = chunk_df["venue"].apply(venue_name)
        if "venue.raw" in chunk_df.columns:
            flat_venue = chunk_df["venue.raw"].fillna("")
            venue = venue.where(venue != "", flat_venue)

        # reindex() returns a new frame with every expected column (missing
        # ones filled with NaN), which keeps the CSV layout identical for all
        # batches and avoids assigning into a filtered slice.
        out_df = chunk_df.reindex(columns=source_columns)
        for name in list_columns:
            out_df[name] = out_df[name].apply(join_list)
        out_df["venue"] = venue

        out_df.to_csv(output_file, mode="a", index=False, header=write_header)
        return True

    items_read = 0
    buffered = []

    with open(input_file, "rb") as file:
        for position, item in enumerate(ijson.items(file, "item"), 1):
            if position < start_line:
                continue
            if position > end_line:
                break

            buffered.append(item)
            items_read += 1

            if len(buffered) >= chunk_size:
                # Only stop asking for a header once a batch really wrote rows.
                if write_chunk(buffered, first_chunk):
                    first_chunk = False
                buffered = []
                print(f"Processed up to line {position}...")

    # Flush the final, partially filled batch.
    if buffered:
        write_chunk(buffered, first_chunk)

    return items_read

def main():
    """Filter the DBLP dump segment by segment, then optionally merge the CSVs.

    The dump is processed in segments of ``lines_per_segment`` array items,
    each written to its own ``output_filtered_segment_<n>.csv``. After every
    segment the position to resume from is stored in a checkpoint file, so a
    later run continues where the previous one stopped. At the end the user is
    asked whether the segment files should be concatenated into
    ``output_filtered_complete.csv``.
    """
    input_file = "dblp.v12.json"
    base_output_file = "output_filtered"
    lines_per_segment = 1_000_000

    checkpoint_file = "processing_checkpoint.txt"
    start_line = 1
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "r") as f:
            start_line = int(f.read().strip())
        print(f"Resuming from line {start_line}")

    segment_number = (start_line - 1) // lines_per_segment + 1

    while True:
        end_line = start_line + lines_per_segment - 1
        output_file = f"{base_output_file}_segment_{segment_number}.csv"

        # The checkpoint only advances after a whole segment, so a file that
        # already exists when a segment starts from its first item is left
        # over from an interrupted or earlier run. Appending to it would
        # duplicate rows, so start that segment from a clean file.
        if (start_line - 1) % lines_per_segment == 0 and os.path.exists(output_file):
            os.remove(output_file)

        # Every segment is its own CSV and therefore needs its own header;
        # only skip it when appending to a file that already has content.
        needs_header = (
            not os.path.exists(output_file) or os.path.getsize(output_file) == 0
        )

        print(f"\nProcessing segment {segment_number}: lines {start_line} to {end_line}")

        lines_processed = process_segment(
            start_line,
            end_line,
            input_file,
            output_file,
            first_chunk=needs_header
        )

        with open(checkpoint_file, "w") as f:
            f.write(str(start_line + lines_processed))

        # A short segment means the parser ran out of items.
        if lines_processed < lines_per_segment:
            print("\nReached end of file")
            break

        start_line += lines_per_segment
        segment_number += 1

    print("\nAll segments have been processed!")

    combine = input("Would you like to combine all segments into a single file? (y/n): ")
    if combine.strip().lower() == 'y':
        final_output = f"{base_output_file}_complete.csv"
        first_write = True

        for i in range(1, segment_number + 1):
            segment_file = f"{base_output_file}_segment_{i}.csv"
            if not os.path.exists(segment_file) or os.path.getsize(segment_file) == 0:
                continue

            # Stream each segment in pieces and read every field as text:
            # this keeps memory bounded, copies values unchanged and avoids
            # dtype guessing on columns with mixed content.
            for piece in pd.read_csv(
                segment_file,
                dtype=str,
                keep_default_na=False,
                chunksize=100_000
            ):
                piece.to_csv(
                    final_output,
                    # Overwrite on the first write so re-running the merge
                    # does not append a second copy of the data.
                    mode='w' if first_write else 'a',
                    header=first_write,
                    index=False
                )
                first_write = False

        print(f"\nAll segments combined into {final_output}")

# Entry point: start the pipeline only when this code is executed directly,
# not when it is imported from another module.
if __name__ == "__main__":
    main()


Processing segment 1: lines 1 to 1000000
Processed up to line 1000...
Processed up to line 2000...
Processed up to line 3000...
Processed up to line 4000...
Processed up to line 5000...
Processed up to line 6000...
Processed up to line 7000...
Processed up to line 8000...
Processed up to line 9000...
Processed up to line 10000...
Processed up to line 11000...
Processed up to line 12000...
Processed up to line 13000...
Processed up to line 14000...
Processed up to line 15000...
Processed up to line 16000...
Processed up to line 17000...
Processed up to line 18000...
Processed up to line 19000...
Processed up to line 20000...
Processed up to line 21000...
Processed up to line 22000...
Processed up to line 23000...
Processed up to line 24000...
Processed up to line 25000...
Processed up to line 26000...
Processed up to line 27000...
Processed up to line 28000...
Processed up to line 29000...
Processed up to line 30000...
Processed up to line 31000...
Processed up to line 32000...
Process

  pd.read_csv(segment_file).to_csv(
  pd.read_csv(segment_file).to_csv(



All segments combined into output_filtered_complete.csv


In [None]:
# As explained in the report, the output is split into segment files
# (output_filtered_segment_1.csv, output_filtered_segment_2.csv, ...) because
# processing the whole dataset at once ran out of memory: the machine has only 16 GB of RAM.

# So the data is processed in chunks, and the results are finally combined into output_filtered_complete.csv.