In [3]:
import pandas as pd

# Load the dataset with streaming
chunk_iter = pd.read_json('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json', lines=True, chunksize=10000)

# Initialize a DataFrame for all categories
all_categories_df = pd.DataFrame()

# Process each chunk
for chunk in chunk_iter:
    # Extract unique categories from the 'categories' column
    unique_categories = chunk['categories'].str.split().explode().unique()
    temp_df = pd.DataFrame(unique_categories, columns=['category'])
    
    # Append to the overall DataFrame
    all_categories_df = pd.concat([all_categories_df, temp_df], ignore_index=True)

# Get the count of each unique category
category_counts = all_categories_df['category'].value_counts()

# Display the unique categories and their counts
print("Unique Categories:")
print(category_counts)


Unique Categories:
category
hep-th                255
cond-mat.stat-mech    255
quant-ph              255
nlin.CD               255
gr-qc                 255
                     ... 
mtrl-th                 5
supr-con                5
ao-sci                  3
acc-phys                2
cmp-lg                  1
Name: count, Length: 176, dtype: int64


In [4]:
all_categories_df.head()

Unnamed: 0,category
0,hep-ph
1,math.CO
2,cs.CG
3,physics.gen-ph
4,math.CA


In [3]:
# Get all unique categories from the dataset
unique_categories = filtered_df['categories'].str.split().explode().unique()

# Display the unique categories
print(unique_categories)


['hep-ph' 'math.CO' 'cs.CG' 'math.CA' 'math.FA' 'math.NT' 'math.AG'
 'math.AT' 'math.PR' 'nlin.PS' 'physics.chem-ph' 'q-bio.MN' 'math.NA'
 'math.RA' 'q-bio.PE' 'q-bio.CB' 'quant-ph' 'q-bio.QM' 'hep-lat' 'nucl-th'
 'math.OA' 'math.QA' 'math-ph' 'math.MP' 'cs.IT' 'math.IT' 'cs.NE' 'cs.AI'
 'math.DG' 'gr-qc' 'cs.DS' 'math.CV' 'math.DS' 'hep-ex' 'nucl-ex' 'hep-th'
 'math.RT' 'cs.CE' 'cond-mat.stat-mech' 'cs.MS' 'cs.NA' 'physics.data-an'
 'math.GR' 'math.AC' 'math.SG' 'cs.CC' 'math.KT' 'math.GT' 'math.AP'
 'q-bio.OT' 'astro-ph' 'physics.bio-ph' 'q-bio.BM' 'cs.DM' 'cs.LO'
 'physics.optics' 'math.MG' 'math.SP' 'math.ST' 'stat.TH' 'nlin.CD'
 'q-fin.CP' 'q-fin.PR' 'q-bio.NC' 'cond-mat.dis-nn' 'physics.soc-ph'
 'q-fin.RM' 'q-bio.SC' 'math.OC' 'cs.CR' 'math.CT' 'math.LO' 'cs.NI'
 'physics.gen-ph' 'cs.LG' 'physics.atom-ph' 'cs.PF' 'stat.ME' 'math.GM'
 'cs.SE' 'math.GN' 'stat.CO' 'cond-mat.str-el' 'cs.AR' 'cs.SC' 'stat.AP'
 'cond-mat.supr-con' 'stat.ML' 'nlin.SI' 'cs.CY' 'cs.IR' 'q-bio.GN'
 'cs.CV'

In [5]:
import pandas as pd

# Define the broader domains and their sub-categories
category_to_domain = {
    'cs': 'Computer Science',
    'hep-ph': 'Physics',
    'hep-ex': 'Physics',
    'math': 'Mathematics',
    'math.CO': 'Mathematics',
    'math.AP': 'Mathematics',
    'math.DG': 'Mathematics',
    'q-bio': 'Quantitative Biology',
    'q-fin': 'Quantitative Finance',
    # Add more mappings as needed
}

# Function to map categories to broader domains
def map_to_domain(categories):
    domains = set()
    for category in categories.split():
        domain = category_to_domain.get(category.split('.')[0], 'Other')
        domains.add(domain)
    return ', '.join(domains)

# Initialize a DataFrame for broader categories
broad_categories_df = pd.DataFrame()

# Stream and process the dataset
chunk_iter = pd.read_json('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json', lines=True, chunksize=10000)
for chunk in chunk_iter:
    # Map categories to broader domains
    chunk['domain'] = chunk['categories'].apply(map_to_domain)
    
    # Append to the overall DataFrame
    broad_categories_df = pd.concat([broad_categories_df, chunk[['domain']]], ignore_index=True)

# Count occurrences of each domain
domain_counts = broad_categories_df['domain'].str.split(', ').explode().value_counts()

# Display the counts
print("Counts of Broader Domains:")
print(domain_counts)


Counts of Broader Domains:
domain
Other                   1497430
Computer Science         655845
Mathematics              654200
Physics                  210009
Quantitative Biology      46271
Quantitative Finance      19586
Name: count, dtype: int64


In [2]:
import pandas as pd

# Define the specific category you're interested in
target_category = 'q-fin'

# Function to filter the data
def filter_category(chunk):
    # Split categories and check if the target category is present
    chunk['main_category'] = chunk['categories'].apply(lambda x: next((cat.split('.')[0] for cat in x.split() if cat.split('.')[0] == target_category), None))
    # Drop rows where no matching category was found
    return chunk.dropna(subset=['main_category'])

# Initialize an empty DataFrame to store the filtered data
filtered_df = pd.DataFrame()

# Stream and filter the dataset
chunk_iter = pd.read_json('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json', lines=True, chunksize=10000)
for chunk in chunk_iter:
    filtered_chunk = filter_category(chunk)
    filtered_df = pd.concat([filtered_df, filtered_chunk], ignore_index=True)




In [3]:
filtered_df.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed,main_category
0,704.0335,Fabien Panloup,"Gilles Pag\`es (PMA, LSProba), Fabien Panloup ...",Approximation of the distribution of a station...,,"Bernoulli 15, 1 (2009) 146-177",,,math.PR q-fin.CP q-fin.PR,http://arxiv.org/licenses/nonexclusive-distrib...,We build a sequence of empirical measures on...,"[{'version': 'v1', 'created': 'Tue, 3 Apr 2007...",2011-05-31,"[[Pagès, Gilles, , PMA, LSProba], [Panloup, Fa...",q-fin
1,704.0394,Anna Ja\'{s}kiewicz,Anna Ja\'skiewicz,Average optimality for risk-sensitive control ...,Published at http://dx.doi.org/10.1214/1050516...,"Annals of Applied Probability 2007, Vol. 17, N...",10.1214/105051606000000790,IMS-AAP-AAP410,q-fin.RM math.PR,,This paper deals with discrete-time Markov c...,"[{'version': 'v1', 'created': 'Tue, 3 Apr 2007...",2016-08-14,"[[Jaśkiewicz, Anna, ]]",q-fin
2,704.0567,Martin Keller-Ressel,"Martin Keller-Ressel, Thomas Steiner",Yield Curve Shapes and the Asymptotic Short Ra...,,,,,q-fin.PR math.PR,,"We consider a model for interest rates, wher...","[{'version': 'v1', 'created': 'Wed, 4 Apr 2007...",2008-12-02,"[[Keller-Ressel, Martin, ], [Steiner, Thomas, ]]",q-fin
3,704.0589,Wei-Xing Zhou,"Wei-Xing Zhou (ECUST), Didier Sornette (ETH Zu...",Analysis of the real estate market in Las Vega...,24 Elsart pages including 13 pages and 1 table,"Physica A 387 (1), 243-260 (2008)",10.1016/j.physa.2007.08.059,,q-fin.GN physics.soc-ph q-fin.ST,,We analyze 27 house price indexes of Las Veg...,"[{'version': 'v1', 'created': 'Wed, 4 Apr 2007...",2008-12-10,"[[Zhou, Wei-Xing, , ECUST], [Sornette, Didier,...",q-fin
4,704.0664,Jaroslaw Kwapien,"S. Drozdz, M. Forczek, J. Kwapien, P. Oswiecim...",Stock market return distributions: from past t...,to appear in Physica A,"Physica A 383, 59-64 (2007)",10.1016/j.physa.2007.04.130,,q-fin.ST physics.data-an physics.soc-ph,,We show that recent stock market fluctuation...,"[{'version': 'v1', 'created': 'Thu, 5 Apr 2007...",2009-11-13,"[[Drozdz, S., ], [Forczek, M., ], [Kwapien, J....",q-fin


In [5]:
import pandas as pd

# Assuming `filtered_df` is your DataFrame

# Rename 'id' to 'paper_id'
filtered_df = filtered_df.rename(columns={'id': 'paper_id'})

# Drop unnecessary columns and keep the specified ones
columns_to_keep = ['paper_id', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'license', 'abstract']
df_filter = filtered_df[columns_to_keep]

print(df_filter)


              paper_id                                            authors  \
0             704.0335  Gilles Pag\`es (PMA, LSProba), Fabien Panloup ...   
1             704.0394                                  Anna Ja\'skiewicz   
2             704.0567               Martin Keller-Ressel, Thomas Steiner   
3             704.0589  Wei-Xing Zhou (ECUST), Didier Sornette (ETH Zu...   
4             704.0664  S. Drozdz, M. Forczek, J. Kwapien, P. Oswiecim...   
...                ...                                                ...   
19621  physics/0703180                                     Luca Capriotti   
19622  physics/0703181                                     Luca Capriotti   
19623  physics/0703201  Arnab Chatterjee, Sitabhra Sinha, Bikas K. Cha...   
19624  physics/0703208              Pawe{\l} Sieczka, Janusz A. Ho{\l}yst   
19625  physics/0703217  D. Sornette (ETH Zurich) and V.F. Pisarenko (R...   

                                                   title  \
0      Approxim

In [6]:
import pandas as pd


#df = pd.DataFrame(data)

# Save DataFrame to JSON
df_filter.to_json('q-fin_filtered_data.json', orient='records', lines=True)


Performing json splitting before loading to mongo


In [8]:
import json
import os

def split_json_file(input_file, output_dir, chunk_size):
    with open(input_file, 'r') as f:
        data = [json.loads(line) for line in f]
    
    # Determine the number of chunks needed
    num_chunks = (len(data) + chunk_size - 1) // chunk_size
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    for i in range(num_chunks):
        chunk_data = data[i * chunk_size:(i + 1) * chunk_size]
        chunk_file_path = os.path.join(output_dir, f"q-fin_filtered_chunk_{i + 1}.json")
        
        with open(chunk_file_path, 'w') as chunk_file:
            json.dump(chunk_data, chunk_file, indent=4)
    
    print(f"Data has been split into {num_chunks} chunks and saved in {output_dir}")

# Split the JSON file into chunks
input_file = "/kaggle/working/q-fin_filtered_data.json"
output_dir = "fin_filtered_data_chunks"
chunk_size = 1000  # Adjust as needed

split_json_file(input_file, output_dir, chunk_size)


Data has been split into 20 chunks and saved in fin_filtered_data_chunks


In [10]:
import os
import zipfile

def zip_dir(dir_path, zip_path):
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through the directory
        for root, dirs, files in os.walk(dir_path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, dir_path)
                zipf.write(file_path, arcname)
dir_to_zip = '/kaggle/working/fin_data_chunked'
zip_file_path = '/kaggle/working/fin_data_chunks.zip'
zip_dir(dir_to_zip, zip_file_path)

print(f"Directory '{dir_to_zip}' has been compressed to '{zip_file_path}'.")


Directory '/kaggle/working/fin_data_chunked' has been compressed to '/kaggle/working/fin_data_chunks.zip'.


In [9]:
import os
import zipfile

def zip_dir(dir_path, zip_path):
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through the directory
        for root, dirs, files in os.walk(dir_path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, dir_path)
                zipf.write(file_path, arcname)
dir_to_zip = '/kaggle/working/fin_filtered_data_chunks'
zip_file_path = '/kaggle/working/fin_filtered_data_chunks1.zip'
zip_dir(dir_to_zip, zip_file_path)

print(f"Directory '{dir_to_zip}' has been compressed to '{zip_file_path}'.")


Directory '/kaggle/working/fin_filtered_data_chunks' has been compressed to '/kaggle/working/fin_filtered_data_chunks1.zip'.
