In [2]:
import os
import json
import pandas as pd
import re

# Define paths
ARTICLES_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/articles'
# Store output directory outside the FYP-data-prep folder
OUTPUT_DIR = '/Users/socheata/Documents/FYP-Khmer-Classification/orginal_articles'
TEXTS_DIR = os.path.join(OUTPUT_DIR, 'texts')
METADATA_PATH = os.path.join(OUTPUT_DIR, 'metadata.csv')

# Create output directories if they don't exist
os.makedirs(TEXTS_DIR, exist_ok=True)

def count_words(text):
  """Count the number of words in a text."""
  # Remove special characters and digits
  words = re.findall(r'\b\w+\b', text)
  return len(words)

def process_articles():
  """Process all article JSON files and create metadata and text files."""
  # List to store all article metadata
  metadata = []
  index = 1
  
  # Get all JSON files in the articles directory
  json_files = [f for f in os.listdir(ARTICLES_DIR) if f.endswith('.json')]
  
  # Process each JSON file
  for json_file in json_files:
    category = os.path.splitext(json_file)[0]  # Get category from filename
    category_counter = 1  # Initialize counter for each category
    
    # Load the JSON file
    with open(os.path.join(ARTICLES_DIR, json_file), 'r', encoding='utf-8') as f:
      articles = json.load(f)
    
    # Process each article in the JSON file
    for article in articles:
      # Extract required fields
      title = article.get('title', '')
      content = article.get('content', '')
      url = article.get('url', '')
      
      # Create docId in the format {category}+{numberOrder}
      doc_id = f"{category}{category_counter}"
      category_counter += 1
      
      # Calculate counts
      char_count = len(content)
      word_count = count_words(content)
      
      # Create text file with title and content
      text_path = os.path.join(TEXTS_DIR, f"{doc_id}.txt")
      with open(text_path, 'w', encoding='utf-8') as f:
        f.write(f"{title}\n\n{content}")
      
      # Add to metadata
      metadata.append({
        'index': index,
        'docId': doc_id,
        'category': category,
        'charCount': char_count,
        'wordCount': word_count,
        'url': url
      })
      
      index += 1
  
  # Create metadata DataFrame and save to CSV
  metadata_df = pd.DataFrame(metadata)
  metadata_df.to_csv(METADATA_PATH, index=False)
  
  print(f"Processed {len(metadata)} articles")
  print(f"Metadata saved to {METADATA_PATH}")
  print(f"Text files saved to {TEXTS_DIR}")
  
  return metadata_df

# Execute the processing function
if __name__ == "__main__":
  metadata_df = process_articles()
  
  # Display the first few rows of the metadata
  display(metadata_df.head())
  
  # Display statistics per category
  category_stats = metadata_df.groupby('category').agg({
    'index': 'count',
    'charCount': ['mean', 'min', 'max'],
    'wordCount': ['mean', 'min', 'max']
  })
  display(category_stats)


Processed 15000 articles
Metadata saved to /Users/socheata/Documents/FYP-Khmer-Classification/orginal_articles/metadata.csv
Text files saved to /Users/socheata/Documents/FYP-Khmer-Classification/orginal_articles/texts


Unnamed: 0,index,docId,category,charCount,wordCount,url
0,1,health1,health,2320,846,https://kohsantepheapdaily.com.kh/article/1133...
1,2,health2,health,2760,1021,https://kohsantepheapdaily.com.kh/article/1445...
2,3,health3,health,739,267,https://kohsantepheapdaily.com.kh/article/2020...
3,4,health4,health,1454,541,https://kohsantepheapdaily.com.kh/article/1431...
4,5,health5,health,1360,495,https://kohsantepheapdaily.com.kh/article/%e1%...


Unnamed: 0_level_0,index,charCount,charCount,charCount,wordCount,wordCount,wordCount
Unnamed: 0_level_1,count,mean,min,max,mean,min,max
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
economic,2500,2525.5612,0,15850,928.1688,0,5684
environment,2500,3795.5372,14,12787,1410.1984,5,4724
health,2500,1958.3284,158,9568,724.836,54,3499
politic,2500,2055.7012,0,47108,754.016,0,7888
sport,2500,1913.0244,284,8489,689.0028,94,3177
technology,2500,1663.1184,0,26378,579.7176,0,8187
