In [1]:
import pandas as pd
import os

In [None]:
# Input CSV path -> the single topic label applied to every row read from that file.
files_map = {
    '../data/raw/voa_khmer_rights_final.csv': 'right',
    '../data/raw/voa_khmer_environment_final.csv': 'environment',
    '../data/raw/voa_khmer_science_final.csv': 'science',
    '../data/raw/voa_khmer_education_final.csv': 'education',
    '../data/raw/voa_khmer_education_2_final.csv': 'education',
    '../data/raw/voa_khmer_culture_final.csv': 'culture'
}

# Output schema: the text column followed by one 0/1 indicator column per topic.
final_columns = [
    'text', 'culture', 'economic', 'education',
    'environment', 'health', 'politics', 'right', 'science'
]

# Values of the 'content' column that mark a row as unusable; such rows are dropped.
placeholder_contents = [
    'No content found',
    'Error: No content found (Structure changed?)',
]

# Upper bound on the number of rows printed in the final preview.
preview_limit = 5


def build_label_frame(texts, label, columns):
    """Build a one-hot labelled frame for a batch of texts sharing one label.

    Parameters
    ----------
    texts : pandas.Series or list-like
        Values for the 'text' column. A Series keeps its index.
    label : str
        Name of the indicator column to set to 1 for every row.
    columns : list of str
        Full output column order; must contain 'text'. Every other entry is
        treated as an indicator column.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly `columns`, where `label` is 1 and all other
        indicator columns are 0.

    Raises
    ------
    ValueError
        If `label` is not one of the indicator columns. Without this check a
        misspelled label would be dropped by the final column selection and
        the rows would end up with no label at all, and a label of 'text'
        would overwrite the text itself.
    """
    label_columns = [col for col in columns if col != 'text']
    if label not in label_columns:
        raise ValueError(
            f"Unknown label {label!r}; expected one of {label_columns}"
        )

    frame = pd.DataFrame({'text': texts})
    for col in label_columns:
        frame[col] = 1 if col == label else 0
    return frame[list(columns)]


dataframes = []

print("üöÄ Starting processing...")

for file_path, category_column in files_map.items():
    # Missing inputs are reported and skipped rather than aborting the whole merge.
    if not os.path.exists(file_path):
        print(f"‚ùå File not found: {file_path}")
        continue

    print(f"   Processing: {os.path.basename(file_path)} -> Label: {category_column}")

    df = pd.read_csv(file_path)

    initial_count = len(df)
    # Drop placeholder rows first, then rows whose content is missing.
    df = df[~df['content'].isin(placeholder_contents)]
    df = df.dropna(subset=['content'])

    print(f"      - Cleaned: {initial_count} rows -> {len(df)} rows")

    temp_df = build_label_frame(df['content'], category_column, final_columns)
    dataframes.append(temp_df)

if dataframes:
    final_df = pd.concat(dataframes, ignore_index=True)

    # Save to a new CSV
    output_path = '../data/raw/merged_khmer_dataset.csv'
    final_df.to_csv(output_path, index=False, encoding='utf-8-sig')

    print(f"\n‚úÖ Success! Merged file saved at: {output_path}")
    print(f"üìä Total Rows: {len(final_df)}")
    print("Preview of the data:")
    # sample(n) raises ValueError when n exceeds the row count, so cap the
    # preview size for small merged frames.
    print(final_df.sample(min(preview_limit, len(final_df))))
else:
    print("\n‚ö†Ô∏è No dataframes were processed. Check your file paths.")

üöÄ Starting processing...
   Processing: voa_khmer_rights_final.csv -> Label: right
      - Cleaned: 150 rows -> 147 rows
   Processing: voa_khmer_environment_final.csv -> Label: environment
      - Cleaned: 250 rows -> 224 rows
   Processing: voa_khmer_science_final.csv -> Label: science
      - Cleaned: 566 rows -> 384 rows
   Processing: voa_khmer_education_final.csv -> Label: education
      - Cleaned: 500 rows -> 383 rows
   Processing: voa_khmer_education_2_final.csv -> Label: education
      - Cleaned: 250 rows -> 177 rows
   Processing: voa_khmer_culture_final.csv -> Label: culture
      - Cleaned: 600 rows -> 463 rows

‚úÖ Success! Merged file saved at: /content/drive/MyDrive/ITC/WR/KhmerTextClassification/dataset/merged_khmer_dataset.csv
üìä Total Rows: 1778
Preview of the data:
                                                   text  culture  economic  \
1337  ·ûó·üí·ûì·üÜ·ûñ·üÅ·ûâ ‚Äî ·ûÄ·üí·ûö·üÑ·ûò‚Äã·ûÄ·û∑·ûÖ·üí·ûÖ‚Äã·ûü·û†·ûî·üí·ûö·ûè·û∑·ûî·ûè·üí·ûè·û∑‚Äã·ûÄ·û∂·ûö‚Äã