In [4]:
# Convert Khmer Stopwords from CSV to Text File

import pandas as pd
import os

def convert_csv_to_txt(input_file="Khmer-Stop-Word-1000.csv",
                       output_file="Khmer-Stop-Word-1000.txt"):
    """Export the first column of a stopword CSV as a one-word-per-line text file.

    The CSV is loaded with ``pd.read_csv`` using its defaults, so the first
    row of the file is consumed as the column header and is NOT written out
    as a stopword. Missing values in the first column are dropped and the
    remaining values are converted to ``str``.

    After writing, the text file is read back and its line count is compared
    with the number of extracted stopwords. Progress, previews and the
    verification result are printed to stdout.

    Args:
        input_file: Path of the CSV to read. If it does not exist, the user
            is prompted on stdin for a replacement path.
        output_file: Path of the UTF-8 text file to create or overwrite.

    Returns:
        True if the file was written and the read-back line count matches the
        number of extracted stopwords; False if any step raised an exception
        or the verification count differs.
    """
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found.")
        input_file = input("Please enter the correct path to the CSV file: ")

    try:
        # Load the CSV file (first row becomes the header)
        df = pd.read_csv(input_file)
        print(f"Successfully loaded CSV file with {len(df)} rows.")
        print("\nFirst 5 rows:")
        print(df.head())

        # Process the stopwords (assuming first column contains stopwords)
        stopwords_column = df.columns[0]
        stopwords = df[stopwords_column].dropna().astype(str).tolist()

        print(f"Extracted {len(stopwords)} stopwords from column '{stopwords_column}'")
        print("\nFirst 10 stopwords:")
        for position, word in enumerate(stopwords[:10], start=1):
            print(f"{position}. {word}")

        # Save to text file, one stopword per line
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{word}\n" for word in stopwords)

        print(f"\nSuccessfully saved {len(stopwords)} stopwords to '{output_file}'")

        # Verify the output by reading it back
        with open(output_file, 'r', encoding='utf-8') as f:
            saved_stopwords = f.read().splitlines()

        print(f"Verification: Text file contains {len(saved_stopwords)} stopwords")

        counts_match = len(saved_stopwords) == len(stopwords)
        if counts_match:
            print("✅ Number of stopwords matches the original count.")
        else:
            print("❌ Number of stopwords does not match the original count.")

        print("\nFirst 10 stopwords from the text file:")
        for position, word in enumerate(saved_stopwords[:10], start=1):
            print(f"{position}. {word}")

        # A failed verification must not be reported as success.
        return counts_match

    except Exception as e:
        # Broad catch is deliberate: the function's contract is a boolean
        # status plus a printed message rather than a propagated exception.
        print(f"Error: {e}")
        return False

# Run the conversion when this cell/script is executed directly
# (__name__ is "__main__" there). The boolean status it returns is not used.
if __name__ == "__main__":
    convert_csv_to_txt()


Successfully loaded CSV file with 1040 rows.

First 5 rows:
        this
0         ក៏
1    ក៏ត្រូវ
2  ក៏ប៉ុន្តែ
3        កើត
4     កើតឡើង
Extracted 1040 stopwords from column 'this'

First 10 stopwords:
1. ក៏
2. ក៏ត្រូវ
3. ក៏ប៉ុន្តែ
4. កើត
5. កើតឡើង
6. ក្បែរ
7. ក្បែរនេះ
8. កំពុង
9. កំពុងតែ
10. កាន់

Successfully saved 1040 stopwords to 'Khmer-Stop-Word-1000.txt'
Verification: Text file contains 1040 stopwords
✅ Number of stopwords matches the original count.

First 10 stopwords from the text file:
1. ក៏
2. ក៏ត្រូវ
3. ក៏ប៉ុន្តែ
4. កើត
5. កើតឡើង
6. ក្បែរ
7. ក្បែរនេះ
8. កំពុង
9. កំពុងតែ
10. កាន់


In [5]:
# Clean Khmer stopwords file
# Strip leading/trailing whitespace, remove _x000D_ markers, and drop empty lines

def clean_stopwords_file(input_file='khmer_stopwords.txt'):
    """Clean a one-word-per-line stopword file in place.

    For every line, each literal ``_x000D_`` marker is removed and leading and
    trailing whitespace is stripped; lines that end up empty are dropped.
    The cleaned lines are written to a sibling scratch file
    (``<name>_clean<ext>``), which then replaces ``input_file`` via
    ``os.replace``. A summary is printed to stdout.

    Args:
        input_file: Path of the UTF-8 text file to clean and overwrite.

    Returns:
        True on success; False if any step raised an exception (the error is
        printed and a partially written scratch file is removed).
    """
    # Local import keeps this cell runnable on its own.
    import os

    # Scratch path next to the input; for the default input this is
    # 'khmer_stopwords_clean.txt'.
    root, ext = os.path.splitext(input_file)
    temp_file = f"{root}_clean{ext}"
    temp_written = False

    try:
        # Read the original file
        with open(input_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Remove _x000D_, strip whitespace, and keep only non-empty lines
        stripped = (line.replace('_x000D_', '').strip() for line in lines)
        cleaned_lines = [line for line in stripped if line]

        # Write the cleaned content to the scratch file
        with open(temp_file, 'w', encoding='utf-8') as f:
            temp_written = True
            f.writelines(f"{line}\n" for line in cleaned_lines)

        # Swap the scratch file into place of the original
        os.replace(temp_file, input_file)
        temp_written = False

        print(f"Successfully cleaned {len(lines)} lines to {len(cleaned_lines)} lines.")
        print(f"Removed {len(lines) - len(cleaned_lines)} empty lines.")
        print("Removed _x000D_ characters and spaces.")

    except Exception as e:
        print(f"Error: {e}")
        # Do not leave a half-written scratch file behind. Only remove it if
        # this call created it, so a read failure never deletes anything.
        if temp_written and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except OSError:
                pass  # best-effort cleanup; the original error was already reported
        return False

    return True

# Run the cleanup when this cell/script is executed directly
# (__name__ is "__main__" there). The boolean status it returns is not used.
if __name__ == "__main__":
    clean_stopwords_file()

Successfully cleaned 384 lines to 384 lines.
Removed 0 empty lines.
Removed _x000D_ characters and spaces.
