In [1]:
# Convert Khmer Stopwords from Excel to Text File

# Import required libraries
import pandas as pd
import os

def convert_excel_to_txt():
    # Define file paths
    input_file = 'khmer stopwords-corpus-385.xlsx'
    output_file = 'khmer_stopwords.txt'

    # Check if input file exists
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found.")
        input_file = input("Please enter the correct path to the Excel file: ")

    try:
        # Load the Excel file
        df = pd.read_excel(input_file)
        print(f"Successfully loaded Excel file with {len(df)} rows.")
        print("\nFirst 5 rows:")
        print(df.head())

        # Process the stopwords (assuming first column contains stopwords)
        stopwords_column = df.columns[0]
        stopwords = df[stopwords_column].dropna().astype(str).tolist()
        
        print(f"Extracted {len(stopwords)} stopwords from column '{stopwords_column}'")
        print("\nFirst 10 stopwords:")
        for i, word in enumerate(stopwords[:10]):
            print(f"{i+1}. {word}")

        # Save to text file
        with open(output_file, 'w', encoding='utf-8') as f:
            for word in stopwords:
                f.write(f"{word}\n")
        
        print(f"\nSuccessfully saved {len(stopwords)} stopwords to '{output_file}'")

        # Verify the output
        with open(output_file, 'r', encoding='utf-8') as f:
            saved_stopwords = f.read().splitlines()
        
        print(f"Verification: Text file contains {len(saved_stopwords)} stopwords")
        
        if len(saved_stopwords) == len(stopwords):
            print("✅ Number of stopwords matches the original count.")
        else:
            print("❌ Number of stopwords does not match the original count.")
        
        print("\nFirst 10 stopwords from the text file:")
        for i, word in enumerate(saved_stopwords[:10]):
            print(f"{i+1}. {word}")
            
        return True
            
    except Exception as e:
        print(f"Error: {e}")
        return False

if __name__ == "__main__":
    convert_excel_to_txt()

Successfully loaded Excel file with 384 rows.

First 5 rows:
   បន្ថែម_x000D_
0       \tខណៈនោះ
1     \tខាងក្រោម
2  \tគ្រប់គ្រាន់
3     \tដោយសារតែ
4     \tនៅពេលនោះ
Extracted 384 stopwords from column 'បន្ថែម_x000D_'

First 10 stopwords:
1. 	ខណៈនោះ
2. 	ខាងក្រោម
3. 	គ្រប់គ្រាន់
4. 	ដោយសារតែ
5. 	នៅពេលនោះ
6. 	ប្រទះឃើញ
7. 	ម្យ៉ាងទៀត
8. 	យ៉ាងខាប់
9. 	លើក
10. ​ប្រាំ

Successfully saved 384 stopwords to 'khmer_stopwords.txt'
Verification: Text file contains 409 stopwords
❌ Number of stopwords does not match the original count.

First 10 stopwords from the text file:
1. 	ខណៈនោះ
2. 	ខាងក្រោម
3. 	គ្រប់គ្រាន់
4. 	ដោយសារតែ
5. 	នៅពេលនោះ
6. 	ប្រទះឃើញ
7. 	ម្យ៉ាងទៀត
8. 	យ៉ាងខាប់
9. 	លើក
10. ​ប្រាំ


In [4]:
# Clean Khmer stopwords file
# Remove spaces, _x000D_ characters, and empty lines

def clean_stopwords_file():
    input_file = 'khmer_stopwords.txt'
    temp_file = 'khmer_stopwords_clean.txt'
    
    try:
        # Read the original file
        with open(input_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        
        # Clean the lines
        cleaned_lines = []
        for line in lines:
            # Remove _x000D_ and strip whitespace
            cleaned_line = line.replace('_x000D_', '').strip()
            
            # Only keep non-empty lines
            if cleaned_line:
                cleaned_lines.append(cleaned_line)
        
        # Write the cleaned content back to file
        with open(temp_file, 'w', encoding='utf-8') as f:
            for line in cleaned_lines:
                f.write(f"{line}\n")
        
        # Optional: replace the original file
        import os
        os.replace(temp_file, input_file)
        
        print(f"Successfully cleaned {len(lines)} lines to {len(cleaned_lines)} lines.")
        print(f"Removed {len(lines) - len(cleaned_lines)} empty lines.")
        print(f"Removed _x000D_ characters and spaces.")
        
    except Exception as e:
        print(f"Error: {e}")
        return False
    
    return True

if __name__ == "__main__":
    clean_stopwords_file()

Successfully cleaned 384 lines to 384 lines.
Removed 0 empty lines.
Removed _x000D_ characters and spaces.
