In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import concurrent.futures

In [2]:
def remove_quotes_from_file(path):
    """Strip every single-quote character (') from the file at ``path``, in place.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are carried
    through unchanged (``errors='surrogateescape'``) and line endings are left
    exactly as found (``newline=''``), so the removed quotes are the only
    difference between the old and the new file contents.

    Args:
        path: Path of the text file to rewrite.

    Returns:
        None. Progress is reported with ``print``. Any exception raised while
        reading or writing is caught and printed instead of being propagated.
    """
    try:
        # Indicate that processing of the file has started
        print(f"Processing file '{path}'...")

        # surrogateescape round-trips undecodable bytes instead of replacing
        # them with U+FFFD; newline='' disables newline translation.
        with open(path, 'r', encoding='utf-8', errors='surrogateescape', newline='') as file:
            original = file.read()

        content = original.replace("'", "")

        # Opening with 'w' truncates the file before anything is written, so
        # only rewrite when there is actually something to change.
        if content != original:
            with open(path, 'w', encoding='utf-8', errors='surrogateescape', newline='') as file:
                file.write(content)

        # Indicate successful cleaning
        print(f"File '{path}' has been cleaned.")

    except Exception as e:
        # Decoding can no longer fail (surrogateescape), so a dedicated
        # UnicodeDecodeError branch would be dead code; one handler suffices.
        print(f"An error occurred: {e} for file {path}")

In [3]:
# Function to process files in parallel
def process_files_in_parallel(folder_path, max_workers=None):
    """Run ``remove_quotes_from_file`` on every ``.txt`` file in ``folder_path``.

    Only regular files located directly inside ``folder_path`` are processed:
    sub-directories are not searched, and a directory whose name ends in
    ``.txt`` is skipped. Paths are submitted to the pool in sorted name order.

    Args:
        folder_path: Directory to scan for ``.txt`` files.
        max_workers: Optional upper bound on the number of worker threads.
            ``None`` (the default) lets ``ThreadPoolExecutor`` pick its own.

    Returns:
        None. Progress is reported with ``print``.
    """
    # List all .txt entries in the folder, in a deterministic order
    candidates = (
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.endswith(".txt")
    )
    # Keep regular files only; open() on a directory would just fail later
    file_paths = [candidate for candidate in candidates if os.path.isfile(candidate)]

    # Indicate that parallel processing is starting
    print(f"Starting parallel processing for {len(file_paths)} files...")

    # Use ThreadPoolExecutor to process files in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # An exception raised inside a worker is only re-raised when its
        # result is pulled from the map iterator, so drain the iterator
        # rather than discarding it.
        for _ in executor.map(remove_quotes_from_file, file_paths):
            pass

    # Indicate that processing is complete
    print("Parallel processing complete.")


In [4]:
# Strip single quotes, in place, from every .txt file in the current working directory.
folder_path = "."
process_files_in_parallel(folder_path=folder_path)

Starting parallel processing for 1001 files...
Processing file '.\page_1.txt'...
Processing file '.\page_10.txt'...
Processing file '.\page_100.txt'...
Processing file '.\page_1000.txt'...
Processing file '.\page_1001.txt'...
Processing file '.\page_101.txt'...
Processing file '.\page_102.txt'...
Processing file '.\page_103.txt'...
Processing file '.\page_104.txt'...
Processing file '.\page_105.txt'...
Processing file '.\page_106.txt'...
Processing file '.\page_107.txt'...
Processing file '.\page_108.txt'...
Processing file '.\page_109.txt'...
Processing file '.\page_11.txt'...
Processing file '.\page_110.txt'...
File '.\page_106.txt' has been cleaned.
Processing file '.\page_111.txt'...
File '.\page_103.txt' has been cleaned.
Processing file '.\page_112.txt'...
File '.\page_1.txt' has been cleaned.
Processing file '.\page_113.txt'...
File '.\page_107.txt' has been cleaned.
Processing file '.\page_114.txt'...
File '.\page_108.txt' has been cleaned.
Processing file '.\page_115.txt'...
F

File '.\page_187.txt' has been cleaned.
Processing file '.\page_202.txt'...
File '.\page_190.txt' has been cleaned.
Processing file '.\page_203.txt'...
File '.\page_195.txt' has been cleaned.
Processing file '.\page_204.txt'...
File '.\page_188.txt' has been cleaned.
Processing file '.\page_205.txt'...
File '.\page_192.txt' has been cleaned.
Processing file '.\page_206.txt'...
File '.\page_193.txt' has been cleaned.
Processing file '.\page_207.txt'...
File '.\page_194.txt' has been cleaned.
Processing file '.\page_208.txt'...
File '.\page_191.txt' has been cleaned.
Processing file '.\page_209.txt'...
File '.\page_2.txt' has been cleaned.
Processing file '.\page_21.txt'...
File '.\page_199.txt' has been cleaned.
Processing file '.\page_210.txt'...
File '.\page_198.txt' has been cleaned.
Processing file '.\page_211.txt'...
File '.\page_200.txt' has been cleaned.
Processing file '.\page_212.txt'...
File '.\page_197.txt' has been cleaned.
Processing file '.\page_213.txt'...
File '.\page_19

File '.\page_288.txt' has been cleaned.
Processing file '.\page_302.txt'...
File '.\page_292.txt' has been cleaned.
Processing file '.\page_303.txt'...
File '.\page_290.txt' has been cleaned.
Processing file '.\page_304.txt'...
File '.\page_291.txt' has been cleaned.
Processing file '.\page_305.txt'...
File '.\page_29.txt' has been cleaned.
Processing file '.\page_306.txt'...
File '.\page_297.txt' has been cleaned.
Processing file '.\page_307.txt'...
File '.\page_299.txt' has been cleaned.
Processing file '.\page_308.txt'...
File '.\page_294.txt' has been cleaned.
Processing file '.\page_309.txt'...
File '.\page_298.txt' has been cleaned.
Processing file '.\page_31.txt'...
File '.\page_30.txt' has been cleaned.
Processing file '.\page_310.txt'...
File '.\page_293.txt' has been cleaned.
Processing file '.\page_311.txt'...
File '.\page_296.txt' has been cleaned.
Processing file '.\page_312.txt'...
File '.\page_3.txt' has been cleaned.File '.\page_300.txt' has been cleaned.
Processing fil

File '.\page_39.txt' has been cleaned.
Processing file '.\page_400.txt'...
File '.\page_386.txt' has been cleaned.
Processing file '.\page_401.txt'...
File '.\page_387.txt' has been cleaned.
Processing file '.\page_402.txt'...
File '.\page_391.txt' has been cleaned.
Processing file '.\page_403.txt'...
File '.\page_394.txt' has been cleaned.
Processing file '.\page_404.txt'...
File '.\page_390.txt' has been cleaned.
Processing file '.\page_405.txt'...
File '.\page_380.txt' has been cleaned.
Processing file '.\page_406.txt'...
File '.\page_395.txt' has been cleaned.
Processing file '.\page_407.txt'...
File '.\page_393.txt' has been cleaned.
Processing file '.\page_408.txt'...
File '.\page_396.txt' has been cleaned.
Processing file '.\page_409.txt'...
File '.\page_392.txt' has been cleaned.
Processing file '.\page_41.txt'...
File '.\page_399.txt' has been cleaned.
Processing file '.\page_410.txt'...
File '.\page_40.txt' has been cleaned.
Processing file '.\page_411.txt'...
File '.\page_4.

File '.\page_489.txt' has been cleaned.
Processing file '.\page_502.txt'...
File '.\page_49.txt' has been cleaned.
Processing file '.\page_503.txt'...
File '.\page_491.txt' has been cleaned.
Processing file '.\page_504.txt'...
File '.\page_490.txt' has been cleaned.
Processing file '.\page_505.txt'...
File '.\page_492.txt' has been cleaned.
Processing file '.\page_506.txt'...
File '.\page_497.txt' has been cleaned.
Processing file '.\page_507.txt'...
File '.\page_495.txt' has been cleaned.
Processing file '.\page_508.txt'...
File '.\page_494.txt' has been cleaned.
Processing file '.\page_509.txt'...
File '.\page_50.txt' has been cleaned.
Processing file '.\page_51.txt'...
File '.\page_493.txt' has been cleaned.
Processing file '.\page_510.txt'...
File '.\page_496.txt' has been cleaned.
Processing file '.\page_511.txt'...
File '.\page_500.txt' has been cleaned.
Processing file '.\page_512.txt'...
File '.\page_499.txt' has been cleaned.
Processing file '.\page_513.txt'...
File '.\page_5.

File '.\page_59.txt' has been cleaned.
Processing file '.\page_600.txt'...
File '.\page_592.txt' has been cleaned.
Processing file '.\page_601.txt'...
File '.\page_583.txt' has been cleaned.
Processing file '.\page_602.txt'...
File '.\page_584.txt' has been cleaned.
Processing file '.\page_603.txt'...
File '.\page_582.txt' has been cleaned.
Processing file '.\page_604.txt'...
File '.\page_591.txt' has been cleaned.
Processing file '.\page_605.txt'...
File '.\page_590.txt' has been cleaned.
Processing file '.\page_606.txt'...
File '.\page_593.txt' has been cleaned.
Processing file '.\page_607.txt'...
File '.\page_597.txt' has been cleaned.
Processing file '.\page_608.txt'...
File '.\page_595.txt' has been cleaned.
Processing file '.\page_609.txt'...
File '.\page_598.txt' has been cleaned.
Processing file '.\page_61.txt'...
File '.\page_599.txt' has been cleaned.
Processing file '.\page_610.txt'...
File '.\page_600.txt' has been cleaned.
Processing file '.\page_611.txt'...
File '.\page_5

File '.\page_692.txt' has been cleaned.
Processing file '.\page_703.txt'...
File '.\page_690.txt' has been cleaned.
Processing file '.\page_704.txt'...
File '.\page_691.txt' has been cleaned.
Processing file '.\page_705.txt'...
File '.\page_684.txt' has been cleaned.
Processing file '.\page_706.txt'...
File '.\page_686.txt' has been cleaned.
Processing file '.\page_707.txt'...
File '.\page_695.txt' has been cleaned.
Processing file '.\page_708.txt'...
File '.\page_694.txt' has been cleaned.
Processing file '.\page_709.txt'...
File '.\page_697.txt' has been cleaned.
Processing file '.\page_71.txt'...
File '.\page_701.txt' has been cleaned.
Processing file '.\page_710.txt'...
File '.\page_696.txt' has been cleaned.
Processing file '.\page_711.txt'...
File '.\page_703.txt' has been cleaned.
Processing file '.\page_712.txt'...
File '.\page_705.txt' has been cleaned.
Processing file '.\page_713.txt'...
File '.\page_707.txt' has been cleaned.
Processing file '.\page_714.txt'...
File '.\page_

File '.\page_784.txt' has been cleaned.
Processing file '.\page_803.txt'...
File '.\page_793.txt' has been cleaned.
Processing file '.\page_804.txt'...
File '.\page_792.txt' has been cleaned.
Processing file '.\page_805.txt'...
File '.\page_791.txt' has been cleaned.
Processing file '.\page_806.txt'...
File '.\page_795.txt' has been cleaned.
Processing file '.\page_807.txt'...
File '.\page_789.txt' has been cleaned.
Processing file '.\page_808.txt'...
File '.\page_796.txt' has been cleaned.
Processing file '.\page_809.txt'...
File '.\page_797.txt' has been cleaned.
Processing file '.\page_81.txt'...
File '.\page_798.txt' has been cleaned.
Processing file '.\page_810.txt'...
File '.\page_799.txt' has been cleaned.
Processing file '.\page_811.txt'...
File '.\page_8.txt' has been cleaned.
Processing file '.\page_812.txt'...
File '.\page_801.txt' has been cleaned.
Processing file '.\page_813.txt'...
File '.\page_802.txt' has been cleaned.
Processing file '.\page_814.txt'...
File '.\page_80

File '.\page_888.txt' has been cleaned.File '.\page_891.txt' has been cleaned.
Processing file '.\page_904.txt'...

Processing file '.\page_905.txt'...
File '.\page_894.txt' has been cleaned.
Processing file '.\page_906.txt'...
File '.\page_896.txt' has been cleaned.
Processing file '.\page_907.txt'...
File '.\page_889.txt' has been cleaned.
Processing file '.\page_908.txt'...
File '.\page_895.txt' has been cleaned.
Processing file '.\page_909.txt'...
File '.\page_9.txt' has been cleaned.
Processing file '.\page_91.txt'...
File '.\page_901.txt' has been cleaned.
Processing file '.\page_910.txt'...
File '.\page_900.txt' has been cleaned.
Processing file '.\page_911.txt'...
File '.\page_898.txt' has been cleaned.
Processing file '.\page_912.txt'...
File '.\page_897.txt' has been cleaned.
Processing file '.\page_913.txt'...
File '.\page_90.txt' has been cleaned.
Processing file '.\page_914.txt'...
File '.\page_902.txt' has been cleaned.
Processing file '.\page_915.txt'...
File '.\page_905

File '.\page_992.txt' has been cleaned.
File '.\page_991.txt' has been cleaned.
File '.\page_993.txt' has been cleaned.
File '.\page_995.txt' has been cleaned.
File '.\page_996.txt' has been cleaned.
File '.\page_998.txt' has been cleaned.
File '.\page_994.txt' has been cleaned.
File '.\page_999.txt' has been cleaned.
File '.\page_997.txt' has been cleaned.
Parallel processing complete.


In [5]:
# Old code (processes the files manually, one at a time; it works)
#for file_name in os.listdir("."):                                         
 #   if file_name.endswith(".txt"):
  #      file_number = file_name.split('_')[1].split('.')[0]
   #     remove_quotes_from_file(file_name)
    #    print(f"File '{file_name}' has been cleaned.")